author      Mark Johnston <markj@FreeBSD.org>    2023-06-02 15:58:29 +0000
committer   Mark Johnston <markj@FreeBSD.org>    2023-06-02 16:15:01 +0000
commit      9a3444d91c706dda65040138acbdb8c932213960 (patch)
tree        99bf31173f2c3a5a5bb6c46696e0c2603c4704d6
parent      9ad8dc721e57bdccd55532a01bde251502966a95 (diff)
download    src-9a3444d91c706dda65040138acbdb8c932213960.tar.gz
            src-9a3444d91c706dda65040138acbdb8c932213960.zip
ossl: Add a VAES-based AES-GCM implementation for amd64
aes-gcm-avx512.S is generated from OpenSSL 3.1 and implements AES-GCM.
ossl_x86.c detects whether the CPU implements the required AVX512
instructions; if not, the ossl(4) module does not provide an AES-GCM
implementation.  The VAES implementation increases throughput for all
buffer sizes in both directions, up to 2x for sufficiently large buffers.

The "process" implementation is in two parts: a generic OCF layer in
ossl_aes.c, which calls a set of machine-dependent (MD) functions that do
the heavy lifting.  The intent is to make it possible to add
implementations for other platforms, e.g., to reduce the diff required
for D37421.

A follow-up commit will add a fallback path to legacy AES-NI, so that
ossl(4) can be used in preference to aesni(4) on all amd64 platforms.  In
the long term we would like to replace aesni(4) and armv8crypto(4) with
ossl(4).

Note that this implementation will not currently be selected by default,
since aesni(4) and ossl(4) return the same probe priority for crypto
sessions, and the opencrypto framework selects the first registered
implementation to break a tie.  Since aesni(4) is compiled into the
kernel, aesni(4) wins.  A separate change may give ossl(4) priority.

Sponsored by:	Stormshield
Sponsored by:	Klara, Inc.
Reviewed by:	jhb
MFC after:	3 months
Differential Revision:	https://reviews.freebsd.org/D39783
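The two-part split described above amounts to a table of machine-dependent
function pointers selected once at probe time.  The sketch below is
illustrative only: the names (ossl_gcm_ops, gcm_ops_select, the stub back
ends) are hypothetical and do not reproduce the committed ossl.h/ossl_aes.c
interface; it merely shows the shape of a generic layer dispatching to MD
routines.

/*
 * Minimal sketch of the OCF-layer / MD-function split.  All names here are
 * hypothetical stand-ins for the real ossl(4) interface.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct ossl_gcm_ops {
	void (*setkey)(const uint8_t *key, int klen);
	void (*encrypt)(const uint8_t *in, uint8_t *out, size_t len);
};

/* Stub MD back ends standing in for the VAES and (future) AES-NI paths. */
static void stub_setkey(const uint8_t *key, int klen) { (void)key; (void)klen; }
static void stub_encrypt(const uint8_t *in, uint8_t *out, size_t len)
{
	memcpy(out, in, len);	/* placeholder; real code runs AES-GCM */
}

static const struct ossl_gcm_ops gcm_avx512_ops = { stub_setkey, stub_encrypt };
static const struct ossl_gcm_ops gcm_fallback_ops = { stub_setkey, stub_encrypt };

/* Generic layer: pick the MD implementation once, from a CPU-feature probe. */
static const struct ossl_gcm_ops *
gcm_ops_select(int cpu_has_vaes_vpclmulqdq)
{
	return (cpu_has_vaes_vpclmulqdq ? &gcm_avx512_ops : &gcm_fallback_ops);
}

int
main(void)
{
	const struct ossl_gcm_ops *ops = gcm_ops_select(1);
	uint8_t buf[16] = { 0 }, out[16];

	ops->setkey(buf, 128);
	ops->encrypt(buf, out, sizeof(out));
	printf("dispatched through the %s table\n",
	    ops == &gcm_avx512_ops ? "AVX512" : "fallback");
	return (0);
}

In the driver the selection happens once at attach time; keeping the MD
routines behind such a table is what allows other platforms (e.g., the
arm64 work in D37421) to slot in their own implementations.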
-rw-r--r--  sys/crypto/openssl/amd64/aes-gcm-avx512.S   136132
-rw-r--r--  sys/crypto/openssl/amd64/ossl_aes_gcm.c         233
-rw-r--r--  sys/crypto/openssl/ossl.c                        54
-rw-r--r--  sys/crypto/openssl/ossl.h                         6
-rw-r--r--  sys/crypto/openssl/ossl_aes.c                   103
-rw-r--r--  sys/crypto/openssl/ossl_aes_gcm.h                71
-rw-r--r--  sys/crypto/openssl/ossl_x86.c                    25
-rw-r--r--  sys/modules/ossl/Makefile                         2
8 files changed, 136616 insertions, 10 deletions
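The diff below opens with ossl_vaes_vpclmulqdq_capable(), which ANDs a
64-bit mask against OPENSSL_ia32cap_P.  Decoding the constant
$6600291188736 (0x600C0030000) suggests the required features are AVX512F,
AVX512DQ, AVX512BW, AVX512VL, VAES and VPCLMULQDQ; that decoding is our own
reading of the generated code, not something stated in the commit.  A
standalone check against CPUID leaf 7 might look like the sketch below
(it is not the code from ossl_x86.c, and it omits the OSXSAVE/XGETBV check
for ZMM state that OpenSSL's ia32cap setup performs).

/*
 * Assumption: the mask 0x600C0030000 covers CPUID.(EAX=7,ECX=0):EBX bits
 * 16/17/30/31 (AVX512F/DQ/BW/VL) and CPUID.(EAX=7,ECX=0):ECX bits 9/10
 * (VAES/VPCLMULQDQ).  This mirrors that reading as a userspace check.
 */
#include <cpuid.h>
#include <stdio.h>

static int
vaes_vpclmulqdq_capable(void)
{
	unsigned int eax, ebx, ecx, edx;
	const unsigned int ebx_req =
	    (1u << 16) | (1u << 17) | (1u << 30) | (1u << 31);
	const unsigned int ecx_req = (1u << 9) | (1u << 10);

	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
		return (0);
	/* AVX512F, AVX512DQ, AVX512BW, AVX512VL */
	if ((ebx & ebx_req) != ebx_req)
		return (0);
	/* VAES, VPCLMULQDQ */
	return ((ecx & ecx_req) == ecx_req);
}

int
main(void)
{
	printf("VAES/VPCLMULQDQ AES-GCM usable: %s\n",
	    vaes_vpclmulqdq_capable() ? "yes" : "no");
	return (0);
}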
diff --git a/sys/crypto/openssl/amd64/aes-gcm-avx512.S b/sys/crypto/openssl/amd64/aes-gcm-avx512.S
new file mode 100644
index 000000000000..6ddd1f994704
--- /dev/null
+++ b/sys/crypto/openssl/amd64/aes-gcm-avx512.S
@@ -0,0 +1,136132 @@
+/* $FreeBSD$ */
+/* Do not modify. This file is auto-generated from aes-gcm-avx512.pl. */
+.globl ossl_vaes_vpclmulqdq_capable
+.type ossl_vaes_vpclmulqdq_capable,@function
+.align 32
+ossl_vaes_vpclmulqdq_capable:
+ movq OPENSSL_ia32cap_P+8(%rip),%rcx
+
+ movq $6600291188736,%rdx
+ xorl %eax,%eax
+ andq %rdx,%rcx
+ cmpq %rdx,%rcx
+ cmoveq %rcx,%rax
+ .byte 0xf3,0xc3
+.size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable
+.text
+.globl ossl_aes_gcm_init_avx512
+.type ossl_aes_gcm_init_avx512,@function
+.align 32
+ossl_aes_gcm_init_avx512:
+.cfi_startproc
+.byte 243,15,30,250
+ vpxorq %xmm16,%xmm16,%xmm16
+
+
+ movl 240(%rdi),%eax
+ cmpl $9,%eax
+ je .Laes_128_duiuljAybFADyhe
+ cmpl $11,%eax
+ je .Laes_192_duiuljAybFADyhe
+ cmpl $13,%eax
+ je .Laes_256_duiuljAybFADyhe
+ jmp .Lexit_aes_duiuljAybFADyhe
+.align 32
+.Laes_128_duiuljAybFADyhe:
+ vpxorq 0(%rdi),%xmm16,%xmm16
+
+ vaesenc 16(%rdi),%xmm16,%xmm16
+
+ vaesenc 32(%rdi),%xmm16,%xmm16
+
+ vaesenc 48(%rdi),%xmm16,%xmm16
+
+ vaesenc 64(%rdi),%xmm16,%xmm16
+
+ vaesenc 80(%rdi),%xmm16,%xmm16
+
+ vaesenc 96(%rdi),%xmm16,%xmm16
+
+ vaesenc 112(%rdi),%xmm16,%xmm16
+
+ vaesenc 128(%rdi),%xmm16,%xmm16
+
+ vaesenc 144(%rdi),%xmm16,%xmm16
+
+ vaesenclast 160(%rdi),%xmm16,%xmm16
+ jmp .Lexit_aes_duiuljAybFADyhe
+.align 32
+.Laes_192_duiuljAybFADyhe:
+ vpxorq 0(%rdi),%xmm16,%xmm16
+
+ vaesenc 16(%rdi),%xmm16,%xmm16
+
+ vaesenc 32(%rdi),%xmm16,%xmm16
+
+ vaesenc 48(%rdi),%xmm16,%xmm16
+
+ vaesenc 64(%rdi),%xmm16,%xmm16
+
+ vaesenc 80(%rdi),%xmm16,%xmm16
+
+ vaesenc 96(%rdi),%xmm16,%xmm16
+
+ vaesenc 112(%rdi),%xmm16,%xmm16
+
+ vaesenc 128(%rdi),%xmm16,%xmm16
+
+ vaesenc 144(%rdi),%xmm16,%xmm16
+
+ vaesenc 160(%rdi),%xmm16,%xmm16
+
+ vaesenc 176(%rdi),%xmm16,%xmm16
+
+ vaesenclast 192(%rdi),%xmm16,%xmm16
+ jmp .Lexit_aes_duiuljAybFADyhe
+.align 32
+.Laes_256_duiuljAybFADyhe:
+ vpxorq 0(%rdi),%xmm16,%xmm16
+
+ vaesenc 16(%rdi),%xmm16,%xmm16
+
+ vaesenc 32(%rdi),%xmm16,%xmm16
+
+ vaesenc 48(%rdi),%xmm16,%xmm16
+
+ vaesenc 64(%rdi),%xmm16,%xmm16
+
+ vaesenc 80(%rdi),%xmm16,%xmm16
+
+ vaesenc 96(%rdi),%xmm16,%xmm16
+
+ vaesenc 112(%rdi),%xmm16,%xmm16
+
+ vaesenc 128(%rdi),%xmm16,%xmm16
+
+ vaesenc 144(%rdi),%xmm16,%xmm16
+
+ vaesenc 160(%rdi),%xmm16,%xmm16
+
+ vaesenc 176(%rdi),%xmm16,%xmm16
+
+ vaesenc 192(%rdi),%xmm16,%xmm16
+
+ vaesenc 208(%rdi),%xmm16,%xmm16
+
+ vaesenclast 224(%rdi),%xmm16,%xmm16
+ jmp .Lexit_aes_duiuljAybFADyhe
+.Lexit_aes_duiuljAybFADyhe:
+
+ vpshufb SHUF_MASK(%rip),%xmm16,%xmm16
+
+ vmovdqa64 %xmm16,%xmm2
+ vpsllq $1,%xmm16,%xmm16
+ vpsrlq $63,%xmm2,%xmm2
+ vmovdqa %xmm2,%xmm1
+ vpslldq $8,%xmm2,%xmm2
+ vpsrldq $8,%xmm1,%xmm1
+ vporq %xmm2,%xmm16,%xmm16
+
+ vpshufd $36,%xmm1,%xmm2
+ vpcmpeqd TWOONE(%rip),%xmm2,%xmm2
+ vpand POLY(%rip),%xmm2,%xmm2
+ vpxorq %xmm2,%xmm16,%xmm16
+
+ vmovdqu64 %xmm16,336(%rsi)
+ vshufi32x4 $0x00,%ymm16,%ymm16,%ymm4
+ vmovdqa %ymm4,%ymm3
+
+ vpclmulqdq $0x11,%ymm4,%ymm3,%ymm0
+ vpclmulqdq $0x00,%ymm4,%ymm3,%ymm1
+ vpclmulqdq $0x01,%ymm4,%ymm3,%ymm2
+ vpclmulqdq $0x10,%ymm4,%ymm3,%ymm3
+ vpxorq %ymm2,%ymm3,%ymm3
+
+ vpsrldq $8,%ymm3,%ymm2
+ vpslldq $8,%ymm3,%ymm3
+ vpxorq %ymm2,%ymm0,%ymm0
+ vpxorq %ymm1,%ymm3,%ymm3
+
+
+
+ vmovdqu64 POLY2(%rip),%ymm2
+
+ vpclmulqdq $0x01,%ymm3,%ymm2,%ymm1
+ vpslldq $8,%ymm1,%ymm1
+ vpxorq %ymm1,%ymm3,%ymm3
+
+
+
+ vpclmulqdq $0x00,%ymm3,%ymm2,%ymm1
+ vpsrldq $4,%ymm1,%ymm1
+ vpclmulqdq $0x10,%ymm3,%ymm2,%ymm3
+ vpslldq $4,%ymm3,%ymm3
+
+ vpternlogq $0x96,%ymm1,%ymm0,%ymm3
+
+ vmovdqu64 %xmm3,320(%rsi)
+ vinserti64x2 $1,%xmm16,%ymm3,%ymm4
+ vmovdqa64 %ymm4,%ymm5
+
+ vpclmulqdq $0x11,%ymm3,%ymm4,%ymm0
+ vpclmulqdq $0x00,%ymm3,%ymm4,%ymm1
+ vpclmulqdq $0x01,%ymm3,%ymm4,%ymm2
+ vpclmulqdq $0x10,%ymm3,%ymm4,%ymm4
+ vpxorq %ymm2,%ymm4,%ymm4
+
+ vpsrldq $8,%ymm4,%ymm2
+ vpslldq $8,%ymm4,%ymm4
+ vpxorq %ymm2,%ymm0,%ymm0
+ vpxorq %ymm1,%ymm4,%ymm4
+
+
+
+ vmovdqu64 POLY2(%rip),%ymm2
+
+ vpclmulqdq $0x01,%ymm4,%ymm2,%ymm1
+ vpslldq $8,%ymm1,%ymm1
+ vpxorq %ymm1,%ymm4,%ymm4
+
+
+
+ vpclmulqdq $0x00,%ymm4,%ymm2,%ymm1
+ vpsrldq $4,%ymm1,%ymm1
+ vpclmulqdq $0x10,%ymm4,%ymm2,%ymm4
+ vpslldq $4,%ymm4,%ymm4
+
+ vpternlogq $0x96,%ymm1,%ymm0,%ymm4
+
+ vmovdqu64 %ymm4,288(%rsi)
+
+ vinserti64x4 $1,%ymm5,%zmm4,%zmm4
+
+
+ vshufi64x2 $0x00,%zmm4,%zmm4,%zmm3
+ vmovdqa64 %zmm4,%zmm5
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm0
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm1
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm2
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm2,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm2
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm2,%zmm0,%zmm0
+ vpxorq %zmm1,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm2
+
+ vpclmulqdq $0x01,%zmm4,%zmm2,%zmm1
+ vpslldq $8,%zmm1,%zmm1
+ vpxorq %zmm1,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm2,%zmm1
+ vpsrldq $4,%zmm1,%zmm1
+ vpclmulqdq $0x10,%zmm4,%zmm2,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm1,%zmm0,%zmm4
+
+ vmovdqu64 %zmm4,224(%rsi)
+ vshufi64x2 $0x00,%zmm4,%zmm4,%zmm3
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm0
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm1
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm2
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm2,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm2
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm2,%zmm0,%zmm0
+ vpxorq %zmm1,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm2
+
+ vpclmulqdq $0x01,%zmm5,%zmm2,%zmm1
+ vpslldq $8,%zmm1,%zmm1
+ vpxorq %zmm1,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm2,%zmm1
+ vpsrldq $4,%zmm1,%zmm1
+ vpclmulqdq $0x10,%zmm5,%zmm2,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm1,%zmm0,%zmm5
+
+ vmovdqu64 %zmm5,160(%rsi)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm0
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm1
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm2
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm2,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm2
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm2,%zmm0,%zmm0
+ vpxorq %zmm1,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm2
+
+ vpclmulqdq $0x01,%zmm4,%zmm2,%zmm1
+ vpslldq $8,%zmm1,%zmm1
+ vpxorq %zmm1,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm2,%zmm1
+ vpsrldq $4,%zmm1,%zmm1
+ vpclmulqdq $0x10,%zmm4,%zmm2,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm1,%zmm0,%zmm4
+
+ vmovdqu64 %zmm4,96(%rsi)
+ vzeroupper
+.Labort_init:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512
+.globl ossl_aes_gcm_setiv_avx512
+.type ossl_aes_gcm_setiv_avx512,@function
+.align 32
+ossl_aes_gcm_setiv_avx512:
+.cfi_startproc
+.Lsetiv_seh_begin:
+.byte 243,15,30,250
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+.Lsetiv_seh_push_rbx:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+.Lsetiv_seh_push_rbp:
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+.Lsetiv_seh_push_r12:
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+.Lsetiv_seh_push_r13:
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+.Lsetiv_seh_push_r14:
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lsetiv_seh_push_r15:
+
+
+
+
+
+
+
+
+
+
+ leaq 0(%rsp),%rbp
+.cfi_def_cfa_register %rbp
+.Lsetiv_seh_setfp:
+
+.Lsetiv_seh_prolog_end:
+ subq $820,%rsp
+ andq $(-64),%rsp
+ cmpq $12,%rcx
+ je iv_len_12_init_IV
+ vpxor %xmm2,%xmm2,%xmm2
+ movq %rdx,%r10
+ movq %rcx,%r11
+ orq %r11,%r11
+ jz .L_CALC_AAD_done_mBgdvxqgFGebeug
+
+ xorq %rbx,%rbx
+ vmovdqa64 SHUF_MASK(%rip),%zmm16
+
+.L_get_AAD_loop48x16_mBgdvxqgFGebeug:
+ cmpq $768,%r11
+ jl .L_exit_AAD_loop48x16_mBgdvxqgFGebeug
+ vmovdqu64 0(%r10),%zmm11
+ vmovdqu64 64(%r10),%zmm3
+ vmovdqu64 128(%r10),%zmm4
+ vmovdqu64 192(%r10),%zmm5
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpshufb %zmm16,%zmm5,%zmm5
+ testq %rbx,%rbx
+ jnz .L_skip_hkeys_precomputation_EzsAegbBbaerfwt
+
+ vmovdqu64 288(%rsi),%zmm1
+ vmovdqu64 %zmm1,704(%rsp)
+
+ vmovdqu64 224(%rsi),%zmm9
+ vmovdqu64 %zmm9,640(%rsp)
+
+
+ vshufi64x2 $0x00,%zmm9,%zmm9,%zmm9
+
+ vmovdqu64 160(%rsi),%zmm10
+ vmovdqu64 %zmm10,576(%rsp)
+
+ vmovdqu64 96(%rsi),%zmm12
+ vmovdqu64 %zmm12,512(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm10,%zmm10
+
+ vpsrldq $8,%zmm10,%zmm17
+ vpslldq $8,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
+ vpslldq $4,%zmm10,%zmm10
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm10
+
+ vmovdqu64 %zmm10,448(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm12,%zmm12
+
+ vpsrldq $8,%zmm12,%zmm17
+ vpslldq $8,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
+ vpslldq $4,%zmm12,%zmm12
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm12
+
+ vmovdqu64 %zmm12,384(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm10,%zmm10
+
+ vpsrldq $8,%zmm10,%zmm17
+ vpslldq $8,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
+ vpslldq $4,%zmm10,%zmm10
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm10
+
+ vmovdqu64 %zmm10,320(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm12,%zmm12
+
+ vpsrldq $8,%zmm12,%zmm17
+ vpslldq $8,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
+ vpslldq $4,%zmm12,%zmm12
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm12
+
+ vmovdqu64 %zmm12,256(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm10,%zmm10
+
+ vpsrldq $8,%zmm10,%zmm17
+ vpslldq $8,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
+ vpslldq $4,%zmm10,%zmm10
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm10
+
+ vmovdqu64 %zmm10,192(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm12,%zmm12
+
+ vpsrldq $8,%zmm12,%zmm17
+ vpslldq $8,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
+ vpslldq $4,%zmm12,%zmm12
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm12
+
+ vmovdqu64 %zmm12,128(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm10,%zmm10
+
+ vpsrldq $8,%zmm10,%zmm17
+ vpslldq $8,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
+ vpslldq $4,%zmm10,%zmm10
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm10
+
+ vmovdqu64 %zmm10,64(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm12,%zmm12
+
+ vpsrldq $8,%zmm12,%zmm17
+ vpslldq $8,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
+ vpslldq $4,%zmm12,%zmm12
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm12
+
+ vmovdqu64 %zmm12,0(%rsp)
+.L_skip_hkeys_precomputation_EzsAegbBbaerfwt:
+ movq $1,%rbx
+ vpxorq %zmm2,%zmm11,%zmm11
+ vmovdqu64 0(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
+ vmovdqu64 64(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
+ vpxorq %zmm17,%zmm10,%zmm7
+ vpxorq %zmm13,%zmm1,%zmm6
+ vpxorq %zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+ vmovdqu64 128(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
+ vmovdqu64 192(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
+
+ vpternlogq $0x96,%zmm17,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm13,%zmm1,%zmm6
+ vpternlogq $0x96,%zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+ vmovdqu64 256(%r10),%zmm11
+ vmovdqu64 320(%r10),%zmm3
+ vmovdqu64 384(%r10),%zmm4
+ vmovdqu64 448(%r10),%zmm5
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpshufb %zmm16,%zmm5,%zmm5
+ vmovdqu64 256(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
+ vmovdqu64 320(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
+ vpternlogq $0x96,%zmm17,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm13,%zmm1,%zmm6
+ vpternlogq $0x96,%zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+ vmovdqu64 384(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
+ vmovdqu64 448(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
+
+ vpternlogq $0x96,%zmm17,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm13,%zmm1,%zmm6
+ vpternlogq $0x96,%zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+ vmovdqu64 512(%r10),%zmm11
+ vmovdqu64 576(%r10),%zmm3
+ vmovdqu64 640(%r10),%zmm4
+ vmovdqu64 704(%r10),%zmm5
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpshufb %zmm16,%zmm5,%zmm5
+ vmovdqu64 512(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
+ vmovdqu64 576(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
+ vpternlogq $0x96,%zmm17,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm13,%zmm1,%zmm6
+ vpternlogq $0x96,%zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+ vmovdqu64 640(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
+ vmovdqu64 704(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
+
+ vpternlogq $0x96,%zmm17,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm13,%zmm1,%zmm6
+ vpternlogq $0x96,%zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+
+ vpsrldq $8,%zmm7,%zmm1
+ vpslldq $8,%zmm7,%zmm9
+ vpxorq %zmm1,%zmm6,%zmm6
+ vpxorq %zmm9,%zmm8,%zmm8
+ vextracti64x4 $1,%zmm6,%ymm1
+ vpxorq %ymm1,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm1
+ vpxorq %xmm1,%xmm6,%xmm6
+ vextracti64x4 $1,%zmm8,%ymm9
+ vpxorq %ymm9,%ymm8,%ymm8
+ vextracti32x4 $1,%ymm8,%xmm9
+ vpxorq %xmm9,%xmm8,%xmm8
+ vmovdqa64 POLY2(%rip),%xmm10
+
+
+ vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1
+ vpslldq $8,%xmm1,%xmm1
+ vpxorq %xmm1,%xmm8,%xmm1
+
+
+ vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9
+ vpsrldq $4,%xmm9,%xmm9
+ vpclmulqdq $0x10,%xmm1,%xmm10,%xmm2
+ vpslldq $4,%xmm2,%xmm2
+ vpternlogq $0x96,%xmm6,%xmm9,%xmm2
+
+ subq $768,%r11
+ je .L_CALC_AAD_done_mBgdvxqgFGebeug
+
+ addq $768,%r10
+ jmp .L_get_AAD_loop48x16_mBgdvxqgFGebeug
+
+.L_exit_AAD_loop48x16_mBgdvxqgFGebeug:
+
+ cmpq $512,%r11
+ jl .L_less_than_32x16_mBgdvxqgFGebeug
+
+ vmovdqu64 0(%r10),%zmm11
+ vmovdqu64 64(%r10),%zmm3
+ vmovdqu64 128(%r10),%zmm4
+ vmovdqu64 192(%r10),%zmm5
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpshufb %zmm16,%zmm5,%zmm5
+ testq %rbx,%rbx
+ jnz .L_skip_hkeys_precomputation_xCxmdbgxoCdwefc
+
+ vmovdqu64 288(%rsi),%zmm1
+ vmovdqu64 %zmm1,704(%rsp)
+
+ vmovdqu64 224(%rsi),%zmm9
+ vmovdqu64 %zmm9,640(%rsp)
+
+
+ vshufi64x2 $0x00,%zmm9,%zmm9,%zmm9
+
+ vmovdqu64 160(%rsi),%zmm10
+ vmovdqu64 %zmm10,576(%rsp)
+
+ vmovdqu64 96(%rsi),%zmm12
+ vmovdqu64 %zmm12,512(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm10,%zmm10
+
+ vpsrldq $8,%zmm10,%zmm17
+ vpslldq $8,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
+ vpslldq $4,%zmm10,%zmm10
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm10
+
+ vmovdqu64 %zmm10,448(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm12,%zmm12
+
+ vpsrldq $8,%zmm12,%zmm17
+ vpslldq $8,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
+ vpslldq $4,%zmm12,%zmm12
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm12
+
+ vmovdqu64 %zmm12,384(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm10,%zmm10
+
+ vpsrldq $8,%zmm10,%zmm17
+ vpslldq $8,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
+ vpslldq $4,%zmm10,%zmm10
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm10
+
+ vmovdqu64 %zmm10,320(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm12,%zmm12
+
+ vpsrldq $8,%zmm12,%zmm17
+ vpslldq $8,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
+ vpslldq $4,%zmm12,%zmm12
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm12
+
+ vmovdqu64 %zmm12,256(%rsp)
+.L_skip_hkeys_precomputation_xCxmdbgxoCdwefc:
+ movq $1,%rbx
+ vpxorq %zmm2,%zmm11,%zmm11
+ vmovdqu64 256(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
+ vmovdqu64 320(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
+ vpxorq %zmm17,%zmm10,%zmm7
+ vpxorq %zmm13,%zmm1,%zmm6
+ vpxorq %zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+ vmovdqu64 384(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
+ vmovdqu64 448(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
+
+ vpternlogq $0x96,%zmm17,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm13,%zmm1,%zmm6
+ vpternlogq $0x96,%zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+ vmovdqu64 256(%r10),%zmm11
+ vmovdqu64 320(%r10),%zmm3
+ vmovdqu64 384(%r10),%zmm4
+ vmovdqu64 448(%r10),%zmm5
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpshufb %zmm16,%zmm5,%zmm5
+ vmovdqu64 512(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
+ vmovdqu64 576(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
+ vpternlogq $0x96,%zmm17,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm13,%zmm1,%zmm6
+ vpternlogq $0x96,%zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+ vmovdqu64 640(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
+ vmovdqu64 704(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
+
+ vpternlogq $0x96,%zmm17,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm13,%zmm1,%zmm6
+ vpternlogq $0x96,%zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+
+ vpsrldq $8,%zmm7,%zmm1
+ vpslldq $8,%zmm7,%zmm9
+ vpxorq %zmm1,%zmm6,%zmm6
+ vpxorq %zmm9,%zmm8,%zmm8
+ vextracti64x4 $1,%zmm6,%ymm1
+ vpxorq %ymm1,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm1
+ vpxorq %xmm1,%xmm6,%xmm6
+ vextracti64x4 $1,%zmm8,%ymm9
+ vpxorq %ymm9,%ymm8,%ymm8
+ vextracti32x4 $1,%ymm8,%xmm9
+ vpxorq %xmm9,%xmm8,%xmm8
+ vmovdqa64 POLY2(%rip),%xmm10
+
+
+ vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1
+ vpslldq $8,%xmm1,%xmm1
+ vpxorq %xmm1,%xmm8,%xmm1
+
+
+ vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9
+ vpsrldq $4,%xmm9,%xmm9
+ vpclmulqdq $0x10,%xmm1,%xmm10,%xmm2
+ vpslldq $4,%xmm2,%xmm2
+ vpternlogq $0x96,%xmm6,%xmm9,%xmm2
+
+ subq $512,%r11
+ je .L_CALC_AAD_done_mBgdvxqgFGebeug
+
+ addq $512,%r10
+ jmp .L_less_than_16x16_mBgdvxqgFGebeug
+
+.L_less_than_32x16_mBgdvxqgFGebeug:
+ cmpq $256,%r11
+ jl .L_less_than_16x16_mBgdvxqgFGebeug
+
+ vmovdqu64 0(%r10),%zmm11
+ vmovdqu64 64(%r10),%zmm3
+ vmovdqu64 128(%r10),%zmm4
+ vmovdqu64 192(%r10),%zmm5
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpshufb %zmm16,%zmm5,%zmm5
+ vpxorq %zmm2,%zmm11,%zmm11
+ vmovdqu64 96(%rsi),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
+ vmovdqu64 160(%rsi),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
+ vpxorq %zmm17,%zmm10,%zmm7
+ vpxorq %zmm13,%zmm1,%zmm6
+ vpxorq %zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+ vmovdqu64 224(%rsi),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
+ vmovdqu64 288(%rsi),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
+
+ vpternlogq $0x96,%zmm17,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm13,%zmm1,%zmm6
+ vpternlogq $0x96,%zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+
+ vpsrldq $8,%zmm7,%zmm1
+ vpslldq $8,%zmm7,%zmm9
+ vpxorq %zmm1,%zmm6,%zmm6
+ vpxorq %zmm9,%zmm8,%zmm8
+ vextracti64x4 $1,%zmm6,%ymm1
+ vpxorq %ymm1,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm1
+ vpxorq %xmm1,%xmm6,%xmm6
+ vextracti64x4 $1,%zmm8,%ymm9
+ vpxorq %ymm9,%ymm8,%ymm8
+ vextracti32x4 $1,%ymm8,%xmm9
+ vpxorq %xmm9,%xmm8,%xmm8
+ vmovdqa64 POLY2(%rip),%xmm10
+
+
+ vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1
+ vpslldq $8,%xmm1,%xmm1
+ vpxorq %xmm1,%xmm8,%xmm1
+
+
+ vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9
+ vpsrldq $4,%xmm9,%xmm9
+ vpclmulqdq $0x10,%xmm1,%xmm10,%xmm2
+ vpslldq $4,%xmm2,%xmm2
+ vpternlogq $0x96,%xmm6,%xmm9,%xmm2
+
+ subq $256,%r11
+ je .L_CALC_AAD_done_mBgdvxqgFGebeug
+
+ addq $256,%r10
+
+.L_less_than_16x16_mBgdvxqgFGebeug:
+
+ leaq byte64_len_to_mask_table(%rip),%r12
+ leaq (%r12,%r11,8),%r12
+
+
+ addl $15,%r11d
+ shrl $4,%r11d
+ cmpl $2,%r11d
+ jb .L_AAD_blocks_1_mBgdvxqgFGebeug
+ je .L_AAD_blocks_2_mBgdvxqgFGebeug
+ cmpl $4,%r11d
+ jb .L_AAD_blocks_3_mBgdvxqgFGebeug
+ je .L_AAD_blocks_4_mBgdvxqgFGebeug
+ cmpl $6,%r11d
+ jb .L_AAD_blocks_5_mBgdvxqgFGebeug
+ je .L_AAD_blocks_6_mBgdvxqgFGebeug
+ cmpl $8,%r11d
+ jb .L_AAD_blocks_7_mBgdvxqgFGebeug
+ je .L_AAD_blocks_8_mBgdvxqgFGebeug
+ cmpl $10,%r11d
+ jb .L_AAD_blocks_9_mBgdvxqgFGebeug
+ je .L_AAD_blocks_10_mBgdvxqgFGebeug
+ cmpl $12,%r11d
+ jb .L_AAD_blocks_11_mBgdvxqgFGebeug
+ je .L_AAD_blocks_12_mBgdvxqgFGebeug
+ cmpl $14,%r11d
+ jb .L_AAD_blocks_13_mBgdvxqgFGebeug
+ je .L_AAD_blocks_14_mBgdvxqgFGebeug
+ cmpl $15,%r11d
+ je .L_AAD_blocks_15_mBgdvxqgFGebeug
+.L_AAD_blocks_16_mBgdvxqgFGebeug:
+ subq $1536,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%zmm3
+ vmovdqu8 128(%r10),%zmm4
+ vmovdqu8 192(%r10),%zmm5{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpshufb %zmm16,%zmm5,%zmm5
+ vpxorq %zmm2,%zmm11,%zmm11
+ vmovdqu64 96(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
+ vmovdqu64 160(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
+ vmovdqu64 224(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11
+ vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3
+ vpternlogq $0x96,%zmm9,%zmm11,%zmm1
+ vpternlogq $0x96,%zmm10,%zmm3,%zmm6
+ vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11
+ vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3
+ vpternlogq $0x96,%zmm12,%zmm11,%zmm7
+ vpternlogq $0x96,%zmm13,%zmm3,%zmm8
+ vmovdqu64 288(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm5,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm5,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm5,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm5,%zmm13
+ vpxorq %zmm9,%zmm1,%zmm9
+ vpxorq %zmm10,%zmm6,%zmm10
+ vpxorq %zmm12,%zmm7,%zmm12
+ vpxorq %zmm13,%zmm8,%zmm13
+
+ vpxorq %zmm13,%zmm12,%zmm12
+ vpsrldq $8,%zmm12,%zmm7
+ vpslldq $8,%zmm12,%zmm8
+ vpxorq %zmm7,%zmm9,%zmm1
+ vpxorq %zmm8,%zmm10,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
+ vpslldq $4,%xmm2,%xmm2
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm2
+
+ jmp .L_CALC_AAD_done_mBgdvxqgFGebeug
+.L_AAD_blocks_15_mBgdvxqgFGebeug:
+ subq $1536,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%zmm3
+ vmovdqu8 128(%r10),%zmm4
+ vmovdqu8 192(%r10),%zmm5{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpshufb %zmm16,%zmm5,%zmm5
+ vpxorq %zmm2,%zmm11,%zmm11
+ vmovdqu64 112(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
+ vmovdqu64 176(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
+ vmovdqu64 240(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11
+ vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3
+ vpternlogq $0x96,%zmm1,%zmm11,%zmm9
+ vpternlogq $0x96,%zmm6,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11
+ vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3
+ vpternlogq $0x96,%zmm7,%zmm11,%zmm12
+ vpternlogq $0x96,%zmm8,%zmm3,%zmm13
+ vmovdqu64 304(%rsi),%ymm15
+ vinserti64x2 $2,336(%rsi),%zmm15,%zmm15
+ vpclmulqdq $0x01,%zmm15,%zmm5,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm5,%zmm8
+ vpclmulqdq $0x11,%zmm15,%zmm5,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm5,%zmm6
+
+ vpxorq %zmm12,%zmm7,%zmm7
+ vpxorq %zmm13,%zmm8,%zmm8
+ vpxorq %zmm9,%zmm1,%zmm1
+ vpxorq %zmm10,%zmm6,%zmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
+ vpslldq $4,%xmm2,%xmm2
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm2
+
+ jmp .L_CALC_AAD_done_mBgdvxqgFGebeug
+.L_AAD_blocks_14_mBgdvxqgFGebeug:
+ subq $1536,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%zmm3
+ vmovdqu8 128(%r10),%zmm4
+ vmovdqu8 192(%r10),%ymm5{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpshufb %ymm16,%ymm5,%ymm5
+ vpxorq %zmm2,%zmm11,%zmm11
+ vmovdqu64 128(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
+ vmovdqu64 192(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
+ vmovdqu64 256(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11
+ vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3
+ vpternlogq $0x96,%zmm1,%zmm11,%zmm9
+ vpternlogq $0x96,%zmm6,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11
+ vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3
+ vpternlogq $0x96,%zmm7,%zmm11,%zmm12
+ vpternlogq $0x96,%zmm8,%zmm3,%zmm13
+ vmovdqu64 320(%rsi),%ymm15
+ vpclmulqdq $0x01,%ymm15,%ymm5,%ymm7
+ vpclmulqdq $0x10,%ymm15,%ymm5,%ymm8
+ vpclmulqdq $0x11,%ymm15,%ymm5,%ymm1
+ vpclmulqdq $0x00,%ymm15,%ymm5,%ymm6
+
+ vpxorq %zmm12,%zmm7,%zmm7
+ vpxorq %zmm13,%zmm8,%zmm8
+ vpxorq %zmm9,%zmm1,%zmm1
+ vpxorq %zmm10,%zmm6,%zmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
+ vpslldq $4,%xmm2,%xmm2
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm2
+
+ jmp .L_CALC_AAD_done_mBgdvxqgFGebeug
+.L_AAD_blocks_13_mBgdvxqgFGebeug:
+ subq $1536,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%zmm3
+ vmovdqu8 128(%r10),%zmm4
+ vmovdqu8 192(%r10),%xmm5{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpshufb %xmm16,%xmm5,%xmm5
+ vpxorq %zmm2,%zmm11,%zmm11
+ vmovdqu64 144(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
+ vmovdqu64 208(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
+ vmovdqu64 272(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11
+ vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3
+ vpternlogq $0x96,%zmm1,%zmm11,%zmm9
+ vpternlogq $0x96,%zmm6,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11
+ vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3
+ vpternlogq $0x96,%zmm7,%zmm11,%zmm12
+ vpternlogq $0x96,%zmm8,%zmm3,%zmm13
+ vmovdqu64 336(%rsi),%xmm15
+ vpclmulqdq $0x01,%xmm15,%xmm5,%xmm7
+ vpclmulqdq $0x10,%xmm15,%xmm5,%xmm8
+ vpclmulqdq $0x11,%xmm15,%xmm5,%xmm1
+ vpclmulqdq $0x00,%xmm15,%xmm5,%xmm6
+
+ vpxorq %zmm12,%zmm7,%zmm7
+ vpxorq %zmm13,%zmm8,%zmm8
+ vpxorq %zmm9,%zmm1,%zmm1
+ vpxorq %zmm10,%zmm6,%zmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
+ vpslldq $4,%xmm2,%xmm2
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm2
+
+ jmp .L_CALC_AAD_done_mBgdvxqgFGebeug
+.L_AAD_blocks_12_mBgdvxqgFGebeug:
+ subq $1024,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%zmm3
+ vmovdqu8 128(%r10),%zmm4{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpxorq %zmm2,%zmm11,%zmm11
+ vmovdqu64 160(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
+ vmovdqu64 224(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
+ vmovdqu64 288(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11
+ vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3
+ vpternlogq $0x96,%zmm1,%zmm11,%zmm9
+ vpternlogq $0x96,%zmm6,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11
+ vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3
+ vpternlogq $0x96,%zmm7,%zmm11,%zmm12
+ vpternlogq $0x96,%zmm8,%zmm3,%zmm13
+
+ vpxorq %zmm13,%zmm12,%zmm12
+ vpsrldq $8,%zmm12,%zmm7
+ vpslldq $8,%zmm12,%zmm8
+ vpxorq %zmm7,%zmm9,%zmm1
+ vpxorq %zmm8,%zmm10,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
+ vpslldq $4,%xmm2,%xmm2
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm2
+
+ jmp .L_CALC_AAD_done_mBgdvxqgFGebeug
+.L_AAD_blocks_11_mBgdvxqgFGebeug:
+ subq $1024,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%zmm3
+ vmovdqu8 128(%r10),%zmm4{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpxorq %zmm2,%zmm11,%zmm11
+ vmovdqu64 176(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
+ vmovdqu64 240(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
+ vpxorq %zmm9,%zmm1,%zmm9
+ vpxorq %zmm10,%zmm6,%zmm10
+ vpxorq %zmm12,%zmm7,%zmm12
+ vpxorq %zmm13,%zmm8,%zmm13
+ vmovdqu64 304(%rsi),%ymm15
+ vinserti64x2 $2,336(%rsi),%zmm15,%zmm15
+ vpclmulqdq $0x01,%zmm15,%zmm4,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm4,%zmm8
+ vpclmulqdq $0x11,%zmm15,%zmm4,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm4,%zmm6
+
+ vpxorq %zmm12,%zmm7,%zmm7
+ vpxorq %zmm13,%zmm8,%zmm8
+ vpxorq %zmm9,%zmm1,%zmm1
+ vpxorq %zmm10,%zmm6,%zmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
+ vpslldq $4,%xmm2,%xmm2
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm2
+
+ jmp .L_CALC_AAD_done_mBgdvxqgFGebeug
+.L_AAD_blocks_10_mBgdvxqgFGebeug:
+ subq $1024,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%zmm3
+ vmovdqu8 128(%r10),%ymm4{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %ymm16,%ymm4,%ymm4
+ vpxorq %zmm2,%zmm11,%zmm11
+ vmovdqu64 192(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
+ vmovdqu64 256(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
+ vpxorq %zmm9,%zmm1,%zmm9
+ vpxorq %zmm10,%zmm6,%zmm10
+ vpxorq %zmm12,%zmm7,%zmm12
+ vpxorq %zmm13,%zmm8,%zmm13
+ vmovdqu64 320(%rsi),%ymm15
+ vpclmulqdq $0x01,%ymm15,%ymm4,%ymm7
+ vpclmulqdq $0x10,%ymm15,%ymm4,%ymm8
+ vpclmulqdq $0x11,%ymm15,%ymm4,%ymm1
+ vpclmulqdq $0x00,%ymm15,%ymm4,%ymm6
+
+ vpxorq %zmm12,%zmm7,%zmm7
+ vpxorq %zmm13,%zmm8,%zmm8
+ vpxorq %zmm9,%zmm1,%zmm1
+ vpxorq %zmm10,%zmm6,%zmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
+ vpslldq $4,%xmm2,%xmm2
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm2
+
+ jmp .L_CALC_AAD_done_mBgdvxqgFGebeug
+.L_AAD_blocks_9_mBgdvxqgFGebeug:
+ subq $1024,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%zmm3
+ vmovdqu8 128(%r10),%xmm4{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %xmm16,%xmm4,%xmm4
+ vpxorq %zmm2,%zmm11,%zmm11
+ vmovdqu64 208(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
+ vmovdqu64 272(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
+ vpxorq %zmm9,%zmm1,%zmm9
+ vpxorq %zmm10,%zmm6,%zmm10
+ vpxorq %zmm12,%zmm7,%zmm12
+ vpxorq %zmm13,%zmm8,%zmm13
+ vmovdqu64 336(%rsi),%xmm15
+ vpclmulqdq $0x01,%xmm15,%xmm4,%xmm7
+ vpclmulqdq $0x10,%xmm15,%xmm4,%xmm8
+ vpclmulqdq $0x11,%xmm15,%xmm4,%xmm1
+ vpclmulqdq $0x00,%xmm15,%xmm4,%xmm6
+
+ vpxorq %zmm12,%zmm7,%zmm7
+ vpxorq %zmm13,%zmm8,%zmm8
+ vpxorq %zmm9,%zmm1,%zmm1
+ vpxorq %zmm10,%zmm6,%zmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
+ vpslldq $4,%xmm2,%xmm2
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm2
+
+ jmp .L_CALC_AAD_done_mBgdvxqgFGebeug
+.L_AAD_blocks_8_mBgdvxqgFGebeug:
+ subq $512,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%zmm3{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpxorq %zmm2,%zmm11,%zmm11
+ vmovdqu64 224(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
+ vmovdqu64 288(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
+ vpxorq %zmm9,%zmm1,%zmm9
+ vpxorq %zmm10,%zmm6,%zmm10
+ vpxorq %zmm12,%zmm7,%zmm12
+ vpxorq %zmm13,%zmm8,%zmm13
+
+ vpxorq %zmm13,%zmm12,%zmm12
+ vpsrldq $8,%zmm12,%zmm7
+ vpslldq $8,%zmm12,%zmm8
+ vpxorq %zmm7,%zmm9,%zmm1
+ vpxorq %zmm8,%zmm10,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
+ vpslldq $4,%xmm2,%xmm2
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm2
+
+ jmp .L_CALC_AAD_done_mBgdvxqgFGebeug
+.L_AAD_blocks_7_mBgdvxqgFGebeug:
+ subq $512,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%zmm3{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpxorq %zmm2,%zmm11,%zmm11
+ vmovdqu64 240(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13
+ vmovdqu64 304(%rsi),%ymm15
+ vinserti64x2 $2,336(%rsi),%zmm15,%zmm15
+ vpclmulqdq $0x01,%zmm15,%zmm3,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm3,%zmm8
+ vpclmulqdq $0x11,%zmm15,%zmm3,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm3,%zmm6
+
+ vpxorq %zmm12,%zmm7,%zmm7
+ vpxorq %zmm13,%zmm8,%zmm8
+ vpxorq %zmm9,%zmm1,%zmm1
+ vpxorq %zmm10,%zmm6,%zmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
+ vpslldq $4,%xmm2,%xmm2
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm2
+
+ jmp .L_CALC_AAD_done_mBgdvxqgFGebeug
+.L_AAD_blocks_6_mBgdvxqgFGebeug:
+ subq $512,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%ymm3{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %ymm16,%ymm3,%ymm3
+ vpxorq %zmm2,%zmm11,%zmm11
+ vmovdqu64 256(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13
+ vmovdqu64 320(%rsi),%ymm15
+ vpclmulqdq $0x01,%ymm15,%ymm3,%ymm7
+ vpclmulqdq $0x10,%ymm15,%ymm3,%ymm8
+ vpclmulqdq $0x11,%ymm15,%ymm3,%ymm1
+ vpclmulqdq $0x00,%ymm15,%ymm3,%ymm6
+
+ vpxorq %zmm12,%zmm7,%zmm7
+ vpxorq %zmm13,%zmm8,%zmm8
+ vpxorq %zmm9,%zmm1,%zmm1
+ vpxorq %zmm10,%zmm6,%zmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
+ vpslldq $4,%xmm2,%xmm2
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm2
+
+ jmp .L_CALC_AAD_done_mBgdvxqgFGebeug
+.L_AAD_blocks_5_mBgdvxqgFGebeug:
+ subq $512,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%xmm3{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %xmm16,%xmm3,%xmm3
+ vpxorq %zmm2,%zmm11,%zmm11
+ vmovdqu64 272(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13
+ vmovdqu64 336(%rsi),%xmm15
+ vpclmulqdq $0x01,%xmm15,%xmm3,%xmm7
+ vpclmulqdq $0x10,%xmm15,%xmm3,%xmm8
+ vpclmulqdq $0x11,%xmm15,%xmm3,%xmm1
+ vpclmulqdq $0x00,%xmm15,%xmm3,%xmm6
+
+ vpxorq %zmm12,%zmm7,%zmm7
+ vpxorq %zmm13,%zmm8,%zmm8
+ vpxorq %zmm9,%zmm1,%zmm1
+ vpxorq %zmm10,%zmm6,%zmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
+ vpslldq $4,%xmm2,%xmm2
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm2
+
+ jmp .L_CALC_AAD_done_mBgdvxqgFGebeug
+.L_AAD_blocks_4_mBgdvxqgFGebeug:
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpxorq %zmm2,%zmm11,%zmm11
+ vmovdqu64 288(%rsi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13
+
+ vpxorq %zmm13,%zmm12,%zmm12
+ vpsrldq $8,%zmm12,%zmm7
+ vpslldq $8,%zmm12,%zmm8
+ vpxorq %zmm7,%zmm9,%zmm1
+ vpxorq %zmm8,%zmm10,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
+ vpslldq $4,%xmm2,%xmm2
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm2
+
+ jmp .L_CALC_AAD_done_mBgdvxqgFGebeug
+.L_AAD_blocks_3_mBgdvxqgFGebeug:
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpxorq %zmm2,%zmm11,%zmm11
+ vmovdqu64 304(%rsi),%ymm15
+ vinserti64x2 $2,336(%rsi),%zmm15,%zmm15
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
+ vpslldq $4,%xmm2,%xmm2
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm2
+
+ jmp .L_CALC_AAD_done_mBgdvxqgFGebeug
+.L_AAD_blocks_2_mBgdvxqgFGebeug:
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%ymm11{%k1}{z}
+ vpshufb %ymm16,%ymm11,%ymm11
+ vpxorq %zmm2,%zmm11,%zmm11
+ vmovdqu64 320(%rsi),%ymm15
+ vpclmulqdq $0x01,%ymm15,%ymm11,%ymm7
+ vpclmulqdq $0x10,%ymm15,%ymm11,%ymm8
+ vpclmulqdq $0x11,%ymm15,%ymm11,%ymm1
+ vpclmulqdq $0x00,%ymm15,%ymm11,%ymm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
+ vpslldq $4,%xmm2,%xmm2
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm2
+
+ jmp .L_CALC_AAD_done_mBgdvxqgFGebeug
+.L_AAD_blocks_1_mBgdvxqgFGebeug:
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%xmm11{%k1}{z}
+ vpshufb %xmm16,%xmm11,%xmm11
+ vpxorq %zmm2,%zmm11,%zmm11
+ vmovdqu64 336(%rsi),%xmm15
+ vpclmulqdq $0x01,%xmm15,%xmm11,%xmm7
+ vpclmulqdq $0x10,%xmm15,%xmm11,%xmm8
+ vpclmulqdq $0x11,%xmm15,%xmm11,%xmm1
+ vpclmulqdq $0x00,%xmm15,%xmm11,%xmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm2
+ vpslldq $4,%xmm2,%xmm2
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm2
+
+.L_CALC_AAD_done_mBgdvxqgFGebeug:
+ movq %rcx,%r10
+ shlq $3,%r10
+ vmovq %r10,%xmm3
+
+
+ vpxorq %xmm2,%xmm3,%xmm2
+
+ vmovdqu64 336(%rsi),%xmm1
+
+ vpclmulqdq $0x11,%xmm1,%xmm2,%xmm11
+ vpclmulqdq $0x00,%xmm1,%xmm2,%xmm3
+ vpclmulqdq $0x01,%xmm1,%xmm2,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm2,%xmm2
+ vpxorq %xmm4,%xmm2,%xmm2
+
+ vpsrldq $8,%xmm2,%xmm4
+ vpslldq $8,%xmm2,%xmm2
+ vpxorq %xmm4,%xmm11,%xmm11
+ vpxorq %xmm3,%xmm2,%xmm2
+
+
+
+ vmovdqu64 POLY2(%rip),%xmm4
+
+ vpclmulqdq $0x01,%xmm2,%xmm4,%xmm3
+ vpslldq $8,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm2,%xmm2
+
+
+
+ vpclmulqdq $0x00,%xmm2,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm2,%xmm4,%xmm2
+ vpslldq $4,%xmm2,%xmm2
+
+ vpternlogq $0x96,%xmm3,%xmm11,%xmm2
+
+ vpshufb SHUF_MASK(%rip),%xmm2,%xmm2
+ jmp skip_iv_len_12_init_IV
+iv_len_12_init_IV:
+
+ vmovdqu8 ONEf(%rip),%xmm2
+ movq %rdx,%r11
+ movl $0x0000000000000fff,%r10d
+ kmovq %r10,%k1
+ vmovdqu8 (%r11),%xmm2{%k1}
+skip_iv_len_12_init_IV:
+ vmovdqu %xmm2,%xmm1
+
+
+ movl 240(%rdi),%r10d
+ cmpl $9,%r10d
+ je .Laes_128_wbuuzwjyGbjeaox
+ cmpl $11,%r10d
+ je .Laes_192_wbuuzwjyGbjeaox
+ cmpl $13,%r10d
+ je .Laes_256_wbuuzwjyGbjeaox
+ jmp .Lexit_aes_wbuuzwjyGbjeaox
+.align 32
+.Laes_128_wbuuzwjyGbjeaox:
+ vpxorq 0(%rdi),%xmm1,%xmm1
+
+ vaesenc 16(%rdi),%xmm1,%xmm1
+
+ vaesenc 32(%rdi),%xmm1,%xmm1
+
+ vaesenc 48(%rdi),%xmm1,%xmm1
+
+ vaesenc 64(%rdi),%xmm1,%xmm1
+
+ vaesenc 80(%rdi),%xmm1,%xmm1
+
+ vaesenc 96(%rdi),%xmm1,%xmm1
+
+ vaesenc 112(%rdi),%xmm1,%xmm1
+
+ vaesenc 128(%rdi),%xmm1,%xmm1
+
+ vaesenc 144(%rdi),%xmm1,%xmm1
+
+ vaesenclast 160(%rdi),%xmm1,%xmm1
+ jmp .Lexit_aes_wbuuzwjyGbjeaox
+.align 32
+.Laes_192_wbuuzwjyGbjeaox:
+ vpxorq 0(%rdi),%xmm1,%xmm1
+
+ vaesenc 16(%rdi),%xmm1,%xmm1
+
+ vaesenc 32(%rdi),%xmm1,%xmm1
+
+ vaesenc 48(%rdi),%xmm1,%xmm1
+
+ vaesenc 64(%rdi),%xmm1,%xmm1
+
+ vaesenc 80(%rdi),%xmm1,%xmm1
+
+ vaesenc 96(%rdi),%xmm1,%xmm1
+
+ vaesenc 112(%rdi),%xmm1,%xmm1
+
+ vaesenc 128(%rdi),%xmm1,%xmm1
+
+ vaesenc 144(%rdi),%xmm1,%xmm1
+
+ vaesenc 160(%rdi),%xmm1,%xmm1
+
+ vaesenc 176(%rdi),%xmm1,%xmm1
+
+ vaesenclast 192(%rdi),%xmm1,%xmm1
+ jmp .Lexit_aes_wbuuzwjyGbjeaox
+.align 32
+.Laes_256_wbuuzwjyGbjeaox:
+ vpxorq 0(%rdi),%xmm1,%xmm1
+
+ vaesenc 16(%rdi),%xmm1,%xmm1
+
+ vaesenc 32(%rdi),%xmm1,%xmm1
+
+ vaesenc 48(%rdi),%xmm1,%xmm1
+
+ vaesenc 64(%rdi),%xmm1,%xmm1
+
+ vaesenc 80(%rdi),%xmm1,%xmm1
+
+ vaesenc 96(%rdi),%xmm1,%xmm1
+
+ vaesenc 112(%rdi),%xmm1,%xmm1
+
+ vaesenc 128(%rdi),%xmm1,%xmm1
+
+ vaesenc 144(%rdi),%xmm1,%xmm1
+
+ vaesenc 160(%rdi),%xmm1,%xmm1
+
+ vaesenc 176(%rdi),%xmm1,%xmm1
+
+ vaesenc 192(%rdi),%xmm1,%xmm1
+
+ vaesenc 208(%rdi),%xmm1,%xmm1
+
+ vaesenclast 224(%rdi),%xmm1,%xmm1
+ jmp .Lexit_aes_wbuuzwjyGbjeaox
+.Lexit_aes_wbuuzwjyGbjeaox:
+
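+# Save EK(J0) for the final tag and the byte-swapped counter block back
+# into the GCM state at %rsi.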
+ vmovdqu %xmm1,32(%rsi)
+
+
+ vpshufb SHUF_MASK(%rip),%xmm2,%xmm2
+ vmovdqu %xmm2,0(%rsi)
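+# IVs longer than 256 bytes spilled hash-key powers onto the stack above;
+# scrub that area before restoring the caller's registers.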
+ cmpq $256,%rcx
+ jbe .Lskip_hkeys_cleanup_pseltoyDnFwppqb
+ vpxor %xmm0,%xmm0,%xmm0
+ vmovdqa64 %zmm0,0(%rsp)
+ vmovdqa64 %zmm0,64(%rsp)
+ vmovdqa64 %zmm0,128(%rsp)
+ vmovdqa64 %zmm0,192(%rsp)
+ vmovdqa64 %zmm0,256(%rsp)
+ vmovdqa64 %zmm0,320(%rsp)
+ vmovdqa64 %zmm0,384(%rsp)
+ vmovdqa64 %zmm0,448(%rsp)
+ vmovdqa64 %zmm0,512(%rsp)
+ vmovdqa64 %zmm0,576(%rsp)
+ vmovdqa64 %zmm0,640(%rsp)
+ vmovdqa64 %zmm0,704(%rsp)
+.Lskip_hkeys_cleanup_pseltoyDnFwppqb:
+ vzeroupper
+ leaq (%rbp),%rsp
+.cfi_def_cfa_register %rsp
+ popq %r15
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r15
+ popq %r14
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r14
+ popq %r13
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r13
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ popq %rbp
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbp
+ popq %rbx
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbx
+.Labort_setiv:
+ .byte 0xf3,0xc3
+.Lsetiv_seh_end:
+.cfi_endproc
+.size ossl_aes_gcm_setiv_avx512, .-ossl_aes_gcm_setiv_avx512
+.globl ossl_aes_gcm_update_aad_avx512
+.type ossl_aes_gcm_update_aad_avx512,@function
+.align 32
+ossl_aes_gcm_update_aad_avx512:
+.cfi_startproc
+.Lghash_seh_begin:
+.byte 243,15,30,250
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+.Lghash_seh_push_rbx:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+.Lghash_seh_push_rbp:
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+.Lghash_seh_push_r12:
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+.Lghash_seh_push_r13:
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+.Lghash_seh_push_r14:
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lghash_seh_push_r15:
+
+
+
+
+
+
+
+
+
+
+ leaq 0(%rsp),%rbp
+.cfi_def_cfa_register %rbp
+.Lghash_seh_setfp:
+
+.Lghash_seh_prolog_end:
+ subq $820,%rsp
+ andq $(-64),%rsp
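+# ossl_aes_gcm_update_aad_avx512: %rdi is the GCM context (running GHASH
+# value at 64(%rdi), precomputed hash-key powers from 96(%rdi) up);
+# %rsi/%rdx are the AAD pointer and length.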
+ vmovdqu64 64(%rdi),%xmm14
+ movq %rsi,%r10
+ movq %rdx,%r11
+ orq %r11,%r11
+ jz .L_CALC_AAD_done_ijFECAxDcrvrgja
+
+ xorq %rbx,%rbx
+ vmovdqa64 SHUF_MASK(%rip),%zmm16
+
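+# Bulk AAD loop: GHASH 48 blocks (768 bytes) per iteration.  On the first
+# pass (%rbx == 0) the 16 context hash-key powers are copied to the stack
+# and 32 higher powers are derived from them; later passes reuse the cache.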
+.L_get_AAD_loop48x16_ijFECAxDcrvrgja:
+ cmpq $768,%r11
+ jl .L_exit_AAD_loop48x16_ijFECAxDcrvrgja
+ vmovdqu64 0(%r10),%zmm11
+ vmovdqu64 64(%r10),%zmm3
+ vmovdqu64 128(%r10),%zmm4
+ vmovdqu64 192(%r10),%zmm5
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpshufb %zmm16,%zmm5,%zmm5
+ testq %rbx,%rbx
+ jnz .L_skip_hkeys_precomputation_AfEjmfnrFdFcycC
+
+ vmovdqu64 288(%rdi),%zmm1
+ vmovdqu64 %zmm1,704(%rsp)
+
+ vmovdqu64 224(%rdi),%zmm9
+ vmovdqu64 %zmm9,640(%rsp)
+
+
+ vshufi64x2 $0x00,%zmm9,%zmm9,%zmm9
+
+ vmovdqu64 160(%rdi),%zmm10
+ vmovdqu64 %zmm10,576(%rsp)
+
+ vmovdqu64 96(%rdi),%zmm12
+ vmovdqu64 %zmm12,512(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm10,%zmm10
+
+ vpsrldq $8,%zmm10,%zmm17
+ vpslldq $8,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
+ vpslldq $4,%zmm10,%zmm10
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm10
+
+ vmovdqu64 %zmm10,448(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm12,%zmm12
+
+ vpsrldq $8,%zmm12,%zmm17
+ vpslldq $8,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
+ vpslldq $4,%zmm12,%zmm12
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm12
+
+ vmovdqu64 %zmm12,384(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm10,%zmm10
+
+ vpsrldq $8,%zmm10,%zmm17
+ vpslldq $8,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
+ vpslldq $4,%zmm10,%zmm10
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm10
+
+ vmovdqu64 %zmm10,320(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm12,%zmm12
+
+ vpsrldq $8,%zmm12,%zmm17
+ vpslldq $8,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
+ vpslldq $4,%zmm12,%zmm12
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm12
+
+ vmovdqu64 %zmm12,256(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm10,%zmm10
+
+ vpsrldq $8,%zmm10,%zmm17
+ vpslldq $8,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
+ vpslldq $4,%zmm10,%zmm10
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm10
+
+ vmovdqu64 %zmm10,192(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm12,%zmm12
+
+ vpsrldq $8,%zmm12,%zmm17
+ vpslldq $8,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
+ vpslldq $4,%zmm12,%zmm12
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm12
+
+ vmovdqu64 %zmm12,128(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm10,%zmm10
+
+ vpsrldq $8,%zmm10,%zmm17
+ vpslldq $8,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
+ vpslldq $4,%zmm10,%zmm10
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm10
+
+ vmovdqu64 %zmm10,64(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm12,%zmm12
+
+ vpsrldq $8,%zmm12,%zmm17
+ vpslldq $8,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
+ vpslldq $4,%zmm12,%zmm12
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm12
+
+ vmovdqu64 %zmm12,0(%rsp)
+.L_skip_hkeys_precomputation_AfEjmfnrFdFcycC:
+ movq $1,%rbx
+ vpxorq %zmm14,%zmm11,%zmm11
+ vmovdqu64 0(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
+ vmovdqu64 64(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
+ vpxorq %zmm17,%zmm10,%zmm7
+ vpxorq %zmm13,%zmm1,%zmm6
+ vpxorq %zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+ vmovdqu64 128(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
+ vmovdqu64 192(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
+
+ vpternlogq $0x96,%zmm17,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm13,%zmm1,%zmm6
+ vpternlogq $0x96,%zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+ vmovdqu64 256(%r10),%zmm11
+ vmovdqu64 320(%r10),%zmm3
+ vmovdqu64 384(%r10),%zmm4
+ vmovdqu64 448(%r10),%zmm5
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpshufb %zmm16,%zmm5,%zmm5
+ vmovdqu64 256(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
+ vmovdqu64 320(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
+ vpternlogq $0x96,%zmm17,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm13,%zmm1,%zmm6
+ vpternlogq $0x96,%zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+ vmovdqu64 384(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
+ vmovdqu64 448(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
+
+ vpternlogq $0x96,%zmm17,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm13,%zmm1,%zmm6
+ vpternlogq $0x96,%zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+ vmovdqu64 512(%r10),%zmm11
+ vmovdqu64 576(%r10),%zmm3
+ vmovdqu64 640(%r10),%zmm4
+ vmovdqu64 704(%r10),%zmm5
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpshufb %zmm16,%zmm5,%zmm5
+ vmovdqu64 512(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
+ vmovdqu64 576(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
+ vpternlogq $0x96,%zmm17,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm13,%zmm1,%zmm6
+ vpternlogq $0x96,%zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+ vmovdqu64 640(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
+ vmovdqu64 704(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
+
+ vpternlogq $0x96,%zmm17,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm13,%zmm1,%zmm6
+ vpternlogq $0x96,%zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+
+ vpsrldq $8,%zmm7,%zmm1
+ vpslldq $8,%zmm7,%zmm9
+ vpxorq %zmm1,%zmm6,%zmm6
+ vpxorq %zmm9,%zmm8,%zmm8
+ vextracti64x4 $1,%zmm6,%ymm1
+ vpxorq %ymm1,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm1
+ vpxorq %xmm1,%xmm6,%xmm6
+ vextracti64x4 $1,%zmm8,%ymm9
+ vpxorq %ymm9,%ymm8,%ymm8
+ vextracti32x4 $1,%ymm8,%xmm9
+ vpxorq %xmm9,%xmm8,%xmm8
+ vmovdqa64 POLY2(%rip),%xmm10
+
+
+ vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1
+ vpslldq $8,%xmm1,%xmm1
+ vpxorq %xmm1,%xmm8,%xmm1
+
+
+ vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9
+ vpsrldq $4,%xmm9,%xmm9
+ vpclmulqdq $0x10,%xmm1,%xmm10,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm6,%xmm9,%xmm14
+
+ subq $768,%r11
+ je .L_CALC_AAD_done_ijFECAxDcrvrgja
+
+ addq $768,%r10
+ jmp .L_get_AAD_loop48x16_ijFECAxDcrvrgja
+
+.L_exit_AAD_loop48x16_ijFECAxDcrvrgja:
+
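+# Fewer than 768 AAD bytes remain: handle a 512-byte (32-block) chunk if
+# present, then a 256-byte (16-block) chunk, then the masked tail below.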
+ cmpq $512,%r11
+ jl .L_less_than_32x16_ijFECAxDcrvrgja
+
+ vmovdqu64 0(%r10),%zmm11
+ vmovdqu64 64(%r10),%zmm3
+ vmovdqu64 128(%r10),%zmm4
+ vmovdqu64 192(%r10),%zmm5
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpshufb %zmm16,%zmm5,%zmm5
+ testq %rbx,%rbx
+ jnz .L_skip_hkeys_precomputation_kvsjACAeAekBEdd
+
+ vmovdqu64 288(%rdi),%zmm1
+ vmovdqu64 %zmm1,704(%rsp)
+
+ vmovdqu64 224(%rdi),%zmm9
+ vmovdqu64 %zmm9,640(%rsp)
+
+
+ vshufi64x2 $0x00,%zmm9,%zmm9,%zmm9
+
+ vmovdqu64 160(%rdi),%zmm10
+ vmovdqu64 %zmm10,576(%rsp)
+
+ vmovdqu64 96(%rdi),%zmm12
+ vmovdqu64 %zmm12,512(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm10,%zmm10
+
+ vpsrldq $8,%zmm10,%zmm17
+ vpslldq $8,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
+ vpslldq $4,%zmm10,%zmm10
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm10
+
+ vmovdqu64 %zmm10,448(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm12,%zmm12
+
+ vpsrldq $8,%zmm12,%zmm17
+ vpslldq $8,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
+ vpslldq $4,%zmm12,%zmm12
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm12
+
+ vmovdqu64 %zmm12,384(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm10,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm10,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm10,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm10,%zmm10
+
+ vpsrldq $8,%zmm10,%zmm17
+ vpslldq $8,%zmm10,%zmm10
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm10,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm10,%zmm10
+
+
+
+ vpclmulqdq $0x00,%zmm10,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm10,%zmm17,%zmm10
+ vpslldq $4,%zmm10,%zmm10
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm10
+
+ vmovdqu64 %zmm10,320(%rsp)
+
+ vpclmulqdq $0x11,%zmm9,%zmm12,%zmm13
+ vpclmulqdq $0x00,%zmm9,%zmm12,%zmm15
+ vpclmulqdq $0x01,%zmm9,%zmm12,%zmm17
+ vpclmulqdq $0x10,%zmm9,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm12,%zmm12
+
+ vpsrldq $8,%zmm12,%zmm17
+ vpslldq $8,%zmm12,%zmm12
+ vpxorq %zmm17,%zmm13,%zmm13
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm17
+
+ vpclmulqdq $0x01,%zmm12,%zmm17,%zmm15
+ vpslldq $8,%zmm15,%zmm15
+ vpxorq %zmm15,%zmm12,%zmm12
+
+
+
+ vpclmulqdq $0x00,%zmm12,%zmm17,%zmm15
+ vpsrldq $4,%zmm15,%zmm15
+ vpclmulqdq $0x10,%zmm12,%zmm17,%zmm12
+ vpslldq $4,%zmm12,%zmm12
+
+ vpternlogq $0x96,%zmm15,%zmm13,%zmm12
+
+ vmovdqu64 %zmm12,256(%rsp)
+.L_skip_hkeys_precomputation_kvsjACAeAekBEdd:
+ movq $1,%rbx
+ vpxorq %zmm14,%zmm11,%zmm11
+ vmovdqu64 256(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
+ vmovdqu64 320(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
+ vpxorq %zmm17,%zmm10,%zmm7
+ vpxorq %zmm13,%zmm1,%zmm6
+ vpxorq %zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+ vmovdqu64 384(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
+ vmovdqu64 448(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
+
+ vpternlogq $0x96,%zmm17,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm13,%zmm1,%zmm6
+ vpternlogq $0x96,%zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+ vmovdqu64 256(%r10),%zmm11
+ vmovdqu64 320(%r10),%zmm3
+ vmovdqu64 384(%r10),%zmm4
+ vmovdqu64 448(%r10),%zmm5
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpshufb %zmm16,%zmm5,%zmm5
+ vmovdqu64 512(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
+ vmovdqu64 576(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
+ vpternlogq $0x96,%zmm17,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm13,%zmm1,%zmm6
+ vpternlogq $0x96,%zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+ vmovdqu64 640(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
+ vmovdqu64 704(%rsp),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
+
+ vpternlogq $0x96,%zmm17,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm13,%zmm1,%zmm6
+ vpternlogq $0x96,%zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+
+ vpsrldq $8,%zmm7,%zmm1
+ vpslldq $8,%zmm7,%zmm9
+ vpxorq %zmm1,%zmm6,%zmm6
+ vpxorq %zmm9,%zmm8,%zmm8
+ vextracti64x4 $1,%zmm6,%ymm1
+ vpxorq %ymm1,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm1
+ vpxorq %xmm1,%xmm6,%xmm6
+ vextracti64x4 $1,%zmm8,%ymm9
+ vpxorq %ymm9,%ymm8,%ymm8
+ vextracti32x4 $1,%ymm8,%xmm9
+ vpxorq %xmm9,%xmm8,%xmm8
+ vmovdqa64 POLY2(%rip),%xmm10
+
+
+ vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1
+ vpslldq $8,%xmm1,%xmm1
+ vpxorq %xmm1,%xmm8,%xmm1
+
+
+ vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9
+ vpsrldq $4,%xmm9,%xmm9
+ vpclmulqdq $0x10,%xmm1,%xmm10,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm6,%xmm9,%xmm14
+
+ subq $512,%r11
+ je .L_CALC_AAD_done_ijFECAxDcrvrgja
+
+ addq $512,%r10
+ jmp .L_less_than_16x16_ijFECAxDcrvrgja
+
+.L_less_than_32x16_ijFECAxDcrvrgja:
+ cmpq $256,%r11
+ jl .L_less_than_16x16_ijFECAxDcrvrgja
+
+ vmovdqu64 0(%r10),%zmm11
+ vmovdqu64 64(%r10),%zmm3
+ vmovdqu64 128(%r10),%zmm4
+ vmovdqu64 192(%r10),%zmm5
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpshufb %zmm16,%zmm5,%zmm5
+ vpxorq %zmm14,%zmm11,%zmm11
+ vmovdqu64 96(%rdi),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm11,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm11,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm11,%zmm12
+ vmovdqu64 160(%rdi),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm3,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm3,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm3,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm3,%zmm18
+ vpxorq %zmm17,%zmm10,%zmm7
+ vpxorq %zmm13,%zmm1,%zmm6
+ vpxorq %zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+ vmovdqu64 224(%rdi),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm4,%zmm1
+ vpclmulqdq $0x00,%zmm19,%zmm4,%zmm9
+ vpclmulqdq $0x01,%zmm19,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm19,%zmm4,%zmm12
+ vmovdqu64 288(%rdi),%zmm19
+ vpclmulqdq $0x11,%zmm19,%zmm5,%zmm13
+ vpclmulqdq $0x00,%zmm19,%zmm5,%zmm15
+ vpclmulqdq $0x01,%zmm19,%zmm5,%zmm17
+ vpclmulqdq $0x10,%zmm19,%zmm5,%zmm18
+
+ vpternlogq $0x96,%zmm17,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm13,%zmm1,%zmm6
+ vpternlogq $0x96,%zmm15,%zmm9,%zmm8
+ vpternlogq $0x96,%zmm18,%zmm12,%zmm7
+
+ vpsrldq $8,%zmm7,%zmm1
+ vpslldq $8,%zmm7,%zmm9
+ vpxorq %zmm1,%zmm6,%zmm6
+ vpxorq %zmm9,%zmm8,%zmm8
+ vextracti64x4 $1,%zmm6,%ymm1
+ vpxorq %ymm1,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm1
+ vpxorq %xmm1,%xmm6,%xmm6
+ vextracti64x4 $1,%zmm8,%ymm9
+ vpxorq %ymm9,%ymm8,%ymm8
+ vextracti32x4 $1,%ymm8,%xmm9
+ vpxorq %xmm9,%xmm8,%xmm8
+ vmovdqa64 POLY2(%rip),%xmm10
+
+
+ vpclmulqdq $0x01,%xmm8,%xmm10,%xmm1
+ vpslldq $8,%xmm1,%xmm1
+ vpxorq %xmm1,%xmm8,%xmm1
+
+
+ vpclmulqdq $0x00,%xmm1,%xmm10,%xmm9
+ vpsrldq $4,%xmm9,%xmm9
+ vpclmulqdq $0x10,%xmm1,%xmm10,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm6,%xmm9,%xmm14
+
+ subq $256,%r11
+ je .L_CALC_AAD_done_ijFECAxDcrvrgja
+
+ addq $256,%r10
+
+.L_less_than_16x16_ijFECAxDcrvrgja:
+
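+# AAD tail (< 256 bytes): round the remaining length up to whole 16-byte
+# blocks and dispatch to the matching _AAD_blocks_N handler; the
+# byte64_len_to_mask_table entry masks the final, possibly partial,
+# 64-byte load.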
+ leaq byte64_len_to_mask_table(%rip),%r12
+ leaq (%r12,%r11,8),%r12
+
+
+ addl $15,%r11d
+ shrl $4,%r11d
+ cmpl $2,%r11d
+ jb .L_AAD_blocks_1_ijFECAxDcrvrgja
+ je .L_AAD_blocks_2_ijFECAxDcrvrgja
+ cmpl $4,%r11d
+ jb .L_AAD_blocks_3_ijFECAxDcrvrgja
+ je .L_AAD_blocks_4_ijFECAxDcrvrgja
+ cmpl $6,%r11d
+ jb .L_AAD_blocks_5_ijFECAxDcrvrgja
+ je .L_AAD_blocks_6_ijFECAxDcrvrgja
+ cmpl $8,%r11d
+ jb .L_AAD_blocks_7_ijFECAxDcrvrgja
+ je .L_AAD_blocks_8_ijFECAxDcrvrgja
+ cmpl $10,%r11d
+ jb .L_AAD_blocks_9_ijFECAxDcrvrgja
+ je .L_AAD_blocks_10_ijFECAxDcrvrgja
+ cmpl $12,%r11d
+ jb .L_AAD_blocks_11_ijFECAxDcrvrgja
+ je .L_AAD_blocks_12_ijFECAxDcrvrgja
+ cmpl $14,%r11d
+ jb .L_AAD_blocks_13_ijFECAxDcrvrgja
+ je .L_AAD_blocks_14_ijFECAxDcrvrgja
+ cmpl $15,%r11d
+ je .L_AAD_blocks_15_ijFECAxDcrvrgja
+.L_AAD_blocks_16_ijFECAxDcrvrgja:
+ subq $1536,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%zmm3
+ vmovdqu8 128(%r10),%zmm4
+ vmovdqu8 192(%r10),%zmm5{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpshufb %zmm16,%zmm5,%zmm5
+ vpxorq %zmm14,%zmm11,%zmm11
+ vmovdqu64 96(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
+ vmovdqu64 160(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
+ vmovdqu64 224(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11
+ vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3
+ vpternlogq $0x96,%zmm9,%zmm11,%zmm1
+ vpternlogq $0x96,%zmm10,%zmm3,%zmm6
+ vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11
+ vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3
+ vpternlogq $0x96,%zmm12,%zmm11,%zmm7
+ vpternlogq $0x96,%zmm13,%zmm3,%zmm8
+ vmovdqu64 288(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm5,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm5,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm5,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm5,%zmm13
+ vpxorq %zmm9,%zmm1,%zmm9
+ vpxorq %zmm10,%zmm6,%zmm10
+ vpxorq %zmm12,%zmm7,%zmm12
+ vpxorq %zmm13,%zmm8,%zmm13
+
+ vpxorq %zmm13,%zmm12,%zmm12
+ vpsrldq $8,%zmm12,%zmm7
+ vpslldq $8,%zmm12,%zmm8
+ vpxorq %zmm7,%zmm9,%zmm1
+ vpxorq %zmm8,%zmm10,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm14
+
+ jmp .L_CALC_AAD_done_ijFECAxDcrvrgja
+.L_AAD_blocks_15_ijFECAxDcrvrgja:
+ subq $1536,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%zmm3
+ vmovdqu8 128(%r10),%zmm4
+ vmovdqu8 192(%r10),%zmm5{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpshufb %zmm16,%zmm5,%zmm5
+ vpxorq %zmm14,%zmm11,%zmm11
+ vmovdqu64 112(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
+ vmovdqu64 176(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
+ vmovdqu64 240(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11
+ vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3
+ vpternlogq $0x96,%zmm1,%zmm11,%zmm9
+ vpternlogq $0x96,%zmm6,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11
+ vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3
+ vpternlogq $0x96,%zmm7,%zmm11,%zmm12
+ vpternlogq $0x96,%zmm8,%zmm3,%zmm13
+ vmovdqu64 304(%rdi),%ymm15
+ vinserti64x2 $2,336(%rdi),%zmm15,%zmm15
+ vpclmulqdq $0x01,%zmm15,%zmm5,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm5,%zmm8
+ vpclmulqdq $0x11,%zmm15,%zmm5,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm5,%zmm6
+
+ vpxorq %zmm12,%zmm7,%zmm7
+ vpxorq %zmm13,%zmm8,%zmm8
+ vpxorq %zmm9,%zmm1,%zmm1
+ vpxorq %zmm10,%zmm6,%zmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm14
+
+ jmp .L_CALC_AAD_done_ijFECAxDcrvrgja
+.L_AAD_blocks_14_ijFECAxDcrvrgja:
+ subq $1536,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%zmm3
+ vmovdqu8 128(%r10),%zmm4
+ vmovdqu8 192(%r10),%ymm5{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpshufb %ymm16,%ymm5,%ymm5
+ vpxorq %zmm14,%zmm11,%zmm11
+ vmovdqu64 128(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
+ vmovdqu64 192(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
+ vmovdqu64 256(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11
+ vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3
+ vpternlogq $0x96,%zmm1,%zmm11,%zmm9
+ vpternlogq $0x96,%zmm6,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11
+ vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3
+ vpternlogq $0x96,%zmm7,%zmm11,%zmm12
+ vpternlogq $0x96,%zmm8,%zmm3,%zmm13
+ vmovdqu64 320(%rdi),%ymm15
+ vpclmulqdq $0x01,%ymm15,%ymm5,%ymm7
+ vpclmulqdq $0x10,%ymm15,%ymm5,%ymm8
+ vpclmulqdq $0x11,%ymm15,%ymm5,%ymm1
+ vpclmulqdq $0x00,%ymm15,%ymm5,%ymm6
+
+ vpxorq %zmm12,%zmm7,%zmm7
+ vpxorq %zmm13,%zmm8,%zmm8
+ vpxorq %zmm9,%zmm1,%zmm1
+ vpxorq %zmm10,%zmm6,%zmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm14
+
+ jmp .L_CALC_AAD_done_ijFECAxDcrvrgja
+.L_AAD_blocks_13_ijFECAxDcrvrgja:
+ subq $1536,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%zmm3
+ vmovdqu8 128(%r10),%zmm4
+ vmovdqu8 192(%r10),%xmm5{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpshufb %xmm16,%xmm5,%xmm5
+ vpxorq %zmm14,%zmm11,%zmm11
+ vmovdqu64 144(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
+ vmovdqu64 208(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
+ vmovdqu64 272(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11
+ vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3
+ vpternlogq $0x96,%zmm1,%zmm11,%zmm9
+ vpternlogq $0x96,%zmm6,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11
+ vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3
+ vpternlogq $0x96,%zmm7,%zmm11,%zmm12
+ vpternlogq $0x96,%zmm8,%zmm3,%zmm13
+ vmovdqu64 336(%rdi),%xmm15
+ vpclmulqdq $0x01,%xmm15,%xmm5,%xmm7
+ vpclmulqdq $0x10,%xmm15,%xmm5,%xmm8
+ vpclmulqdq $0x11,%xmm15,%xmm5,%xmm1
+ vpclmulqdq $0x00,%xmm15,%xmm5,%xmm6
+
+ vpxorq %zmm12,%zmm7,%zmm7
+ vpxorq %zmm13,%zmm8,%zmm8
+ vpxorq %zmm9,%zmm1,%zmm1
+ vpxorq %zmm10,%zmm6,%zmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm14
+
+ jmp .L_CALC_AAD_done_ijFECAxDcrvrgja
+.L_AAD_blocks_12_ijFECAxDcrvrgja:
+ subq $1024,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%zmm3
+ vmovdqu8 128(%r10),%zmm4{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpxorq %zmm14,%zmm11,%zmm11
+ vmovdqu64 160(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
+ vmovdqu64 224(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
+ vmovdqu64 288(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm4,%zmm11
+ vpclmulqdq $0x00,%zmm15,%zmm4,%zmm3
+ vpternlogq $0x96,%zmm1,%zmm11,%zmm9
+ vpternlogq $0x96,%zmm6,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm4,%zmm11
+ vpclmulqdq $0x10,%zmm15,%zmm4,%zmm3
+ vpternlogq $0x96,%zmm7,%zmm11,%zmm12
+ vpternlogq $0x96,%zmm8,%zmm3,%zmm13
+
+ vpxorq %zmm13,%zmm12,%zmm12
+ vpsrldq $8,%zmm12,%zmm7
+ vpslldq $8,%zmm12,%zmm8
+ vpxorq %zmm7,%zmm9,%zmm1
+ vpxorq %zmm8,%zmm10,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm14
+
+ jmp .L_CALC_AAD_done_ijFECAxDcrvrgja
+.L_AAD_blocks_11_ijFECAxDcrvrgja:
+ subq $1024,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%zmm3
+ vmovdqu8 128(%r10),%zmm4{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %zmm16,%zmm4,%zmm4
+ vpxorq %zmm14,%zmm11,%zmm11
+ vmovdqu64 176(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
+ vmovdqu64 240(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
+ vpxorq %zmm9,%zmm1,%zmm9
+ vpxorq %zmm10,%zmm6,%zmm10
+ vpxorq %zmm12,%zmm7,%zmm12
+ vpxorq %zmm13,%zmm8,%zmm13
+ vmovdqu64 304(%rdi),%ymm15
+ vinserti64x2 $2,336(%rdi),%zmm15,%zmm15
+ vpclmulqdq $0x01,%zmm15,%zmm4,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm4,%zmm8
+ vpclmulqdq $0x11,%zmm15,%zmm4,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm4,%zmm6
+
+ vpxorq %zmm12,%zmm7,%zmm7
+ vpxorq %zmm13,%zmm8,%zmm8
+ vpxorq %zmm9,%zmm1,%zmm1
+ vpxorq %zmm10,%zmm6,%zmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm14
+
+ jmp .L_CALC_AAD_done_ijFECAxDcrvrgja
+.L_AAD_blocks_10_ijFECAxDcrvrgja:
+ subq $1024,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%zmm3
+ vmovdqu8 128(%r10),%ymm4{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %ymm16,%ymm4,%ymm4
+ vpxorq %zmm14,%zmm11,%zmm11
+ vmovdqu64 192(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
+ vmovdqu64 256(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
+ vpxorq %zmm9,%zmm1,%zmm9
+ vpxorq %zmm10,%zmm6,%zmm10
+ vpxorq %zmm12,%zmm7,%zmm12
+ vpxorq %zmm13,%zmm8,%zmm13
+ vmovdqu64 320(%rdi),%ymm15
+ vpclmulqdq $0x01,%ymm15,%ymm4,%ymm7
+ vpclmulqdq $0x10,%ymm15,%ymm4,%ymm8
+ vpclmulqdq $0x11,%ymm15,%ymm4,%ymm1
+ vpclmulqdq $0x00,%ymm15,%ymm4,%ymm6
+
+ vpxorq %zmm12,%zmm7,%zmm7
+ vpxorq %zmm13,%zmm8,%zmm8
+ vpxorq %zmm9,%zmm1,%zmm1
+ vpxorq %zmm10,%zmm6,%zmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm14
+
+ jmp .L_CALC_AAD_done_ijFECAxDcrvrgja
+.L_AAD_blocks_9_ijFECAxDcrvrgja:
+ subq $1024,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%zmm3
+ vmovdqu8 128(%r10),%xmm4{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpshufb %xmm16,%xmm4,%xmm4
+ vpxorq %zmm14,%zmm11,%zmm11
+ vmovdqu64 208(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
+ vmovdqu64 272(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
+ vpxorq %zmm9,%zmm1,%zmm9
+ vpxorq %zmm10,%zmm6,%zmm10
+ vpxorq %zmm12,%zmm7,%zmm12
+ vpxorq %zmm13,%zmm8,%zmm13
+ vmovdqu64 336(%rdi),%xmm15
+ vpclmulqdq $0x01,%xmm15,%xmm4,%xmm7
+ vpclmulqdq $0x10,%xmm15,%xmm4,%xmm8
+ vpclmulqdq $0x11,%xmm15,%xmm4,%xmm1
+ vpclmulqdq $0x00,%xmm15,%xmm4,%xmm6
+
+ vpxorq %zmm12,%zmm7,%zmm7
+ vpxorq %zmm13,%zmm8,%zmm8
+ vpxorq %zmm9,%zmm1,%zmm1
+ vpxorq %zmm10,%zmm6,%zmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm14
+
+ jmp .L_CALC_AAD_done_ijFECAxDcrvrgja
+.L_AAD_blocks_8_ijFECAxDcrvrgja:
+ subq $512,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%zmm3{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpxorq %zmm14,%zmm11,%zmm11
+ vmovdqu64 224(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
+ vmovdqu64 288(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm3,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm3,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm3,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm3,%zmm13
+ vpxorq %zmm9,%zmm1,%zmm9
+ vpxorq %zmm10,%zmm6,%zmm10
+ vpxorq %zmm12,%zmm7,%zmm12
+ vpxorq %zmm13,%zmm8,%zmm13
+
+ vpxorq %zmm13,%zmm12,%zmm12
+ vpsrldq $8,%zmm12,%zmm7
+ vpslldq $8,%zmm12,%zmm8
+ vpxorq %zmm7,%zmm9,%zmm1
+ vpxorq %zmm8,%zmm10,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm14
+
+ jmp .L_CALC_AAD_done_ijFECAxDcrvrgja
+.L_AAD_blocks_7_ijFECAxDcrvrgja:
+ subq $512,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%zmm3{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %zmm16,%zmm3,%zmm3
+ vpxorq %zmm14,%zmm11,%zmm11
+ vmovdqu64 240(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13
+ vmovdqu64 304(%rdi),%ymm15
+ vinserti64x2 $2,336(%rdi),%zmm15,%zmm15
+ vpclmulqdq $0x01,%zmm15,%zmm3,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm3,%zmm8
+ vpclmulqdq $0x11,%zmm15,%zmm3,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm3,%zmm6
+
+ vpxorq %zmm12,%zmm7,%zmm7
+ vpxorq %zmm13,%zmm8,%zmm8
+ vpxorq %zmm9,%zmm1,%zmm1
+ vpxorq %zmm10,%zmm6,%zmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm14
+
+ jmp .L_CALC_AAD_done_ijFECAxDcrvrgja
+.L_AAD_blocks_6_ijFECAxDcrvrgja:
+ subq $512,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%ymm3{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %ymm16,%ymm3,%ymm3
+ vpxorq %zmm14,%zmm11,%zmm11
+ vmovdqu64 256(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13
+ vmovdqu64 320(%rdi),%ymm15
+ vpclmulqdq $0x01,%ymm15,%ymm3,%ymm7
+ vpclmulqdq $0x10,%ymm15,%ymm3,%ymm8
+ vpclmulqdq $0x11,%ymm15,%ymm3,%ymm1
+ vpclmulqdq $0x00,%ymm15,%ymm3,%ymm6
+
+ vpxorq %zmm12,%zmm7,%zmm7
+ vpxorq %zmm13,%zmm8,%zmm8
+ vpxorq %zmm9,%zmm1,%zmm1
+ vpxorq %zmm10,%zmm6,%zmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm14
+
+ jmp .L_CALC_AAD_done_ijFECAxDcrvrgja
+.L_AAD_blocks_5_ijFECAxDcrvrgja:
+ subq $512,%r12
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11
+ vmovdqu8 64(%r10),%xmm3{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpshufb %xmm16,%xmm3,%xmm3
+ vpxorq %zmm14,%zmm11,%zmm11
+ vmovdqu64 272(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13
+ vmovdqu64 336(%rdi),%xmm15
+ vpclmulqdq $0x01,%xmm15,%xmm3,%xmm7
+ vpclmulqdq $0x10,%xmm15,%xmm3,%xmm8
+ vpclmulqdq $0x11,%xmm15,%xmm3,%xmm1
+ vpclmulqdq $0x00,%xmm15,%xmm3,%xmm6
+
+ vpxorq %zmm12,%zmm7,%zmm7
+ vpxorq %zmm13,%zmm8,%zmm8
+ vpxorq %zmm9,%zmm1,%zmm1
+ vpxorq %zmm10,%zmm6,%zmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm14
+
+ jmp .L_CALC_AAD_done_ijFECAxDcrvrgja
+.L_AAD_blocks_4_ijFECAxDcrvrgja:
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpxorq %zmm14,%zmm11,%zmm11
+ vmovdqu64 288(%rdi),%zmm15
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm9
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm10
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm12
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm13
+
+ vpxorq %zmm13,%zmm12,%zmm12
+ vpsrldq $8,%zmm12,%zmm7
+ vpslldq $8,%zmm12,%zmm8
+ vpxorq %zmm7,%zmm9,%zmm1
+ vpxorq %zmm8,%zmm10,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm14
+
+ jmp .L_CALC_AAD_done_ijFECAxDcrvrgja
+.L_AAD_blocks_3_ijFECAxDcrvrgja:
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%zmm11{%k1}{z}
+ vpshufb %zmm16,%zmm11,%zmm11
+ vpxorq %zmm14,%zmm11,%zmm11
+ vmovdqu64 304(%rdi),%ymm15
+ vinserti64x2 $2,336(%rdi),%zmm15,%zmm15
+ vpclmulqdq $0x01,%zmm15,%zmm11,%zmm7
+ vpclmulqdq $0x10,%zmm15,%zmm11,%zmm8
+ vpclmulqdq $0x11,%zmm15,%zmm11,%zmm1
+ vpclmulqdq $0x00,%zmm15,%zmm11,%zmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm14
+
+ jmp .L_CALC_AAD_done_ijFECAxDcrvrgja
+.L_AAD_blocks_2_ijFECAxDcrvrgja:
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%ymm11{%k1}{z}
+ vpshufb %ymm16,%ymm11,%ymm11
+ vpxorq %zmm14,%zmm11,%zmm11
+ vmovdqu64 320(%rdi),%ymm15
+ vpclmulqdq $0x01,%ymm15,%ymm11,%ymm7
+ vpclmulqdq $0x10,%ymm15,%ymm11,%ymm8
+ vpclmulqdq $0x11,%ymm15,%ymm11,%ymm1
+ vpclmulqdq $0x00,%ymm15,%ymm11,%ymm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm14
+
+ jmp .L_CALC_AAD_done_ijFECAxDcrvrgja
+.L_AAD_blocks_1_ijFECAxDcrvrgja:
+ kmovq (%r12),%k1
+ vmovdqu8 0(%r10),%xmm11{%k1}{z}
+ vpshufb %xmm16,%xmm11,%xmm11
+ vpxorq %zmm14,%zmm11,%zmm11
+ vmovdqu64 336(%rdi),%xmm15
+ vpclmulqdq $0x01,%xmm15,%xmm11,%xmm7
+ vpclmulqdq $0x10,%xmm15,%xmm11,%xmm8
+ vpclmulqdq $0x11,%xmm15,%xmm11,%xmm1
+ vpclmulqdq $0x00,%xmm15,%xmm11,%xmm6
+
+ vpxorq %zmm8,%zmm7,%zmm7
+ vpsrldq $8,%zmm7,%zmm12
+ vpslldq $8,%zmm7,%zmm13
+ vpxorq %zmm12,%zmm1,%zmm1
+ vpxorq %zmm13,%zmm6,%zmm6
+ vextracti64x4 $1,%zmm1,%ymm12
+ vpxorq %ymm12,%ymm1,%ymm1
+ vextracti32x4 $1,%ymm1,%xmm12
+ vpxorq %xmm12,%xmm1,%xmm1
+ vextracti64x4 $1,%zmm6,%ymm13
+ vpxorq %ymm13,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm13
+ vpxorq %xmm13,%xmm6,%xmm6
+ vmovdqa64 POLY2(%rip),%xmm15
+
+
+ vpclmulqdq $0x01,%xmm6,%xmm15,%xmm7
+ vpslldq $8,%xmm7,%xmm7
+ vpxorq %xmm7,%xmm6,%xmm7
+
+
+ vpclmulqdq $0x00,%xmm7,%xmm15,%xmm8
+ vpsrldq $4,%xmm8,%xmm8
+ vpclmulqdq $0x10,%xmm7,%xmm15,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm1,%xmm8,%xmm14
+
+.L_CALC_AAD_done_ijFECAxDcrvrgja:
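+# Write the updated GHASH accumulator back to the context and, if the bulk
+# path cached hash-key powers on the stack, zero them before returning.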
+ vmovdqu64 %xmm14,64(%rdi)
+ cmpq $256,%rdx
+ jbe .Lskip_hkeys_cleanup_qbvewaDGpzpiiAA
+ vpxor %xmm0,%xmm0,%xmm0
+ vmovdqa64 %zmm0,0(%rsp)
+ vmovdqa64 %zmm0,64(%rsp)
+ vmovdqa64 %zmm0,128(%rsp)
+ vmovdqa64 %zmm0,192(%rsp)
+ vmovdqa64 %zmm0,256(%rsp)
+ vmovdqa64 %zmm0,320(%rsp)
+ vmovdqa64 %zmm0,384(%rsp)
+ vmovdqa64 %zmm0,448(%rsp)
+ vmovdqa64 %zmm0,512(%rsp)
+ vmovdqa64 %zmm0,576(%rsp)
+ vmovdqa64 %zmm0,640(%rsp)
+ vmovdqa64 %zmm0,704(%rsp)
+.Lskip_hkeys_cleanup_qbvewaDGpzpiiAA:
+ vzeroupper
+ leaq (%rbp),%rsp
+.cfi_def_cfa_register %rsp
+ popq %r15
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r15
+ popq %r14
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r14
+ popq %r13
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r13
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ popq %rbp
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbp
+ popq %rbx
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbx
+.Lexit_update_aad:
+ .byte 0xf3,0xc3
+.Lghash_seh_end:
+.cfi_endproc
+.size ossl_aes_gcm_update_aad_avx512, .-ossl_aes_gcm_update_aad_avx512
+.globl ossl_aes_gcm_encrypt_avx512
+.type ossl_aes_gcm_encrypt_avx512,@function
+.align 32
+ossl_aes_gcm_encrypt_avx512:
+.cfi_startproc
+.Lencrypt_seh_begin:
+.byte 243,15,30,250
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+.Lencrypt_seh_push_rbx:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+.Lencrypt_seh_push_rbp:
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+.Lencrypt_seh_push_r12:
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+.Lencrypt_seh_push_r13:
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+.Lencrypt_seh_push_r14:
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lencrypt_seh_push_r15:
+
+
+
+
+
+
+
+
+
+
+ leaq 0(%rsp),%rbp
+.cfi_def_cfa_register %rbp
+.Lencrypt_seh_setfp:
+
+.Lencrypt_seh_prolog_end:
+ subq $1588,%rsp
+ andq $(-64),%rsp
+
+
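+# Dispatch on the round count at 240(%rdi): 9, 11 and 13 select the
+# AES-128, AES-192 and AES-256 bodies; anything else exits with %eax cleared.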
+ movl 240(%rdi),%eax
+ cmpl $9,%eax
+ je .Laes_gcm_encrypt_128_avx512
+ cmpl $11,%eax
+ je .Laes_gcm_encrypt_192_avx512
+ cmpl $13,%eax
+ je .Laes_gcm_encrypt_256_avx512
+ xorl %eax,%eax
+ jmp .Lexit_gcm_encrypt
+.align 32
+.Laes_gcm_encrypt_128_avx512:
+ orq %r8,%r8
+ je .L_enc_dec_done_pdDdEbGtmhbgzzj
+ xorq %r14,%r14
+ vmovdqu64 64(%rsi),%xmm14
+
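+# Partial-block handling: (%rdx) holds the number of bytes already processed
+# in the current 16-byte block.  If non-zero, complete that block first using
+# the byte-mask and shift tables, then fall through to the bulk path with
+# %rcx/%r9 as the in/out pointers and %r8 as the remaining length.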
+ movq (%rdx),%r11
+ orq %r11,%r11
+ je .L_partial_block_done_pxhfCnBixjkllFd
+ movl $16,%r10d
+ leaq byte_len_to_mask_table(%rip),%r12
+ cmpq %r10,%r8
+ cmovcq %r8,%r10
+ kmovw (%r12,%r10,2),%k1
+ vmovdqu8 (%rcx),%xmm0{%k1}{z}
+
+ vmovdqu64 16(%rsi),%xmm3
+ vmovdqu64 336(%rsi),%xmm4
+
+
+
+ leaq SHIFT_MASK(%rip),%r12
+ addq %r11,%r12
+ vmovdqu64 (%r12),%xmm5
+ vpshufb %xmm5,%xmm3,%xmm3
+ vpxorq %xmm0,%xmm3,%xmm3
+
+
+ leaq (%r8,%r11,1),%r13
+ subq $16,%r13
+ jge .L_no_extra_mask_pxhfCnBixjkllFd
+ subq %r13,%r12
+.L_no_extra_mask_pxhfCnBixjkllFd:
+
+
+
+ vmovdqu64 16(%r12),%xmm0
+ vpand %xmm0,%xmm3,%xmm3
+ vpshufb SHUF_MASK(%rip),%xmm3,%xmm3
+ vpshufb %xmm5,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm14,%xmm14
+ cmpq $0,%r13
+ jl .L_partial_incomplete_pxhfCnBixjkllFd
+
+ vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7
+ vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10
+ vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11
+ vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14
+ vpxorq %xmm11,%xmm14,%xmm14
+
+ vpsrldq $8,%xmm14,%xmm11
+ vpslldq $8,%xmm14,%xmm14
+ vpxorq %xmm11,%xmm7,%xmm7
+ vpxorq %xmm10,%xmm14,%xmm14
+
+
+
+ vmovdqu64 POLY2(%rip),%xmm11
+
+ vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10
+ vpslldq $8,%xmm10,%xmm10
+ vpxorq %xmm10,%xmm14,%xmm14
+
+
+
+ vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10
+ vpsrldq $4,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+
+ vpternlogq $0x96,%xmm10,%xmm7,%xmm14
+
+ movq $0,(%rdx)
+
+ movq %r11,%r12
+ movq $16,%r11
+ subq %r12,%r11
+ jmp .L_enc_dec_done_pxhfCnBixjkllFd
+
+.L_partial_incomplete_pxhfCnBixjkllFd:
+ addq %r8,(%rdx)
+ movq %r8,%r11
+
+.L_enc_dec_done_pxhfCnBixjkllFd:
+
+
+ leaq byte_len_to_mask_table(%rip),%r12
+ kmovw (%r12,%r11,2),%k1
+ vmovdqu64 %xmm14,64(%rsi)
+
+ vpshufb SHUF_MASK(%rip),%xmm3,%xmm3
+ vpshufb %xmm5,%xmm3,%xmm3
+ movq %r9,%r12
+ vmovdqu8 %xmm3,(%r12){%k1}
+.L_partial_block_done_pxhfCnBixjkllFd:
+ vmovdqu64 0(%rsi),%xmm2
+ subq %r11,%r8
+ je .L_enc_dec_done_pdDdEbGtmhbgzzj
+ cmpq $256,%r8
+ jbe .L_message_below_equal_16_blocks_pdDdEbGtmhbgzzj
+
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vmovdqa64 ddq_addbe_4444(%rip),%zmm27
+ vmovdqa64 ddq_addbe_1234(%rip),%zmm28
+
+
+
+
+
+
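+# %r15b caches the low byte of the 32-bit counter: while the next 16
+# increments cannot carry out of it (< 0xf0), the counters are bumped in
+# big-endian form directly; otherwise the slower add-then-byte-swap path
+# (_next_16_overflow) is taken.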
+ vmovd %xmm2,%r15d
+ andl $255,%r15d
+
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpshufb %zmm29,%zmm2,%zmm2
+
+
+
+ cmpb $240,%r15b
+ jae .L_next_16_overflow_mapiDClopxEitar
+ vpaddd %zmm28,%zmm2,%zmm7
+ vpaddd %zmm27,%zmm7,%zmm10
+ vpaddd %zmm27,%zmm10,%zmm11
+ vpaddd %zmm27,%zmm11,%zmm12
+ jmp .L_next_16_ok_mapiDClopxEitar
+.L_next_16_overflow_mapiDClopxEitar:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm12
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
+ vpaddd %zmm12,%zmm7,%zmm10
+ vpaddd %zmm12,%zmm10,%zmm11
+ vpaddd %zmm12,%zmm11,%zmm12
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vpshufb %zmm29,%zmm12,%zmm12
+.L_next_16_ok_mapiDClopxEitar:
+ vshufi64x2 $255,%zmm12,%zmm12,%zmm2
+ addb $16,%r15b
+
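+# Encrypt 16 counter blocks (four ZMM registers) through the ten AES-128
+# rounds, XOR with the input, store the output, and keep the byte-swapped
+# ciphertext on the stack for the GHASH that follows.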
+ vmovdqu8 0(%rcx,%r11,1),%zmm0
+ vmovdqu8 64(%rcx,%r11,1),%zmm3
+ vmovdqu8 128(%rcx,%r11,1),%zmm4
+ vmovdqu8 192(%rcx,%r11,1),%zmm5
+
+
+ vbroadcastf64x2 0(%rdi),%zmm6
+ vpxorq %zmm6,%zmm7,%zmm7
+ vpxorq %zmm6,%zmm10,%zmm10
+ vpxorq %zmm6,%zmm11,%zmm11
+ vpxorq %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 16(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 32(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 48(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 64(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 80(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 96(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 112(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 128(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 144(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 160(%rdi),%zmm6
+ vaesenclast %zmm6,%zmm7,%zmm7
+ vaesenclast %zmm6,%zmm10,%zmm10
+ vaesenclast %zmm6,%zmm11,%zmm11
+ vaesenclast %zmm6,%zmm12,%zmm12
+
+
+ vpxorq %zmm0,%zmm7,%zmm7
+ vpxorq %zmm3,%zmm10,%zmm10
+ vpxorq %zmm4,%zmm11,%zmm11
+ vpxorq %zmm5,%zmm12,%zmm12
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm7,0(%r10,%r11,1)
+ vmovdqu8 %zmm10,64(%r10,%r11,1)
+ vmovdqu8 %zmm11,128(%r10,%r11,1)
+ vmovdqu8 %zmm12,192(%r10,%r11,1)
+
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vpshufb %zmm29,%zmm12,%zmm12
+ vmovdqa64 %zmm7,768(%rsp)
+ vmovdqa64 %zmm10,832(%rsp)
+ vmovdqa64 %zmm11,896(%rsp)
+ vmovdqa64 %zmm12,960(%rsp)
+ testq %r14,%r14
+ jnz .L_skip_hkeys_precomputation_wEgffnstFkkCiax
+
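+# First pass only: copy the context's precomputed hash-key powers
+# (96(%rsi)..336(%rsi)) into the stack cache; more are derived later if the
+# message is long enough.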
+ vmovdqu64 288(%rsi),%zmm0
+ vmovdqu64 %zmm0,704(%rsp)
+
+ vmovdqu64 224(%rsi),%zmm3
+ vmovdqu64 %zmm3,640(%rsp)
+
+
+ vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
+
+ vmovdqu64 160(%rsi),%zmm4
+ vmovdqu64 %zmm4,576(%rsp)
+
+ vmovdqu64 96(%rsi),%zmm5
+ vmovdqu64 %zmm5,512(%rsp)
+.L_skip_hkeys_precomputation_wEgffnstFkkCiax:
+ cmpq $512,%r8
+ jb .L_message_below_32_blocks_pdDdEbGtmhbgzzj
+
+
+
+ cmpb $240,%r15b
+ jae .L_next_16_overflow_lzgFuCogmBcsocA
+ vpaddd %zmm28,%zmm2,%zmm7
+ vpaddd %zmm27,%zmm7,%zmm10
+ vpaddd %zmm27,%zmm10,%zmm11
+ vpaddd %zmm27,%zmm11,%zmm12
+ jmp .L_next_16_ok_lzgFuCogmBcsocA
+.L_next_16_overflow_lzgFuCogmBcsocA:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm12
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
+ vpaddd %zmm12,%zmm7,%zmm10
+ vpaddd %zmm12,%zmm10,%zmm11
+ vpaddd %zmm12,%zmm11,%zmm12
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vpshufb %zmm29,%zmm12,%zmm12
+.L_next_16_ok_lzgFuCogmBcsocA:
+ vshufi64x2 $255,%zmm12,%zmm12,%zmm2
+ addb $16,%r15b
+
+ vmovdqu8 256(%rcx,%r11,1),%zmm0
+ vmovdqu8 320(%rcx,%r11,1),%zmm3
+ vmovdqu8 384(%rcx,%r11,1),%zmm4
+ vmovdqu8 448(%rcx,%r11,1),%zmm5
+
+
+ vbroadcastf64x2 0(%rdi),%zmm6
+ vpxorq %zmm6,%zmm7,%zmm7
+ vpxorq %zmm6,%zmm10,%zmm10
+ vpxorq %zmm6,%zmm11,%zmm11
+ vpxorq %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 16(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 32(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 48(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 64(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 80(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 96(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 112(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 128(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 144(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 160(%rdi),%zmm6
+ vaesenclast %zmm6,%zmm7,%zmm7
+ vaesenclast %zmm6,%zmm10,%zmm10
+ vaesenclast %zmm6,%zmm11,%zmm11
+ vaesenclast %zmm6,%zmm12,%zmm12
+
+
+ vpxorq %zmm0,%zmm7,%zmm7
+ vpxorq %zmm3,%zmm10,%zmm10
+ vpxorq %zmm4,%zmm11,%zmm11
+ vpxorq %zmm5,%zmm12,%zmm12
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm7,256(%r10,%r11,1)
+ vmovdqu8 %zmm10,320(%r10,%r11,1)
+ vmovdqu8 %zmm11,384(%r10,%r11,1)
+ vmovdqu8 %zmm12,448(%r10,%r11,1)
+
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vpshufb %zmm29,%zmm12,%zmm12
+ vmovdqa64 %zmm7,1024(%rsp)
+ vmovdqa64 %zmm10,1088(%rsp)
+ vmovdqa64 %zmm11,1152(%rsp)
+ vmovdqa64 %zmm12,1216(%rsp)
+ testq %r14,%r14
+ jnz .L_skip_hkeys_precomputation_fxgusndxuFFGjih
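+/* Skipped when %r14 is non-zero (powers already extended): extend the table of
+   hash-key powers on the stack. Each stanza forms four vpclmulqdq partial
+   products against the broadcast power, folds the middle terms, and reduces the
+   result modulo the GHASH polynomial via the POLY2 constant before storing. */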
+ vmovdqu64 640(%rsp),%zmm3
+
+
+ vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
+
+ vmovdqu64 576(%rsp),%zmm4
+ vmovdqu64 512(%rsp),%zmm5
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,448(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,384(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,320(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,256(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,192(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,128(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,64(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,0(%rsp)
+.L_skip_hkeys_precomputation_fxgusndxuFFGjih:
+ movq $1,%r14
+ addq $512,%r11
+ subq $512,%r8
+
+ cmpq $768,%r8
+ jb .L_no_more_big_nblocks_pdDdEbGtmhbgzzj
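+/* Main loop: each iteration encrypts 48 blocks (three 16-block stanzas),
+   interleaving the AES rounds with GHASH folds of 48 previously cached
+   ciphertext blocks; the accumulated hash is reduced with POLY2 in the third
+   stanza and carried forward in %zmm14. */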
+.L_encrypt_big_nblocks_pdDdEbGtmhbgzzj:
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_ibqhltvwwkyjEta
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_ibqhltvwwkyjEta
+.L_16_blocks_overflow_ibqhltvwwkyjEta:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_ibqhltvwwkyjEta:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm15,%zmm10,%zmm26
+ vpxorq %zmm12,%zmm6,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1)
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqa64 %zmm0,1280(%rsp)
+ vmovdqa64 %zmm3,1344(%rsp)
+ vmovdqa64 %zmm4,1408(%rsp)
+ vmovdqa64 %zmm5,1472(%rsp)
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_cEaavogFAbujiEy
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_cEaavogFAbujiEy
+.L_16_blocks_overflow_cEaavogFAbujiEy:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_cEaavogFAbujiEy:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 256(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 320(%rsp),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 384(%rsp),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 448(%rsp),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 256(%rcx,%r11,1),%zmm17
+ vmovdqu8 320(%rcx,%r11,1),%zmm19
+ vmovdqu8 384(%rcx,%r11,1),%zmm20
+ vmovdqu8 448(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vpternlogq $0x96,%zmm12,%zmm6,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,256(%r10,%r11,1)
+ vmovdqu8 %zmm3,320(%r10,%r11,1)
+ vmovdqu8 %zmm4,384(%r10,%r11,1)
+ vmovdqu8 %zmm5,448(%r10,%r11,1)
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqa64 %zmm0,768(%rsp)
+ vmovdqa64 %zmm3,832(%rsp)
+ vmovdqa64 %zmm4,896(%rsp)
+ vmovdqa64 %zmm5,960(%rsp)
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_usjsvymwkviypdp
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_usjsvymwkviypdp
+.L_16_blocks_overflow_usjsvymwkviypdp:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_usjsvymwkviypdp:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 512(%rcx,%r11,1),%zmm17
+ vmovdqu8 576(%rcx,%r11,1),%zmm19
+ vmovdqu8 640(%rcx,%r11,1),%zmm20
+ vmovdqu8 704(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+
+
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpternlogq $0x96,%zmm15,%zmm12,%zmm6
+ vpxorq %zmm24,%zmm6,%zmm6
+ vpternlogq $0x96,%zmm10,%zmm13,%zmm7
+ vpxorq %zmm25,%zmm7,%zmm7
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vextracti64x4 $1,%zmm6,%ymm12
+ vpxorq %ymm12,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm12
+ vpxorq %xmm12,%xmm6,%xmm6
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm6
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,512(%r10,%r11,1)
+ vmovdqu8 %zmm3,576(%r10,%r11,1)
+ vmovdqu8 %zmm4,640(%r10,%r11,1)
+ vmovdqu8 %zmm5,704(%r10,%r11,1)
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqa64 %zmm0,1024(%rsp)
+ vmovdqa64 %zmm3,1088(%rsp)
+ vmovdqa64 %zmm4,1152(%rsp)
+ vmovdqa64 %zmm5,1216(%rsp)
+ vmovdqa64 %zmm6,%zmm14
+
+ addq $768,%r11
+ subq $768,%r8
+ cmpq $768,%r8
+ jae .L_encrypt_big_nblocks_pdDdEbGtmhbgzzj
+
+.L_no_more_big_nblocks_pdDdEbGtmhbgzzj:
+
+ cmpq $512,%r8
+ jae .L_encrypt_32_blocks_pdDdEbGtmhbgzzj
+
+ cmpq $256,%r8
+ jae .L_encrypt_16_blocks_pdDdEbGtmhbgzzj
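+/* Fewer than 16 full blocks are left to encrypt: fold the ciphertext blocks
+   still cached on the stack into GHASH, then work out how many (possibly
+   partial) blocks remain and dispatch on that count. */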
+.L_encrypt_0_blocks_ghash_32_pdDdEbGtmhbgzzj:
+ movl %r8d,%r10d
+ andl $~15,%r10d
+ movl $256,%ebx
+ subl %r10d,%ebx
+ vmovdqa64 768(%rsp),%zmm13
+ vpxorq %zmm14,%zmm13,%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 832(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpxorq %zmm10,%zmm4,%zmm26
+ vpxorq %zmm6,%zmm0,%zmm24
+ vpxorq %zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 896(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 960(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ addl $256,%ebx
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_ikhdrkemcGbqzad
+
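+/* Dispatch on the exact number of remaining blocks (1..16) via a short
+   comparison tree. The multi-block handlers trim %r8 to the byte count of
+   their final block (subq $16*(N-1)) and treat that block as possibly partial. */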
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_ikhdrkemcGbqzad
+ jb .L_last_num_blocks_is_7_1_ikhdrkemcGbqzad
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_ikhdrkemcGbqzad
+ jb .L_last_num_blocks_is_11_9_ikhdrkemcGbqzad
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_ikhdrkemcGbqzad
+ ja .L_last_num_blocks_is_16_ikhdrkemcGbqzad
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_ikhdrkemcGbqzad
+ jmp .L_last_num_blocks_is_13_ikhdrkemcGbqzad
+
+.L_last_num_blocks_is_11_9_ikhdrkemcGbqzad:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_ikhdrkemcGbqzad
+ ja .L_last_num_blocks_is_11_ikhdrkemcGbqzad
+ jmp .L_last_num_blocks_is_9_ikhdrkemcGbqzad
+
+.L_last_num_blocks_is_7_1_ikhdrkemcGbqzad:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_ikhdrkemcGbqzad
+ jb .L_last_num_blocks_is_3_1_ikhdrkemcGbqzad
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_ikhdrkemcGbqzad
+ je .L_last_num_blocks_is_6_ikhdrkemcGbqzad
+ jmp .L_last_num_blocks_is_5_ikhdrkemcGbqzad
+
+.L_last_num_blocks_is_3_1_ikhdrkemcGbqzad:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_ikhdrkemcGbqzad
+ je .L_last_num_blocks_is_2_ikhdrkemcGbqzad
+.L_last_num_blocks_is_1_ikhdrkemcGbqzad:
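+/* One (possibly partial) block remains: byte64_len_to_mask_table supplies a
+   byte mask for the load and store, a single-lane (xmm) AES-CTR pass encrypts
+   it, and the byte-reflected result is folded into the GHASH accumulators. The
+   partial-block path records the leftover length at (%rdx) and stashes the
+   block at 16(%rsi), evidently so a later call can complete it. */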
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_itDorffzaCkryqj
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_itDorffzaCkryqj
+
+.L_16_blocks_overflow_itDorffzaCkryqj:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_itDorffzaCkryqj:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %xmm29,%xmm0,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_wcppwgxpbwxBCxm
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_wcppwgxpbwxBCxm
+.L_small_initial_partial_block_wcppwgxpbwxBCxm:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm0
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
+ vpslldq $8,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm3
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
+ vpsrldq $4,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm4,%xmm14
+
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_wcppwgxpbwxBCxm
+.L_small_initial_compute_done_wcppwgxpbwxBCxm:
+.L_after_reduction_wcppwgxpbwxBCxm:
+ jmp .L_last_blocks_done_ikhdrkemcGbqzad
+.L_last_num_blocks_is_2_ikhdrkemcGbqzad:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_udFwtdnCnceudlw
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_udFwtdnCnceudlw
+
+.L_16_blocks_overflow_udFwtdnCnceudlw:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_udFwtdnCnceudlw:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %ymm29,%ymm0,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_pBaBAiGArbidqBv
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_pBaBAiGArbidqBv
+.L_small_initial_partial_block_pBaBAiGArbidqBv:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_pBaBAiGArbidqBv:
+
+ orq %r8,%r8
+ je .L_after_reduction_pBaBAiGArbidqBv
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_pBaBAiGArbidqBv:
+ jmp .L_last_blocks_done_ikhdrkemcGbqzad
+.L_last_num_blocks_is_3_ikhdrkemcGbqzad:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_mnDuevixjjefvof
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_mnDuevixjjefvof
+
+.L_16_blocks_overflow_mnDuevixjjefvof:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_mnDuevixjjefvof:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_yatvknGgscybvGg
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_yatvknGgscybvGg
+.L_small_initial_partial_block_yatvknGgscybvGg:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_yatvknGgscybvGg:
+
+ orq %r8,%r8
+ je .L_after_reduction_yatvknGgscybvGg
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_yatvknGgscybvGg:
+ jmp .L_last_blocks_done_ikhdrkemcGbqzad
+.L_last_num_blocks_is_4_ikhdrkemcGbqzad:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_vsajDEszBaAzgFt
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_vsajDEszBaAzgFt
+
+.L_16_blocks_overflow_vsajDEszBaAzgFt:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_vsajDEszBaAzgFt:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_tchAiplfgmzAeEo
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_tchAiplfgmzAeEo
+.L_small_initial_partial_block_tchAiplfgmzAeEo:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_tchAiplfgmzAeEo:
+
+ orq %r8,%r8
+ je .L_after_reduction_tchAiplfgmzAeEo
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_tchAiplfgmzAeEo:
+ jmp .L_last_blocks_done_ikhdrkemcGbqzad
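+/* The handlers that follow (five, six, seven, ... blocks) add a masked tail
+   vector whose width grows from xmm to ymm to zmm as the block count rises,
+   alongside the full zmm counter vector used above. */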
+.L_last_num_blocks_is_5_ikhdrkemcGbqzad:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_cxtFqdnzBjmtkGn
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_cxtFqdnzBjmtkGn
+
+.L_16_blocks_overflow_cxtFqdnzBjmtkGn:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_cxtFqdnzBjmtkGn:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %xmm29,%xmm3,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_EdeEenqDBtzbplp
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_EdeEenqDBtzbplp
+.L_small_initial_partial_block_EdeEenqDBtzbplp:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_EdeEenqDBtzbplp:
+
+ orq %r8,%r8
+ je .L_after_reduction_EdeEenqDBtzbplp
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_EdeEenqDBtzbplp:
+ jmp .L_last_blocks_done_ikhdrkemcGbqzad
+.L_last_num_blocks_is_6_ikhdrkemcGbqzad:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_jwkFAEiBkzxclcz
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_jwkFAEiBkzxclcz
+
+.L_16_blocks_overflow_jwkFAEiBkzxclcz:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_jwkFAEiBkzxclcz:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %ymm29,%ymm3,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_lBhDyvvhkrxyrza
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_lBhDyvvhkrxyrza
+.L_small_initial_partial_block_lBhDyvvhkrxyrza:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_lBhDyvvhkrxyrza:
+
+ orq %r8,%r8
+ je .L_after_reduction_lBhDyvvhkrxyrza
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_lBhDyvvhkrxyrza:
+ jmp .L_last_blocks_done_ikhdrkemcGbqzad
+.L_last_num_blocks_is_7_ikhdrkemcGbqzad:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_uGexndlCfdoqjpe
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_uGexndlCfdoqjpe
+
+.L_16_blocks_overflow_uGexndlCfdoqjpe:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_uGexndlCfdoqjpe:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_Bxunmhnvmncxhcy
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_Bxunmhnvmncxhcy
+.L_small_initial_partial_block_Bxunmhnvmncxhcy:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_Bxunmhnvmncxhcy:
+
+ orq %r8,%r8
+ je .L_after_reduction_Bxunmhnvmncxhcy
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_Bxunmhnvmncxhcy:
+ jmp .L_last_blocks_done_ikhdrkemcGbqzad
+.L_last_num_blocks_is_8_ikhdrkemcGbqzad:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_vudwsyfxfgECgcf
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_vudwsyfxfgECgcf
+
+.L_16_blocks_overflow_vudwsyfxfgECgcf:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_vudwsyfxfgECgcf:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_rvqyhsdrhoanuka
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_rvqyhsdrhoanuka
+.L_small_initial_partial_block_rvqyhsdrhoanuka:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_rvqyhsdrhoanuka:
+
+ orq %r8,%r8
+ je .L_after_reduction_rvqyhsdrhoanuka
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_rvqyhsdrhoanuka:
+ jmp .L_last_blocks_done_ikhdrkemcGbqzad
+.L_last_num_blocks_is_9_ikhdrkemcGbqzad:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_mrBoGdbnxnwlkxC
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_mrBoGdbnxnwlkxC
+
+.L_16_blocks_overflow_mrBoGdbnxnwlkxC:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_mrBoGdbnxnwlkxC:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %xmm29,%xmm4,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_tuyribkvmwGnBux
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_tuyribkvmwGnBux
+.L_small_initial_partial_block_tuyribkvmwGnBux:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_tuyribkvmwGnBux:
+
+ orq %r8,%r8
+ je .L_after_reduction_tuyribkvmwGnBux
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_tuyribkvmwGnBux:
+ jmp .L_last_blocks_done_ikhdrkemcGbqzad
+.L_last_num_blocks_is_10_ikhdrkemcGbqzad:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_lgaFjCbzqlskvnC
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_lgaFjCbzqlskvnC
+
+.L_16_blocks_overflow_lgaFjCbzqlskvnC:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_lgaFjCbzqlskvnC:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %ymm29,%ymm4,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_doFvvyygahavAuD
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_doFvvyygahavAuD
+.L_small_initial_partial_block_doFvvyygahavAuD:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_doFvvyygahavAuD:
+
+ orq %r8,%r8
+ je .L_after_reduction_doFvvyygahavAuD
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_doFvvyygahavAuD:
+ jmp .L_last_blocks_done_ikhdrkemcGbqzad
+.L_last_num_blocks_is_11_ikhdrkemcGbqzad:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_wnveeoCoFhnAsjr
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_wnveeoCoFhnAsjr
+
+.L_16_blocks_overflow_wnveeoCoFhnAsjr:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_wnveeoCoFhnAsjr:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_okdqxckEysfDiGw
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_okdqxckEysfDiGw
+.L_small_initial_partial_block_okdqxckEysfDiGw:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_okdqxckEysfDiGw:
+
+ orq %r8,%r8
+ je .L_after_reduction_okdqxckEysfDiGw
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_okdqxckEysfDiGw:
+ jmp .L_last_blocks_done_ikhdrkemcGbqzad
+.L_last_num_blocks_is_12_ikhdrkemcGbqzad:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_aeCekhphkkfCGlp
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_aeCekhphkkfCGlp
+
+.L_16_blocks_overflow_aeCekhphkkfCGlp:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_aeCekhphkkfCGlp:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_tAjudiknsDunngB
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_tAjudiknsDunngB
+.L_small_initial_partial_block_tAjudiknsDunngB:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_tAjudiknsDunngB:
+
+ orq %r8,%r8
+ je .L_after_reduction_tAjudiknsDunngB
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_tAjudiknsDunngB:
+ jmp .L_last_blocks_done_ikhdrkemcGbqzad
+.L_last_num_blocks_is_13_ikhdrkemcGbqzad:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_vFhoejiyDCGCfdw
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_vFhoejiyDCGCfdw
+
+.L_16_blocks_overflow_vFhoejiyDCGCfdw:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_vFhoejiyDCGCfdw:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %xmm29,%xmm5,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_svrobwfwdbaDnCx
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_svrobwfwdbaDnCx
+.L_small_initial_partial_block_svrobwfwdbaDnCx:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_svrobwfwdbaDnCx:
+
+ orq %r8,%r8
+ je .L_after_reduction_svrobwfwdbaDnCx
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_svrobwfwdbaDnCx:
+ jmp .L_last_blocks_done_ikhdrkemcGbqzad
+.L_last_num_blocks_is_14_ikhdrkemcGbqzad:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_hgwwfomjsnxunhr
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_hgwwfomjsnxunhr
+
+.L_16_blocks_overflow_hgwwfomjsnxunhr:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_hgwwfomjsnxunhr:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %ymm29,%ymm5,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_seAkuxixhdBEdfz
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_seAkuxixhdBEdfz
+.L_small_initial_partial_block_seAkuxixhdBEdfz:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_seAkuxixhdBEdfz:
+
+ orq %r8,%r8
+ je .L_after_reduction_seAkuxixhdBEdfz
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_seAkuxixhdBEdfz:
+ jmp .L_last_blocks_done_ikhdrkemcGbqzad
+.L_last_num_blocks_is_15_ikhdrkemcGbqzad:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_wbagfdFdigxytjj
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_wbagfdFdigxytjj
+
+.L_16_blocks_overflow_wbagfdFdigxytjj:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_wbagfdFdigxytjj:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ChmDFBmjkjBuetv
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ChmDFBmjkjBuetv
+.L_small_initial_partial_block_ChmDFBmjkjBuetv:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ChmDFBmjkjBuetv:
+
+ orq %r8,%r8
+ je .L_after_reduction_ChmDFBmjkjBuetv
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ChmDFBmjkjBuetv:
+ jmp .L_last_blocks_done_ikhdrkemcGbqzad
+.L_last_num_blocks_is_16_ikhdrkemcGbqzad:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_dkuzxAGzynhzFCe
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_dkuzxAGzynhzFCe
+
+.L_16_blocks_overflow_dkuzxAGzynhzFCe:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_dkuzxAGzynhzFCe:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_vtbrvsizdbGzbGo:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_vtbrvsizdbGzbGo:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_vtbrvsizdbGzbGo:
+ jmp .L_last_blocks_done_ikhdrkemcGbqzad
+.L_last_num_blocks_is_0_ikhdrkemcGbqzad:
+ vmovdqa64 1024(%rsp),%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1088(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1152(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1216(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_ikhdrkemcGbqzad:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_pdDdEbGtmhbgzzj
+.L_encrypt_32_blocks_pdDdEbGtmhbgzzj:
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_DpBiAfvjdcateGm
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_DpBiAfvjdcateGm
+.L_16_blocks_overflow_DpBiAfvjdcateGm:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_DpBiAfvjdcateGm:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm15,%zmm10,%zmm26
+ vpxorq %zmm12,%zmm6,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1)
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqa64 %zmm0,1280(%rsp)
+ vmovdqa64 %zmm3,1344(%rsp)
+ vmovdqa64 %zmm4,1408(%rsp)
+ vmovdqa64 %zmm5,1472(%rsp)
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_pnochsioawayaBr
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_pnochsioawayaBr
+.L_16_blocks_overflow_pnochsioawayaBr:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_pnochsioawayaBr:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 256(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 320(%rsp),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 384(%rsp),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 448(%rsp),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 256(%rcx,%r11,1),%zmm17
+ vmovdqu8 320(%rcx,%r11,1),%zmm19
+ vmovdqu8 384(%rcx,%r11,1),%zmm20
+ vmovdqu8 448(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vpternlogq $0x96,%zmm12,%zmm6,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,256(%r10,%r11,1)
+ vmovdqu8 %zmm3,320(%r10,%r11,1)
+ vmovdqu8 %zmm4,384(%r10,%r11,1)
+ vmovdqu8 %zmm5,448(%r10,%r11,1)
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqa64 %zmm0,768(%rsp)
+ vmovdqa64 %zmm3,832(%rsp)
+ vmovdqa64 %zmm4,896(%rsp)
+ vmovdqa64 %zmm5,960(%rsp)
+ vmovdqa64 1280(%rsp),%zmm13
+ vmovdqu64 512(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1344(%rsp),%zmm13
+ vmovdqu64 576(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1408(%rsp),%zmm13
+ vmovdqu64 640(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1472(%rsp),%zmm13
+ vmovdqu64 704(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+ subq $512,%r8
+ addq $512,%r11
+ movl %r8d,%r10d
+ andl $~15,%r10d
+ movl $512,%ebx
+ subl %r10d,%ebx
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_nqBvobwmcxocojb
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_nqBvobwmcxocojb
+ jb .L_last_num_blocks_is_7_1_nqBvobwmcxocojb
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_nqBvobwmcxocojb
+ jb .L_last_num_blocks_is_11_9_nqBvobwmcxocojb
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_nqBvobwmcxocojb
+ ja .L_last_num_blocks_is_16_nqBvobwmcxocojb
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_nqBvobwmcxocojb
+ jmp .L_last_num_blocks_is_13_nqBvobwmcxocojb
+
+.L_last_num_blocks_is_11_9_nqBvobwmcxocojb:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_nqBvobwmcxocojb
+ ja .L_last_num_blocks_is_11_nqBvobwmcxocojb
+ jmp .L_last_num_blocks_is_9_nqBvobwmcxocojb
+
+.L_last_num_blocks_is_7_1_nqBvobwmcxocojb:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_nqBvobwmcxocojb
+ jb .L_last_num_blocks_is_3_1_nqBvobwmcxocojb
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_nqBvobwmcxocojb
+ je .L_last_num_blocks_is_6_nqBvobwmcxocojb
+ jmp .L_last_num_blocks_is_5_nqBvobwmcxocojb
+
+.L_last_num_blocks_is_3_1_nqBvobwmcxocojb:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_nqBvobwmcxocojb
+ je .L_last_num_blocks_is_2_nqBvobwmcxocojb
+.L_last_num_blocks_is_1_nqBvobwmcxocojb:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_iGlCGEwegGzFhtA
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_iGlCGEwegGzFhtA
+
+.L_16_blocks_overflow_iGlCGEwegGzFhtA:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_iGlCGEwegGzFhtA:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %xmm29,%xmm0,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_hFBzlBjpABAteEq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_hFBzlBjpABAteEq
+.L_small_initial_partial_block_hFBzlBjpABAteEq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm0
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
+ vpslldq $8,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm3
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
+ vpsrldq $4,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm4,%xmm14
+
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_hFBzlBjpABAteEq
+.L_small_initial_compute_done_hFBzlBjpABAteEq:
+.L_after_reduction_hFBzlBjpABAteEq:
+ jmp .L_last_blocks_done_nqBvobwmcxocojb
+.L_last_num_blocks_is_2_nqBvobwmcxocojb:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_BwDxojfsymCmEeo
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_BwDxojfsymCmEeo
+
+.L_16_blocks_overflow_BwDxojfsymCmEeo:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_BwDxojfsymCmEeo:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %ymm29,%ymm0,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ujnyckFGoBmGvAD
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ujnyckFGoBmGvAD
+.L_small_initial_partial_block_ujnyckFGoBmGvAD:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ujnyckFGoBmGvAD:
+
+ orq %r8,%r8
+ je .L_after_reduction_ujnyckFGoBmGvAD
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ujnyckFGoBmGvAD:
+ jmp .L_last_blocks_done_nqBvobwmcxocojb
+.L_last_num_blocks_is_3_nqBvobwmcxocojb:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_ArGalqGfmEgtzdC
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_ArGalqGfmEgtzdC
+
+.L_16_blocks_overflow_ArGalqGfmEgtzdC:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_ArGalqGfmEgtzdC:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_tlDwADlnmmFjwlt
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_tlDwADlnmmFjwlt
+.L_small_initial_partial_block_tlDwADlnmmFjwlt:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_tlDwADlnmmFjwlt:
+
+ orq %r8,%r8
+ je .L_after_reduction_tlDwADlnmmFjwlt
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_tlDwADlnmmFjwlt:
+ jmp .L_last_blocks_done_nqBvobwmcxocojb
+.L_last_num_blocks_is_4_nqBvobwmcxocojb:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_eiFwyntDmEqyCDx
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_eiFwyntDmEqyCDx
+
+.L_16_blocks_overflow_eiFwyntDmEqyCDx:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_eiFwyntDmEqyCDx:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_zAosBwqfDyjcdyb
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_zAosBwqfDyjcdyb
+.L_small_initial_partial_block_zAosBwqfDyjcdyb:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_zAosBwqfDyjcdyb:
+
+ orq %r8,%r8
+ je .L_after_reduction_zAosBwqfDyjcdyb
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_zAosBwqfDyjcdyb:
+ jmp .L_last_blocks_done_nqBvobwmcxocojb
+.L_last_num_blocks_is_5_nqBvobwmcxocojb:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_bAoFucDcpblzDdt
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_bAoFucDcpblzDdt
+
+.L_16_blocks_overflow_bAoFucDcpblzDdt:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_bAoFucDcpblzDdt:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %xmm29,%xmm3,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_icuaypakFrCovoy
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_icuaypakFrCovoy
+.L_small_initial_partial_block_icuaypakFrCovoy:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_icuaypakFrCovoy:
+
+ orq %r8,%r8
+ je .L_after_reduction_icuaypakFrCovoy
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_icuaypakFrCovoy:
+ jmp .L_last_blocks_done_nqBvobwmcxocojb
+.L_last_num_blocks_is_6_nqBvobwmcxocojb:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_nBxnDvEEtcfmmpA
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_nBxnDvEEtcfmmpA
+
+.L_16_blocks_overflow_nBxnDvEEtcfmmpA:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_nBxnDvEEtcfmmpA:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %ymm29,%ymm3,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_oBDgqvmqflGBdts
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_oBDgqvmqflGBdts
+.L_small_initial_partial_block_oBDgqvmqflGBdts:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_oBDgqvmqflGBdts:
+
+ orq %r8,%r8
+ je .L_after_reduction_oBDgqvmqflGBdts
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_oBDgqvmqflGBdts:
+ jmp .L_last_blocks_done_nqBvobwmcxocojb
+.L_last_num_blocks_is_7_nqBvobwmcxocojb:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_ktiEwgDjzbqnlgA
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_ktiEwgDjzbqnlgA
+
+.L_16_blocks_overflow_ktiEwgDjzbqnlgA:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_ktiEwgDjzbqnlgA:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_rhqzwAqatoAowvt
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_rhqzwAqatoAowvt
+.L_small_initial_partial_block_rhqzwAqatoAowvt:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_rhqzwAqatoAowvt:
+
+ orq %r8,%r8
+ je .L_after_reduction_rhqzwAqatoAowvt
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_rhqzwAqatoAowvt:
+ jmp .L_last_blocks_done_nqBvobwmcxocojb
+.L_last_num_blocks_is_8_nqBvobwmcxocojb:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_ppdpbjvaqFskcDy
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_ppdpbjvaqFskcDy
+
+.L_16_blocks_overflow_ppdpbjvaqFskcDy:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_ppdpbjvaqFskcDy:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_hghryxmwctxcEsx
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_hghryxmwctxcEsx
+.L_small_initial_partial_block_hghryxmwctxcEsx:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_hghryxmwctxcEsx:
+
+ orq %r8,%r8
+ je .L_after_reduction_hghryxmwctxcEsx
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_hghryxmwctxcEsx:
+ jmp .L_last_blocks_done_nqBvobwmcxocojb
+.L_last_num_blocks_is_9_nqBvobwmcxocojb:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_ssqyutccxCiqEfp
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_ssqyutccxCiqEfp
+
+.L_16_blocks_overflow_ssqyutccxCiqEfp:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_ssqyutccxCiqEfp:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %xmm29,%xmm4,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_dkgcmoCccqwinCj
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_dkgcmoCccqwinCj
+.L_small_initial_partial_block_dkgcmoCccqwinCj:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_dkgcmoCccqwinCj:
+
+ orq %r8,%r8
+ je .L_after_reduction_dkgcmoCccqwinCj
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_dkgcmoCccqwinCj:
+ jmp .L_last_blocks_done_nqBvobwmcxocojb
+.L_last_num_blocks_is_10_nqBvobwmcxocojb:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_qrrfwGAzztwabql
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_qrrfwGAzztwabql
+
+.L_16_blocks_overflow_qrrfwGAzztwabql:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_qrrfwGAzztwabql:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %ymm29,%ymm4,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ioCDffAzuDvuFmD
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ioCDffAzuDvuFmD
+.L_small_initial_partial_block_ioCDffAzuDvuFmD:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ioCDffAzuDvuFmD:
+
+ orq %r8,%r8
+ je .L_after_reduction_ioCDffAzuDvuFmD
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ioCDffAzuDvuFmD:
+ jmp .L_last_blocks_done_nqBvobwmcxocojb
+.L_last_num_blocks_is_11_nqBvobwmcxocojb:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_BFnbwbbsiwGDDCn
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_BFnbwbbsiwGDDCn
+
+.L_16_blocks_overflow_BFnbwbbsiwGDDCn:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_BFnbwbbsiwGDDCn:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_cCoGeiFGozAwFew
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_cCoGeiFGozAwFew
+.L_small_initial_partial_block_cCoGeiFGozAwFew:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_cCoGeiFGozAwFew:
+
+ orq %r8,%r8
+ je .L_after_reduction_cCoGeiFGozAwFew
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_cCoGeiFGozAwFew:
+ jmp .L_last_blocks_done_nqBvobwmcxocojb
+.L_last_num_blocks_is_12_nqBvobwmcxocojb:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_haBiqFbjgxpdzpn
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_haBiqFbjgxpdzpn
+
+.L_16_blocks_overflow_haBiqFbjgxpdzpn:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_haBiqFbjgxpdzpn:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_nhbrtEjyiFhswCq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_nhbrtEjyiFhswCq
+.L_small_initial_partial_block_nhbrtEjyiFhswCq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_nhbrtEjyiFhswCq:
+
+ orq %r8,%r8
+ je .L_after_reduction_nhbrtEjyiFhswCq
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_nhbrtEjyiFhswCq:
+ jmp .L_last_blocks_done_nqBvobwmcxocojb
+.L_last_num_blocks_is_13_nqBvobwmcxocojb:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_aDaGBFBAaojGGGj
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_aDaGBFBAaojGGGj
+
+.L_16_blocks_overflow_aDaGBFBAaojGGGj:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_aDaGBFBAaojGGGj:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %xmm29,%xmm5,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_mozkzBtivrcvtEk
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_mozkzBtivrcvtEk
+.L_small_initial_partial_block_mozkzBtivrcvtEk:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_mozkzBtivrcvtEk:
+
+ orq %r8,%r8
+ je .L_after_reduction_mozkzBtivrcvtEk
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_mozkzBtivrcvtEk:
+ jmp .L_last_blocks_done_nqBvobwmcxocojb
+.L_last_num_blocks_is_14_nqBvobwmcxocojb:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_tAnEojledvrxyjr
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_tAnEojledvrxyjr
+
+.L_16_blocks_overflow_tAnEojledvrxyjr:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_tAnEojledvrxyjr:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %ymm29,%ymm5,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_FdkjoDukspwasBA
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_FdkjoDukspwasBA
+.L_small_initial_partial_block_FdkjoDukspwasBA:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_FdkjoDukspwasBA:
+
+ orq %r8,%r8
+ je .L_after_reduction_FdkjoDukspwasBA
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_FdkjoDukspwasBA:
+ jmp .L_last_blocks_done_nqBvobwmcxocojb
+.L_last_num_blocks_is_15_nqBvobwmcxocojb:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_EocAcwAEiGzmbor
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_EocAcwAEiGzmbor
+
+.L_16_blocks_overflow_EocAcwAEiGzmbor:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_EocAcwAEiGzmbor:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ioeijxfuGydnlim
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ioeijxfuGydnlim
+.L_small_initial_partial_block_ioeijxfuGydnlim:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ioeijxfuGydnlim:
+
+ orq %r8,%r8
+ je .L_after_reduction_ioeijxfuGydnlim
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ioeijxfuGydnlim:
+ jmp .L_last_blocks_done_nqBvobwmcxocojb
+.L_last_num_blocks_is_16_nqBvobwmcxocojb:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_uDqoqnyAqaujFth
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_uDqoqnyAqaujFth
+
+.L_16_blocks_overflow_uDqoqnyAqaujFth:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_uDqoqnyAqaujFth:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_rpjttlmmCtxqtrD:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_rpjttlmmCtxqtrD:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_rpjttlmmCtxqtrD:
+ jmp .L_last_blocks_done_nqBvobwmcxocojb
+.L_last_num_blocks_is_0_nqBvobwmcxocojb:
+ vmovdqa64 768(%rsp),%zmm13
+ vpxorq %zmm14,%zmm13,%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 832(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpxorq %zmm10,%zmm4,%zmm26
+ vpxorq %zmm6,%zmm0,%zmm24
+ vpxorq %zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 896(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 960(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_nqBvobwmcxocojb:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_pdDdEbGtmhbgzzj
+.L_encrypt_16_blocks_pdDdEbGtmhbgzzj:
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_mlfnqsfcdbpAAfz
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_mlfnqsfcdbpAAfz
+.L_16_blocks_overflow_mlfnqsfcdbpAAfz:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_mlfnqsfcdbpAAfz:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm15,%zmm10,%zmm26
+ vpxorq %zmm12,%zmm6,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1)
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqa64 %zmm0,1280(%rsp)
+ vmovdqa64 %zmm3,1344(%rsp)
+ vmovdqa64 %zmm4,1408(%rsp)
+ vmovdqa64 %zmm5,1472(%rsp)
+ vmovdqa64 1024(%rsp),%zmm13
+ vmovdqu64 256(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1088(%rsp),%zmm13
+ vmovdqu64 320(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1152(%rsp),%zmm13
+ vmovdqu64 384(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1216(%rsp),%zmm13
+ vmovdqu64 448(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ subq $256,%r8
+ addq $256,%r11
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_hommwsmBDghhsCD
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_hommwsmBDghhsCD
+ jb .L_last_num_blocks_is_7_1_hommwsmBDghhsCD
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_hommwsmBDghhsCD
+ jb .L_last_num_blocks_is_11_9_hommwsmBDghhsCD
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_hommwsmBDghhsCD
+ ja .L_last_num_blocks_is_16_hommwsmBDghhsCD
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_hommwsmBDghhsCD
+ jmp .L_last_num_blocks_is_13_hommwsmBDghhsCD
+
+.L_last_num_blocks_is_11_9_hommwsmBDghhsCD:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_hommwsmBDghhsCD
+ ja .L_last_num_blocks_is_11_hommwsmBDghhsCD
+ jmp .L_last_num_blocks_is_9_hommwsmBDghhsCD
+
+.L_last_num_blocks_is_7_1_hommwsmBDghhsCD:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_hommwsmBDghhsCD
+ jb .L_last_num_blocks_is_3_1_hommwsmBDghhsCD
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_hommwsmBDghhsCD
+ je .L_last_num_blocks_is_6_hommwsmBDghhsCD
+ jmp .L_last_num_blocks_is_5_hommwsmBDghhsCD
+
+.L_last_num_blocks_is_3_1_hommwsmBDghhsCD:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_hommwsmBDghhsCD
+ je .L_last_num_blocks_is_2_hommwsmBDghhsCD
+.L_last_num_blocks_is_1_hommwsmBDghhsCD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_mgEtuxommfhprEy
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_mgEtuxommfhprEy
+
+.L_16_blocks_overflow_mgEtuxommfhprEy:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_mgEtuxommfhprEy:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %xmm31,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %xmm29,%xmm0,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_hmAEtdvbxtuofqt
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_hmAEtdvbxtuofqt
+.L_small_initial_partial_block_hmAEtdvbxtuofqt:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_hmAEtdvbxtuofqt
+.L_small_initial_compute_done_hmAEtdvbxtuofqt:
+.L_after_reduction_hmAEtdvbxtuofqt:
+ jmp .L_last_blocks_done_hommwsmBDghhsCD
+.L_last_num_blocks_is_2_hommwsmBDghhsCD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_eunligEgprqxzEB
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_eunligEgprqxzEB
+
+.L_16_blocks_overflow_eunligEgprqxzEB:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_eunligEgprqxzEB:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %ymm31,%ymm0,%ymm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %ymm29,%ymm0,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_CpCtmyiCpxeyqBF
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_CpCtmyiCpxeyqBF
+.L_small_initial_partial_block_CpCtmyiCpxeyqBF:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_CpCtmyiCpxeyqBF:
+
+ orq %r8,%r8
+ je .L_after_reduction_CpCtmyiCpxeyqBF
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_CpCtmyiCpxeyqBF:
+ jmp .L_last_blocks_done_hommwsmBDghhsCD
+.L_last_num_blocks_is_3_hommwsmBDghhsCD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_tCygkraciCitCxE
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_tCygkraciCitCxE
+
+.L_16_blocks_overflow_tCygkraciCitCxE:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_tCygkraciCitCxE:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_oscyleCtgoefssq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_oscyleCtgoefssq
+.L_small_initial_partial_block_oscyleCtgoefssq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_oscyleCtgoefssq:
+
+ orq %r8,%r8
+ je .L_after_reduction_oscyleCtgoefssq
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_oscyleCtgoefssq:
+ jmp .L_last_blocks_done_hommwsmBDghhsCD
+.L_last_num_blocks_is_4_hommwsmBDghhsCD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_nkuGqpqvsuAfkpy
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_nkuGqpqvsuAfkpy
+
+.L_16_blocks_overflow_nkuGqpqvsuAfkpy:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_nkuGqpqvsuAfkpy:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_bszjeCzlpihayrq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_bszjeCzlpihayrq
+.L_small_initial_partial_block_bszjeCzlpihayrq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_bszjeCzlpihayrq:
+
+ orq %r8,%r8
+ je .L_after_reduction_bszjeCzlpihayrq
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_bszjeCzlpihayrq:
+ jmp .L_last_blocks_done_hommwsmBDghhsCD
+.L_last_num_blocks_is_5_hommwsmBDghhsCD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_yBohCFkvcahhcEE
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_yBohCFkvcahhcEE
+
+.L_16_blocks_overflow_yBohCFkvcahhcEE:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_yBohCFkvcahhcEE:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %xmm29,%xmm3,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_peyrCumyCvjyexD
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_peyrCumyCvjyexD
+.L_small_initial_partial_block_peyrCumyCvjyexD:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_peyrCumyCvjyexD:
+
+ orq %r8,%r8
+ je .L_after_reduction_peyrCumyCvjyexD
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_peyrCumyCvjyexD:
+ jmp .L_last_blocks_done_hommwsmBDghhsCD
+.L_last_num_blocks_is_6_hommwsmBDghhsCD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_efCkGsdFqsctEDl
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_efCkGsdFqsctEDl
+
+.L_16_blocks_overflow_efCkGsdFqsctEDl:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_efCkGsdFqsctEDl:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %ymm29,%ymm3,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_nolBDipDBhtrDmb
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_nolBDipDBhtrDmb
+.L_small_initial_partial_block_nolBDipDBhtrDmb:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_nolBDipDBhtrDmb:
+
+ orq %r8,%r8
+ je .L_after_reduction_nolBDipDBhtrDmb
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_nolBDipDBhtrDmb:
+ jmp .L_last_blocks_done_hommwsmBDghhsCD
+.L_last_num_blocks_is_7_hommwsmBDghhsCD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_uGpnccromgjsdor
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_uGpnccromgjsdor
+
+.L_16_blocks_overflow_uGpnccromgjsdor:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_uGpnccromgjsdor:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_wFFpDbecxxomBhl
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_wFFpDbecxxomBhl
+.L_small_initial_partial_block_wFFpDbecxxomBhl:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_wFFpDbecxxomBhl:
+
+ orq %r8,%r8
+ je .L_after_reduction_wFFpDbecxxomBhl
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_wFFpDbecxxomBhl:
+ jmp .L_last_blocks_done_hommwsmBDghhsCD
+.L_last_num_blocks_is_8_hommwsmBDghhsCD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_rCxvxGCqotFabFi
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_rCxvxGCqotFabFi
+
+.L_16_blocks_overflow_rCxvxGCqotFabFi:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_rCxvxGCqotFabFi:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_GfamjmilndFvzhv
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_GfamjmilndFvzhv
+.L_small_initial_partial_block_GfamjmilndFvzhv:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_GfamjmilndFvzhv:
+
+ orq %r8,%r8
+ je .L_after_reduction_GfamjmilndFvzhv
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_GfamjmilndFvzhv:
+ jmp .L_last_blocks_done_hommwsmBDghhsCD
+.L_last_num_blocks_is_9_hommwsmBDghhsCD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_ycGahwjqkughsCy
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_ycGahwjqkughsCy
+
+.L_16_blocks_overflow_ycGahwjqkughsCy:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_ycGahwjqkughsCy:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %xmm29,%xmm4,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_oodBdsqrimpGlcx
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_oodBdsqrimpGlcx
+.L_small_initial_partial_block_oodBdsqrimpGlcx:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_oodBdsqrimpGlcx:
+
+ orq %r8,%r8
+ je .L_after_reduction_oodBdsqrimpGlcx
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_oodBdsqrimpGlcx:
+ jmp .L_last_blocks_done_hommwsmBDghhsCD
+.L_last_num_blocks_is_10_hommwsmBDghhsCD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_qvAdocAzEtlnyGa
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_qvAdocAzEtlnyGa
+
+.L_16_blocks_overflow_qvAdocAzEtlnyGa:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_qvAdocAzEtlnyGa:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %ymm29,%ymm4,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_sDpafzbwGCbyCCy
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_sDpafzbwGCbyCCy
+.L_small_initial_partial_block_sDpafzbwGCbyCCy:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_sDpafzbwGCbyCCy:
+
+ orq %r8,%r8
+ je .L_after_reduction_sDpafzbwGCbyCCy
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_sDpafzbwGCbyCCy:
+ jmp .L_last_blocks_done_hommwsmBDghhsCD
+.L_last_num_blocks_is_11_hommwsmBDghhsCD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_BGwcgjgblbFBkyn
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_BGwcgjgblbFBkyn
+
+.L_16_blocks_overflow_BGwcgjgblbFBkyn:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_BGwcgjgblbFBkyn:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_oDmcaDazcjvlCqo
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_oDmcaDazcjvlCqo
+.L_small_initial_partial_block_oDmcaDazcjvlCqo:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_oDmcaDazcjvlCqo:
+
+ orq %r8,%r8
+ je .L_after_reduction_oDmcaDazcjvlCqo
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_oDmcaDazcjvlCqo:
+ jmp .L_last_blocks_done_hommwsmBDghhsCD
+.L_last_num_blocks_is_12_hommwsmBDghhsCD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_ooGtexyxfikBFDA
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_ooGtexyxfikBFDA
+
+.L_16_blocks_overflow_ooGtexyxfikBFDA:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_ooGtexyxfikBFDA:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_hawFrugxuDsFkwh
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_hawFrugxuDsFkwh
+.L_small_initial_partial_block_hawFrugxuDsFkwh:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_hawFrugxuDsFkwh:
+
+ orq %r8,%r8
+ je .L_after_reduction_hawFrugxuDsFkwh
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_hawFrugxuDsFkwh:
+ jmp .L_last_blocks_done_hommwsmBDghhsCD
+.L_last_num_blocks_is_13_hommwsmBDghhsCD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_ffjezAuFCnhGagx
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_ffjezAuFCnhGagx
+
+.L_16_blocks_overflow_ffjezAuFCnhGagx:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_ffjezAuFCnhGagx:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %xmm29,%xmm5,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_nszsngmcgAavfgo
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_nszsngmcgAavfgo
+.L_small_initial_partial_block_nszsngmcgAavfgo:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_nszsngmcgAavfgo:
+
+ orq %r8,%r8
+ je .L_after_reduction_nszsngmcgAavfgo
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_nszsngmcgAavfgo:
+ jmp .L_last_blocks_done_hommwsmBDghhsCD
+.L_last_num_blocks_is_14_hommwsmBDghhsCD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_CfdCFDnjwhDDuze
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_CfdCFDnjwhDDuze
+
+.L_16_blocks_overflow_CfdCFDnjwhDDuze:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_CfdCFDnjwhDDuze:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %ymm29,%ymm5,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_nnhzacbBeBgBwss
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_nnhzacbBeBgBwss
+.L_small_initial_partial_block_nnhzacbBeBgBwss:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_nnhzacbBeBgBwss:
+
+ orq %r8,%r8
+ je .L_after_reduction_nnhzacbBeBgBwss
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_nnhzacbBeBgBwss:
+ jmp .L_last_blocks_done_hommwsmBDghhsCD
+.L_last_num_blocks_is_15_hommwsmBDghhsCD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_GAcGndzbDEvCwfz
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_GAcGndzbDEvCwfz
+
+.L_16_blocks_overflow_GAcGndzbDEvCwfz:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_GAcGndzbDEvCwfz:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_kpsoetidpdjlnwh
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_kpsoetidpdjlnwh
+.L_small_initial_partial_block_kpsoetidpdjlnwh:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_kpsoetidpdjlnwh:
+
+ orq %r8,%r8
+ je .L_after_reduction_kpsoetidpdjlnwh
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_kpsoetidpdjlnwh:
+ jmp .L_last_blocks_done_hommwsmBDghhsCD
+.L_last_num_blocks_is_16_hommwsmBDghhsCD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_wpowiymzckfpmlc
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_wpowiymzckfpmlc
+
+.L_16_blocks_overflow_wpowiymzckfpmlc:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_wpowiymzckfpmlc:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_xjewDEdrojAwizl:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_xjewDEdrojAwizl:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_xjewDEdrojAwizl:
+ jmp .L_last_blocks_done_hommwsmBDghhsCD
+.L_last_num_blocks_is_0_hommwsmBDghhsCD:
+ vmovdqa64 1280(%rsp),%zmm13
+ vmovdqu64 512(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1344(%rsp),%zmm13
+ vmovdqu64 576(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1408(%rsp),%zmm13
+ vmovdqu64 640(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1472(%rsp),%zmm13
+ vmovdqu64 704(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_hommwsmBDghhsCD:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_pdDdEbGtmhbgzzj
+
+.L_message_below_32_blocks_pdDdEbGtmhbgzzj:
+
+
+ subq $256,%r8
+ addq $256,%r11
+ movl %r8d,%r10d
+ testq %r14,%r14
+ jnz .L_skip_hkeys_precomputation_zxFmdGhwegjCAGr
+ vmovdqu64 640(%rsp),%zmm3
+
+
+ vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
+
+ vmovdqu64 576(%rsp),%zmm4
+ vmovdqu64 512(%rsp),%zmm5
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,448(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,384(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,320(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,256(%rsp)
+.L_skip_hkeys_precomputation_zxFmdGhwegjCAGr:
+ movq $1,%r14
+ andl $~15,%r10d
+ movl $512,%ebx
+ subl %r10d,%ebx
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_yEtjCjlkazyuxae
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_yEtjCjlkazyuxae
+ jb .L_last_num_blocks_is_7_1_yEtjCjlkazyuxae
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_yEtjCjlkazyuxae
+ jb .L_last_num_blocks_is_11_9_yEtjCjlkazyuxae
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_yEtjCjlkazyuxae
+ ja .L_last_num_blocks_is_16_yEtjCjlkazyuxae
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_yEtjCjlkazyuxae
+ jmp .L_last_num_blocks_is_13_yEtjCjlkazyuxae
+
+.L_last_num_blocks_is_11_9_yEtjCjlkazyuxae:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_yEtjCjlkazyuxae
+ ja .L_last_num_blocks_is_11_yEtjCjlkazyuxae
+ jmp .L_last_num_blocks_is_9_yEtjCjlkazyuxae
+
+.L_last_num_blocks_is_7_1_yEtjCjlkazyuxae:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_yEtjCjlkazyuxae
+ jb .L_last_num_blocks_is_3_1_yEtjCjlkazyuxae
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_yEtjCjlkazyuxae
+ je .L_last_num_blocks_is_6_yEtjCjlkazyuxae
+ jmp .L_last_num_blocks_is_5_yEtjCjlkazyuxae
+
+.L_last_num_blocks_is_3_1_yEtjCjlkazyuxae:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_yEtjCjlkazyuxae
+ je .L_last_num_blocks_is_2_yEtjCjlkazyuxae
+.L_last_num_blocks_is_1_yEtjCjlkazyuxae:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_GemCxiwxneizpok
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_GemCxiwxneizpok
+
+.L_16_blocks_overflow_GemCxiwxneizpok:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_GemCxiwxneizpok:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %xmm29,%xmm0,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_lDxtxBkDCvCDeAu
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_lDxtxBkDCvCDeAu
+.L_small_initial_partial_block_lDxtxBkDCvCDeAu:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm0
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
+ vpslldq $8,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm3
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
+ vpsrldq $4,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm4,%xmm14
+
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_lDxtxBkDCvCDeAu
+.L_small_initial_compute_done_lDxtxBkDCvCDeAu:
+.L_after_reduction_lDxtxBkDCvCDeAu:
+ jmp .L_last_blocks_done_yEtjCjlkazyuxae
+.L_last_num_blocks_is_2_yEtjCjlkazyuxae:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_mtbzanedDzblhBt
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_mtbzanedDzblhBt
+
+.L_16_blocks_overflow_mtbzanedDzblhBt:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_mtbzanedDzblhBt:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %ymm29,%ymm0,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_vDfEzdpCaoutqpk
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_vDfEzdpCaoutqpk
+.L_small_initial_partial_block_vDfEzdpCaoutqpk:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_vDfEzdpCaoutqpk:
+
+ orq %r8,%r8
+ je .L_after_reduction_vDfEzdpCaoutqpk
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_vDfEzdpCaoutqpk:
+ jmp .L_last_blocks_done_yEtjCjlkazyuxae
+.L_last_num_blocks_is_3_yEtjCjlkazyuxae:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_dEDrjDhcyydvacb
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_dEDrjDhcyydvacb
+
+.L_16_blocks_overflow_dEDrjDhcyydvacb:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_dEDrjDhcyydvacb:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ulcxboFccGvxqoA
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ulcxboFccGvxqoA
+.L_small_initial_partial_block_ulcxboFccGvxqoA:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ulcxboFccGvxqoA:
+
+ orq %r8,%r8
+ je .L_after_reduction_ulcxboFccGvxqoA
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ulcxboFccGvxqoA:
+ jmp .L_last_blocks_done_yEtjCjlkazyuxae
+.L_last_num_blocks_is_4_yEtjCjlkazyuxae:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_oDxtFmsewqDacsh
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_oDxtFmsewqDacsh
+
+.L_16_blocks_overflow_oDxtFmsewqDacsh:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_oDxtFmsewqDacsh:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_vugvwEfszCpbGFf
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_vugvwEfszCpbGFf
+.L_small_initial_partial_block_vugvwEfszCpbGFf:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_vugvwEfszCpbGFf:
+
+ orq %r8,%r8
+ je .L_after_reduction_vugvwEfszCpbGFf
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_vugvwEfszCpbGFf:
+ jmp .L_last_blocks_done_yEtjCjlkazyuxae
+.L_last_num_blocks_is_5_yEtjCjlkazyuxae:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_xkcGkGACdgyhfnk
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_xkcGkGACdgyhfnk
+
+.L_16_blocks_overflow_xkcGkGACdgyhfnk:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_xkcGkGACdgyhfnk:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %xmm29,%xmm3,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ztfihBbCfBvyfov
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ztfihBbCfBvyfov
+.L_small_initial_partial_block_ztfihBbCfBvyfov:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ztfihBbCfBvyfov:
+
+ orq %r8,%r8
+ je .L_after_reduction_ztfihBbCfBvyfov
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ztfihBbCfBvyfov:
+ jmp .L_last_blocks_done_yEtjCjlkazyuxae
+.L_last_num_blocks_is_6_yEtjCjlkazyuxae:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_xlFpBxEfzmCmemF
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_xlFpBxEfzmCmemF
+
+.L_16_blocks_overflow_xlFpBxEfzmCmemF:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_xlFpBxEfzmCmemF:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %ymm29,%ymm3,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_lxGrFedjGdoqthf
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_lxGrFedjGdoqthf
+.L_small_initial_partial_block_lxGrFedjGdoqthf:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_lxGrFedjGdoqthf:
+
+ orq %r8,%r8
+ je .L_after_reduction_lxGrFedjGdoqthf
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_lxGrFedjGdoqthf:
+ jmp .L_last_blocks_done_yEtjCjlkazyuxae
+.L_last_num_blocks_is_7_yEtjCjlkazyuxae:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_epvGyiwrthhFeDk
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_epvGyiwrthhFeDk
+
+.L_16_blocks_overflow_epvGyiwrthhFeDk:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_epvGyiwrthhFeDk:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_lDmxfclvwFuFuGn
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_lDmxfclvwFuFuGn
+.L_small_initial_partial_block_lDmxfclvwFuFuGn:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_lDmxfclvwFuFuGn:
+
+ orq %r8,%r8
+ je .L_after_reduction_lDmxfclvwFuFuGn
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_lDmxfclvwFuFuGn:
+ jmp .L_last_blocks_done_yEtjCjlkazyuxae
+.L_last_num_blocks_is_8_yEtjCjlkazyuxae:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_rlpnCjhhrhBjnBv
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_rlpnCjhhrhBjnBv
+
+.L_16_blocks_overflow_rlpnCjhhrhBjnBv:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_rlpnCjhhrhBjnBv:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_wCmlnxlmuAqfmku
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_wCmlnxlmuAqfmku
+.L_small_initial_partial_block_wCmlnxlmuAqfmku:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_wCmlnxlmuAqfmku:
+
+ orq %r8,%r8
+ je .L_after_reduction_wCmlnxlmuAqfmku
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_wCmlnxlmuAqfmku:
+ jmp .L_last_blocks_done_yEtjCjlkazyuxae
+.L_last_num_blocks_is_9_yEtjCjlkazyuxae:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_xGcqvoGCBlCvFjF
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_xGcqvoGCBlCvFjF
+
+.L_16_blocks_overflow_xGcqvoGCBlCvFjF:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_xGcqvoGCBlCvFjF:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %xmm29,%xmm4,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_uoAmEEFbAhessra
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_uoAmEEFbAhessra
+.L_small_initial_partial_block_uoAmEEFbAhessra:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_uoAmEEFbAhessra:
+
+ orq %r8,%r8
+ je .L_after_reduction_uoAmEEFbAhessra
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_uoAmEEFbAhessra:
+ jmp .L_last_blocks_done_yEtjCjlkazyuxae
+.L_last_num_blocks_is_10_yEtjCjlkazyuxae:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_lxwlEahBzykFvop
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_lxwlEahBzykFvop
+
+.L_16_blocks_overflow_lxwlEahBzykFvop:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_lxwlEahBzykFvop:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %ymm29,%ymm4,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ymGqwwcaDlhrzht
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ymGqwwcaDlhrzht
+.L_small_initial_partial_block_ymGqwwcaDlhrzht:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ymGqwwcaDlhrzht:
+
+ orq %r8,%r8
+ je .L_after_reduction_ymGqwwcaDlhrzht
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ymGqwwcaDlhrzht:
+ jmp .L_last_blocks_done_yEtjCjlkazyuxae
+.L_last_num_blocks_is_11_yEtjCjlkazyuxae:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_DwphDuBmGjsjgos
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_DwphDuBmGjsjgos
+
+.L_16_blocks_overflow_DwphDuBmGjsjgos:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_DwphDuBmGjsjgos:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_feadFtsqxgxipCv
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_feadFtsqxgxipCv
+.L_small_initial_partial_block_feadFtsqxgxipCv:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_feadFtsqxgxipCv:
+
+ orq %r8,%r8
+ je .L_after_reduction_feadFtsqxgxipCv
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_feadFtsqxgxipCv:
+ jmp .L_last_blocks_done_yEtjCjlkazyuxae
+.L_last_num_blocks_is_12_yEtjCjlkazyuxae:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_tysgGmlzxDCuchk
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_tysgGmlzxDCuchk
+
+.L_16_blocks_overflow_tysgGmlzxDCuchk:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_tysgGmlzxDCuchk:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_jdvGApyCGfzBhpb
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_jdvGApyCGfzBhpb
+.L_small_initial_partial_block_jdvGApyCGfzBhpb:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_jdvGApyCGfzBhpb:
+
+ orq %r8,%r8
+ je .L_after_reduction_jdvGApyCGfzBhpb
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_jdvGApyCGfzBhpb:
+ jmp .L_last_blocks_done_yEtjCjlkazyuxae
+.L_last_num_blocks_is_13_yEtjCjlkazyuxae:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_halbrdjstkvuogl
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_halbrdjstkvuogl
+
+.L_16_blocks_overflow_halbrdjstkvuogl:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_halbrdjstkvuogl:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %xmm29,%xmm5,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_pdxowiCmkqsedqs
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_pdxowiCmkqsedqs
+.L_small_initial_partial_block_pdxowiCmkqsedqs:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_pdxowiCmkqsedqs:
+
+ orq %r8,%r8
+ je .L_after_reduction_pdxowiCmkqsedqs
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_pdxowiCmkqsedqs:
+ jmp .L_last_blocks_done_yEtjCjlkazyuxae
+.L_last_num_blocks_is_14_yEtjCjlkazyuxae:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_qlykidCbnDmCaom
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_qlykidCbnDmCaom
+
+.L_16_blocks_overflow_qlykidCbnDmCaom:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_qlykidCbnDmCaom:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %ymm29,%ymm5,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_bCGuxGwffFmkxlq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_bCGuxGwffFmkxlq
+.L_small_initial_partial_block_bCGuxGwffFmkxlq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_bCGuxGwffFmkxlq:
+
+ orq %r8,%r8
+ je .L_after_reduction_bCGuxGwffFmkxlq
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_bCGuxGwffFmkxlq:
+ jmp .L_last_blocks_done_yEtjCjlkazyuxae
+.L_last_num_blocks_is_15_yEtjCjlkazyuxae:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_tvonowlqiEmbpqm
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_tvonowlqiEmbpqm
+
+.L_16_blocks_overflow_tvonowlqiEmbpqm:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_tvonowlqiEmbpqm:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_dlvvxnvpiqivacr
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_dlvvxnvpiqivacr
+.L_small_initial_partial_block_dlvvxnvpiqivacr:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_dlvvxnvpiqivacr:
+
+ orq %r8,%r8
+ je .L_after_reduction_dlvvxnvpiqivacr
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_dlvvxnvpiqivacr:
+ jmp .L_last_blocks_done_yEtjCjlkazyuxae
+.L_last_num_blocks_is_16_yEtjCjlkazyuxae:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_nqzepvdnfxxrztt
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_nqzepvdnfxxrztt
+
+.L_16_blocks_overflow_nqzepvdnfxxrztt:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_nqzepvdnfxxrztt:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_bBybkCcjjhhjGnD:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_bBybkCcjjhhjGnD:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_bBybkCcjjhhjGnD:
+ jmp .L_last_blocks_done_yEtjCjlkazyuxae
+.L_last_num_blocks_is_0_yEtjCjlkazyuxae:
+ vmovdqa64 768(%rsp),%zmm13
+ vpxorq %zmm14,%zmm13,%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 832(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpxorq %zmm10,%zmm4,%zmm26
+ vpxorq %zmm6,%zmm0,%zmm24
+ vpxorq %zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 896(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 960(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_yEtjCjlkazyuxae:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_pdDdEbGtmhbgzzj
+
+.L_message_below_equal_16_blocks_pdDdEbGtmhbgzzj:
+
+
+ movl %r8d,%r12d
+ addl $15,%r12d
+ shrl $4,%r12d
+ cmpq $8,%r12
+ je .L_small_initial_num_blocks_is_8_ewuGsEvelaCkirh
+ jl .L_small_initial_num_blocks_is_7_1_ewuGsEvelaCkirh
+
+
+ cmpq $12,%r12
+ je .L_small_initial_num_blocks_is_12_ewuGsEvelaCkirh
+ jl .L_small_initial_num_blocks_is_11_9_ewuGsEvelaCkirh
+
+
+ cmpq $16,%r12
+ je .L_small_initial_num_blocks_is_16_ewuGsEvelaCkirh
+ cmpq $15,%r12
+ je .L_small_initial_num_blocks_is_15_ewuGsEvelaCkirh
+ cmpq $14,%r12
+ je .L_small_initial_num_blocks_is_14_ewuGsEvelaCkirh
+ jmp .L_small_initial_num_blocks_is_13_ewuGsEvelaCkirh
+
+.L_small_initial_num_blocks_is_11_9_ewuGsEvelaCkirh:
+
+ cmpq $11,%r12
+ je .L_small_initial_num_blocks_is_11_ewuGsEvelaCkirh
+ cmpq $10,%r12
+ je .L_small_initial_num_blocks_is_10_ewuGsEvelaCkirh
+ jmp .L_small_initial_num_blocks_is_9_ewuGsEvelaCkirh
+
+.L_small_initial_num_blocks_is_7_1_ewuGsEvelaCkirh:
+ cmpq $4,%r12
+ je .L_small_initial_num_blocks_is_4_ewuGsEvelaCkirh
+ jl .L_small_initial_num_blocks_is_3_1_ewuGsEvelaCkirh
+
+ cmpq $7,%r12
+ je .L_small_initial_num_blocks_is_7_ewuGsEvelaCkirh
+ cmpq $6,%r12
+ je .L_small_initial_num_blocks_is_6_ewuGsEvelaCkirh
+ jmp .L_small_initial_num_blocks_is_5_ewuGsEvelaCkirh
+
+.L_small_initial_num_blocks_is_3_1_ewuGsEvelaCkirh:
+
+ cmpq $3,%r12
+ je .L_small_initial_num_blocks_is_3_ewuGsEvelaCkirh
+ cmpq $2,%r12
+ je .L_small_initial_num_blocks_is_2_ewuGsEvelaCkirh
+
+
+
+
+
+.L_small_initial_num_blocks_is_1_ewuGsEvelaCkirh:
+ vmovdqa64 SHUF_MASK(%rip),%xmm29
+ vpaddd ONE(%rip),%xmm2,%xmm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vpshufb %xmm29,%xmm0,%xmm0
+ vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %xmm15,%xmm0,%xmm0
+ vpxorq %xmm6,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %xmm29,%xmm0,%xmm6
+ vextracti32x4 $0,%zmm6,%xmm13
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_qAfhfumcaDjruco
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_qAfhfumcaDjruco
+.L_small_initial_partial_block_qAfhfumcaDjruco:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm13,%xmm14,%xmm14
+
+ jmp .L_after_reduction_qAfhfumcaDjruco
+.L_small_initial_compute_done_qAfhfumcaDjruco:
+.L_after_reduction_qAfhfumcaDjruco:
+ jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh
+.L_small_initial_num_blocks_is_2_ewuGsEvelaCkirh:
+ vmovdqa64 SHUF_MASK(%rip),%ymm29
+ vshufi64x2 $0,%ymm2,%ymm2,%ymm0
+ vpaddd ddq_add_1234(%rip),%ymm0,%ymm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vpshufb %ymm29,%ymm0,%ymm0
+ vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %ymm15,%ymm0,%ymm0
+ vpxorq %ymm6,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %ymm29,%ymm0,%ymm6
+ vextracti32x4 $1,%zmm6,%xmm13
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ubuBFaxsGrnemfF
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ubuBFaxsGrnemfF
+.L_small_initial_partial_block_ubuBFaxsGrnemfF:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ubuBFaxsGrnemfF:
+
+ orq %r8,%r8
+ je .L_after_reduction_ubuBFaxsGrnemfF
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_ubuBFaxsGrnemfF:
+ jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh
+.L_small_initial_num_blocks_is_3_ewuGsEvelaCkirh:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vpxorq %zmm6,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vextracti32x4 $2,%zmm6,%xmm13
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ndaAlsscEjpEkoq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ndaAlsscEjpEkoq
+.L_small_initial_partial_block_ndaAlsscEjpEkoq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ndaAlsscEjpEkoq:
+
+ orq %r8,%r8
+ je .L_after_reduction_ndaAlsscEjpEkoq
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_ndaAlsscEjpEkoq:
+ jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh
+.L_small_initial_num_blocks_is_4_ewuGsEvelaCkirh:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vpxorq %zmm6,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vextracti32x4 $3,%zmm6,%xmm13
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_jktiGoAbGDiFkaq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_jktiGoAbGDiFkaq
+.L_small_initial_partial_block_jktiGoAbGDiFkaq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_jktiGoAbGDiFkaq:
+
+ orq %r8,%r8
+ je .L_after_reduction_jktiGoAbGDiFkaq
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_jktiGoAbGDiFkaq:
+ jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh
+.L_small_initial_num_blocks_is_5_ewuGsEvelaCkirh:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %xmm15,%xmm3,%xmm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %xmm7,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %xmm29,%xmm3,%xmm7
+ vextracti32x4 $0,%zmm7,%xmm13
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_sEqEFsxphmltbmr
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_sEqEFsxphmltbmr
+.L_small_initial_partial_block_sEqEFsxphmltbmr:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_sEqEFsxphmltbmr:
+
+ orq %r8,%r8
+ je .L_after_reduction_sEqEFsxphmltbmr
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_sEqEFsxphmltbmr:
+ jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh
+.L_small_initial_num_blocks_is_6_ewuGsEvelaCkirh:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %ymm15,%ymm3,%ymm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %ymm7,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %ymm29,%ymm3,%ymm7
+ vextracti32x4 $1,%zmm7,%xmm13
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_slpocbFrpsoiAib
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_slpocbFrpsoiAib
+.L_small_initial_partial_block_slpocbFrpsoiAib:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_slpocbFrpsoiAib:
+
+ orq %r8,%r8
+ je .L_after_reduction_slpocbFrpsoiAib
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_slpocbFrpsoiAib:
+ jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh
+.L_small_initial_num_blocks_is_7_ewuGsEvelaCkirh:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vextracti32x4 $2,%zmm7,%xmm13
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_EEknGefGCzrkolw
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_EEknGefGCzrkolw
+.L_small_initial_partial_block_EEknGefGCzrkolw:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_EEknGefGCzrkolw:
+
+ orq %r8,%r8
+ je .L_after_reduction_EEknGefGCzrkolw
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_EEknGefGCzrkolw:
+ jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh
+.L_small_initial_num_blocks_is_8_ewuGsEvelaCkirh:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vextracti32x4 $3,%zmm7,%xmm13
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_qrgmfxpdazygeCe
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_qrgmfxpdazygeCe
+.L_small_initial_partial_block_qrgmfxpdazygeCe:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_qrgmfxpdazygeCe:
+
+ orq %r8,%r8
+ je .L_after_reduction_qrgmfxpdazygeCe
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_qrgmfxpdazygeCe:
+ jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh
+.L_small_initial_num_blocks_is_9_ewuGsEvelaCkirh:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %xmm15,%xmm4,%xmm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %xmm10,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %xmm29,%xmm4,%xmm10
+ vextracti32x4 $0,%zmm10,%xmm13
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ixdohjdwtejkAah
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ixdohjdwtejkAah
+.L_small_initial_partial_block_ixdohjdwtejkAah:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ixdohjdwtejkAah:
+
+ orq %r8,%r8
+ je .L_after_reduction_ixdohjdwtejkAah
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_ixdohjdwtejkAah:
+ jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh
+.L_small_initial_num_blocks_is_10_ewuGsEvelaCkirh:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %ymm15,%ymm4,%ymm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %ymm10,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %ymm29,%ymm4,%ymm10
+ vextracti32x4 $1,%zmm10,%xmm13
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_kdvEyrakCtlldFt
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_kdvEyrakCtlldFt
+.L_small_initial_partial_block_kdvEyrakCtlldFt:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_kdvEyrakCtlldFt:
+
+ orq %r8,%r8
+ je .L_after_reduction_kdvEyrakCtlldFt
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_kdvEyrakCtlldFt:
+ jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh
+.L_small_initial_num_blocks_is_11_ewuGsEvelaCkirh:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %zmm29,%zmm4,%zmm10
+ vextracti32x4 $2,%zmm10,%xmm13
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_cutxzwGkeBggDqx
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_cutxzwGkeBggDqx
+.L_small_initial_partial_block_cutxzwGkeBggDqx:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_cutxzwGkeBggDqx:
+
+ orq %r8,%r8
+ je .L_after_reduction_cutxzwGkeBggDqx
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_cutxzwGkeBggDqx:
+ jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh
+.L_small_initial_num_blocks_is_12_ewuGsEvelaCkirh:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %zmm29,%zmm4,%zmm10
+ vextracti32x4 $3,%zmm10,%xmm13
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_oqFnyhhlpeztanE
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 160(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_oqFnyhhlpeztanE
+.L_small_initial_partial_block_oqFnyhhlpeztanE:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_oqFnyhhlpeztanE:
+
+ orq %r8,%r8
+ je .L_after_reduction_oqFnyhhlpeztanE
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_oqFnyhhlpeztanE:
+ jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh
+.L_small_initial_num_blocks_is_13_ewuGsEvelaCkirh:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %xmm15,%xmm5,%xmm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %xmm11,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %zmm29,%zmm4,%zmm10
+ vpshufb %xmm29,%xmm5,%xmm11
+ vextracti32x4 $0,%zmm11,%xmm13
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_mloEfjmpzzECCFk
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 144(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_mloEfjmpzzECCFk
+.L_small_initial_partial_block_mloEfjmpzzECCFk:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 160(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_mloEfjmpzzECCFk:
+
+ orq %r8,%r8
+ je .L_after_reduction_mloEfjmpzzECCFk
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_mloEfjmpzzECCFk:
+ jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh
+.L_small_initial_num_blocks_is_14_ewuGsEvelaCkirh:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %ymm15,%ymm5,%ymm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %ymm11,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %zmm29,%zmm4,%zmm10
+ vpshufb %ymm29,%ymm5,%ymm11
+ vextracti32x4 $1,%zmm11,%xmm13
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_lokFbqCpdpswyxF
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 128(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_lokFbqCpdpswyxF
+.L_small_initial_partial_block_lokFbqCpdpswyxF:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 144(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_lokFbqCpdpswyxF:
+
+ orq %r8,%r8
+ je .L_after_reduction_lokFbqCpdpswyxF
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_lokFbqCpdpswyxF:
+ jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh
+.L_small_initial_num_blocks_is_15_ewuGsEvelaCkirh:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %zmm15,%zmm5,%zmm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %zmm11,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %zmm29,%zmm4,%zmm10
+ vpshufb %zmm29,%zmm5,%zmm11
+ vextracti32x4 $2,%zmm11,%xmm13
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_bmnsCorxdnheyAb
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 112(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_bmnsCorxdnheyAb
+.L_small_initial_partial_block_bmnsCorxdnheyAb:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 128(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_bmnsCorxdnheyAb:
+
+ orq %r8,%r8
+ je .L_after_reduction_bmnsCorxdnheyAb
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_bmnsCorxdnheyAb:
+ jmp .L_small_initial_blocks_encrypted_ewuGsEvelaCkirh
+.L_small_initial_num_blocks_is_16_ewuGsEvelaCkirh:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %zmm15,%zmm5,%zmm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %zmm11,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %zmm29,%zmm4,%zmm10
+ vpshufb %zmm29,%zmm5,%zmm11
+ vextracti32x4 $3,%zmm11,%xmm13
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_iGnlhalqoGhdkbv:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 112(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_iGnlhalqoGhdkbv:
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_iGnlhalqoGhdkbv:
+.L_small_initial_blocks_encrypted_ewuGsEvelaCkirh:
+.L_ghash_done_pdDdEbGtmhbgzzj:
+ vmovdqu64 %xmm2,0(%rsi)
+ vmovdqu64 %xmm14,64(%rsi)
+.L_enc_dec_done_pdDdEbGtmhbgzzj:
+ jmp .Lexit_gcm_encrypt
+.align 32
+.Laes_gcm_encrypt_192_avx512:
+ orq %r8,%r8
+ je .L_enc_dec_done_tFbkipsuzBAeEGF
+ xorq %r14,%r14
+ vmovdqu64 64(%rsi),%xmm14
+
+ movq (%rdx),%r11
+ orq %r11,%r11
+ je .L_partial_block_done_jdCiCmGpmghGfDo
+ movl $16,%r10d
+ leaq byte_len_to_mask_table(%rip),%r12
+ cmpq %r10,%r8
+ cmovcq %r8,%r10
+ kmovw (%r12,%r10,2),%k1
+ vmovdqu8 (%rcx),%xmm0{%k1}{z}
+
+ vmovdqu64 16(%rsi),%xmm3
+ vmovdqu64 336(%rsi),%xmm4
+
+
+
+ leaq SHIFT_MASK(%rip),%r12
+ addq %r11,%r12
+ vmovdqu64 (%r12),%xmm5
+ vpshufb %xmm5,%xmm3,%xmm3
+ vpxorq %xmm0,%xmm3,%xmm3
+
+
+ leaq (%r8,%r11,1),%r13
+ subq $16,%r13
+ jge .L_no_extra_mask_jdCiCmGpmghGfDo
+ subq %r13,%r12
+.L_no_extra_mask_jdCiCmGpmghGfDo:
+
+
+
+ vmovdqu64 16(%r12),%xmm0
+ vpand %xmm0,%xmm3,%xmm3
+ vpshufb SHUF_MASK(%rip),%xmm3,%xmm3
+ vpshufb %xmm5,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm14,%xmm14
+ cmpq $0,%r13
+ jl .L_partial_incomplete_jdCiCmGpmghGfDo
+
+ vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7
+ vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10
+ vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11
+ vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14
+ vpxorq %xmm11,%xmm14,%xmm14
+
+ vpsrldq $8,%xmm14,%xmm11
+ vpslldq $8,%xmm14,%xmm14
+ vpxorq %xmm11,%xmm7,%xmm7
+ vpxorq %xmm10,%xmm14,%xmm14
+
+
+
+ vmovdqu64 POLY2(%rip),%xmm11
+
+ vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10
+ vpslldq $8,%xmm10,%xmm10
+ vpxorq %xmm10,%xmm14,%xmm14
+
+
+
+ vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10
+ vpsrldq $4,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+
+ vpternlogq $0x96,%xmm10,%xmm7,%xmm14
+
+ movq $0,(%rdx)
+
+ movq %r11,%r12
+ movq $16,%r11
+ subq %r12,%r11
+ jmp .L_enc_dec_done_jdCiCmGpmghGfDo
+
+.L_partial_incomplete_jdCiCmGpmghGfDo:
+ addq %r8,(%rdx)
+ movq %r8,%r11
+
+.L_enc_dec_done_jdCiCmGpmghGfDo:
+
+
+ leaq byte_len_to_mask_table(%rip),%r12
+ kmovw (%r12,%r11,2),%k1
+ vmovdqu64 %xmm14,64(%rsi)
+
+ vpshufb SHUF_MASK(%rip),%xmm3,%xmm3
+ vpshufb %xmm5,%xmm3,%xmm3
+ movq %r9,%r12
+ vmovdqu8 %xmm3,(%r12){%k1}
+.L_partial_block_done_jdCiCmGpmghGfDo:
+ vmovdqu64 0(%rsi),%xmm2
+ subq %r11,%r8
+ je .L_enc_dec_done_tFbkipsuzBAeEGF
+ cmpq $256,%r8
+ jbe .L_message_below_equal_16_blocks_tFbkipsuzBAeEGF
+
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vmovdqa64 ddq_addbe_4444(%rip),%zmm27
+ vmovdqa64 ddq_addbe_1234(%rip),%zmm28
+
+
+
+
+
+
+ vmovd %xmm2,%r15d
+ andl $255,%r15d
+
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpshufb %zmm29,%zmm2,%zmm2
+
+
+
+ cmpb $240,%r15b
+ jae .L_next_16_overflow_pFvraahbaffuyct
+ vpaddd %zmm28,%zmm2,%zmm7
+ vpaddd %zmm27,%zmm7,%zmm10
+ vpaddd %zmm27,%zmm10,%zmm11
+ vpaddd %zmm27,%zmm11,%zmm12
+ jmp .L_next_16_ok_pFvraahbaffuyct
+.L_next_16_overflow_pFvraahbaffuyct:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm12
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
+ vpaddd %zmm12,%zmm7,%zmm10
+ vpaddd %zmm12,%zmm10,%zmm11
+ vpaddd %zmm12,%zmm11,%zmm12
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vpshufb %zmm29,%zmm12,%zmm12
+.L_next_16_ok_pFvraahbaffuyct:
+ vshufi64x2 $255,%zmm12,%zmm12,%zmm2
+ addb $16,%r15b
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm0
+ vmovdqu8 64(%rcx,%r11,1),%zmm3
+ vmovdqu8 128(%rcx,%r11,1),%zmm4
+ vmovdqu8 192(%rcx,%r11,1),%zmm5
+
+
+ vbroadcastf64x2 0(%rdi),%zmm6
+ vpxorq %zmm6,%zmm7,%zmm7
+ vpxorq %zmm6,%zmm10,%zmm10
+ vpxorq %zmm6,%zmm11,%zmm11
+ vpxorq %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 16(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 32(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 48(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 64(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 80(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 96(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 112(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 128(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 144(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 160(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 176(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 192(%rdi),%zmm6
+ vaesenclast %zmm6,%zmm7,%zmm7
+ vaesenclast %zmm6,%zmm10,%zmm10
+ vaesenclast %zmm6,%zmm11,%zmm11
+ vaesenclast %zmm6,%zmm12,%zmm12
+
+
+ vpxorq %zmm0,%zmm7,%zmm7
+ vpxorq %zmm3,%zmm10,%zmm10
+ vpxorq %zmm4,%zmm11,%zmm11
+ vpxorq %zmm5,%zmm12,%zmm12
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm7,0(%r10,%r11,1)
+ vmovdqu8 %zmm10,64(%r10,%r11,1)
+ vmovdqu8 %zmm11,128(%r10,%r11,1)
+ vmovdqu8 %zmm12,192(%r10,%r11,1)
+
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vpshufb %zmm29,%zmm12,%zmm12
+ vmovdqa64 %zmm7,768(%rsp)
+ vmovdqa64 %zmm10,832(%rsp)
+ vmovdqa64 %zmm11,896(%rsp)
+ vmovdqa64 %zmm12,960(%rsp)
+ testq %r14,%r14
+ jnz .L_skip_hkeys_precomputation_yenzjhtagtpjklu
+
+ vmovdqu64 288(%rsi),%zmm0
+ vmovdqu64 %zmm0,704(%rsp)
+
+ vmovdqu64 224(%rsi),%zmm3
+ vmovdqu64 %zmm3,640(%rsp)
+
+
+ vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
+
+ vmovdqu64 160(%rsi),%zmm4
+ vmovdqu64 %zmm4,576(%rsp)
+
+ vmovdqu64 96(%rsi),%zmm5
+ vmovdqu64 %zmm5,512(%rsp)
+.L_skip_hkeys_precomputation_yenzjhtagtpjklu:
+ cmpq $512,%r8
+ jb .L_message_below_32_blocks_tFbkipsuzBAeEGF
+
+
+
+ cmpb $240,%r15b
+ jae .L_next_16_overflow_enCpGzovkqzhwzc
+ vpaddd %zmm28,%zmm2,%zmm7
+ vpaddd %zmm27,%zmm7,%zmm10
+ vpaddd %zmm27,%zmm10,%zmm11
+ vpaddd %zmm27,%zmm11,%zmm12
+ jmp .L_next_16_ok_enCpGzovkqzhwzc
+.L_next_16_overflow_enCpGzovkqzhwzc:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm12
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
+ vpaddd %zmm12,%zmm7,%zmm10
+ vpaddd %zmm12,%zmm10,%zmm11
+ vpaddd %zmm12,%zmm11,%zmm12
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vpshufb %zmm29,%zmm12,%zmm12
+.L_next_16_ok_enCpGzovkqzhwzc:
+ vshufi64x2 $255,%zmm12,%zmm12,%zmm2
+ addb $16,%r15b
+
+ vmovdqu8 256(%rcx,%r11,1),%zmm0
+ vmovdqu8 320(%rcx,%r11,1),%zmm3
+ vmovdqu8 384(%rcx,%r11,1),%zmm4
+ vmovdqu8 448(%rcx,%r11,1),%zmm5
+
+
+ vbroadcastf64x2 0(%rdi),%zmm6
+ vpxorq %zmm6,%zmm7,%zmm7
+ vpxorq %zmm6,%zmm10,%zmm10
+ vpxorq %zmm6,%zmm11,%zmm11
+ vpxorq %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 16(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 32(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 48(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 64(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 80(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 96(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 112(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 128(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 144(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 160(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 176(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 192(%rdi),%zmm6
+ vaesenclast %zmm6,%zmm7,%zmm7
+ vaesenclast %zmm6,%zmm10,%zmm10
+ vaesenclast %zmm6,%zmm11,%zmm11
+ vaesenclast %zmm6,%zmm12,%zmm12
+
+
+ vpxorq %zmm0,%zmm7,%zmm7
+ vpxorq %zmm3,%zmm10,%zmm10
+ vpxorq %zmm4,%zmm11,%zmm11
+ vpxorq %zmm5,%zmm12,%zmm12
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm7,256(%r10,%r11,1)
+ vmovdqu8 %zmm10,320(%r10,%r11,1)
+ vmovdqu8 %zmm11,384(%r10,%r11,1)
+ vmovdqu8 %zmm12,448(%r10,%r11,1)
+
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vpshufb %zmm29,%zmm12,%zmm12
+ vmovdqa64 %zmm7,1024(%rsp)
+ vmovdqa64 %zmm10,1088(%rsp)
+ vmovdqa64 %zmm11,1152(%rsp)
+ vmovdqa64 %zmm12,1216(%rsp)
+ testq %r14,%r14
+ jnz .L_skip_hkeys_precomputation_jqGvtcbttbiaDxy
+ vmovdqu64 640(%rsp),%zmm3
+
+
+ vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
+
+ vmovdqu64 576(%rsp),%zmm4
+ vmovdqu64 512(%rsp),%zmm5
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,448(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,384(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,320(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,256(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,192(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,128(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,64(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,0(%rsp)
+.L_skip_hkeys_precomputation_jqGvtcbttbiaDxy:
+ movq $1,%r14
+ addq $512,%r11
+ subq $512,%r8
+
+ cmpq $768,%r8
+ jb .L_no_more_big_nblocks_tFbkipsuzBAeEGF
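+ /* Main loop: every iteration encrypts 48 blocks (three groups of 16, 768
+ bytes) of counter stream with VAES, interleaving the AES rounds with
+ VPCLMULQDQ GHASH of the 48 ciphertext blocks buffered on the previous pass.
+ The cmpb $240 checks catch an imminent low-byte counter wrap and fall back
+ to a byte-swapped 32-bit increment. */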
+.L_encrypt_big_nblocks_tFbkipsuzBAeEGF:
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_jddBEjFhbsBAmmE
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_jddBEjFhbsBAmmE
+.L_16_blocks_overflow_jddBEjFhbsBAmmE:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_jddBEjFhbsBAmmE:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm15,%zmm10,%zmm26
+ vpxorq %zmm12,%zmm6,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1)
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqa64 %zmm0,1280(%rsp)
+ vmovdqa64 %zmm3,1344(%rsp)
+ vmovdqa64 %zmm4,1408(%rsp)
+ vmovdqa64 %zmm5,1472(%rsp)
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_idpAqFqszdhymlh
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_idpAqFqszdhymlh
+.L_16_blocks_overflow_idpAqFqszdhymlh:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_idpAqFqszdhymlh:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 256(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 320(%rsp),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 384(%rsp),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 448(%rsp),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 256(%rcx,%r11,1),%zmm17
+ vmovdqu8 320(%rcx,%r11,1),%zmm19
+ vmovdqu8 384(%rcx,%r11,1),%zmm20
+ vmovdqu8 448(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vpternlogq $0x96,%zmm12,%zmm6,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,256(%r10,%r11,1)
+ vmovdqu8 %zmm3,320(%r10,%r11,1)
+ vmovdqu8 %zmm4,384(%r10,%r11,1)
+ vmovdqu8 %zmm5,448(%r10,%r11,1)
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqa64 %zmm0,768(%rsp)
+ vmovdqa64 %zmm3,832(%rsp)
+ vmovdqa64 %zmm4,896(%rsp)
+ vmovdqa64 %zmm5,960(%rsp)
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_EFGAxoobnnGywoA
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_EFGAxoobnnGywoA
+.L_16_blocks_overflow_EFGAxoobnnGywoA:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_EFGAxoobnnGywoA:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 512(%rcx,%r11,1),%zmm17
+ vmovdqu8 576(%rcx,%r11,1),%zmm19
+ vmovdqu8 640(%rcx,%r11,1),%zmm20
+ vmovdqu8 704(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+
+
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpternlogq $0x96,%zmm15,%zmm12,%zmm6
+ vpxorq %zmm24,%zmm6,%zmm6
+ vpternlogq $0x96,%zmm10,%zmm13,%zmm7
+ vpxorq %zmm25,%zmm7,%zmm7
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vextracti64x4 $1,%zmm6,%ymm12
+ vpxorq %ymm12,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm12
+ vpxorq %xmm12,%xmm6,%xmm6
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm6
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,512(%r10,%r11,1)
+ vmovdqu8 %zmm3,576(%r10,%r11,1)
+ vmovdqu8 %zmm4,640(%r10,%r11,1)
+ vmovdqu8 %zmm5,704(%r10,%r11,1)
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqa64 %zmm0,1024(%rsp)
+ vmovdqa64 %zmm3,1088(%rsp)
+ vmovdqa64 %zmm4,1152(%rsp)
+ vmovdqa64 %zmm5,1216(%rsp)
+ vmovdqa64 %zmm6,%zmm14
+
+ addq $768,%r11
+ subq $768,%r8
+ cmpq $768,%r8
+ jae .L_encrypt_big_nblocks_tFbkipsuzBAeEGF
+
+.L_no_more_big_nblocks_tFbkipsuzBAeEGF:
+
+ cmpq $512,%r8
+ jae .L_encrypt_32_blocks_tFbkipsuzBAeEGF
+
+ cmpq $256,%r8
+ jae .L_encrypt_16_blocks_tFbkipsuzBAeEGF
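+ /* Fewer than 16 full blocks of input remain, but ciphertext buffered on the
+ stack still has to be hashed: fold the blocks at 768(%rsp) onward into the
+ GHASH accumulators (zmm24/zmm25/zmm26) against key powers selected via
+ %rbx, then dispatch on the number of leftover blocks. */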
+.L_encrypt_0_blocks_ghash_32_tFbkipsuzBAeEGF:
+ movl %r8d,%r10d
+ andl $~15,%r10d
+ movl $256,%ebx
+ subl %r10d,%ebx
+ vmovdqa64 768(%rsp),%zmm13
+ vpxorq %zmm14,%zmm13,%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 832(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpxorq %zmm10,%zmm4,%zmm26
+ vpxorq %zmm6,%zmm0,%zmm24
+ vpxorq %zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 896(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 960(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ addl $256,%ebx
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
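+ /* %r10d = (remaining bytes + 15) / 16; the branch tree below picks a
+ dedicated tail routine for 0..16 leftover blocks. */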
+ je .L_last_num_blocks_is_0_uFjiwCxmGEbfAFa
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_uFjiwCxmGEbfAFa
+ jb .L_last_num_blocks_is_7_1_uFjiwCxmGEbfAFa
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_uFjiwCxmGEbfAFa
+ jb .L_last_num_blocks_is_11_9_uFjiwCxmGEbfAFa
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_uFjiwCxmGEbfAFa
+ ja .L_last_num_blocks_is_16_uFjiwCxmGEbfAFa
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_uFjiwCxmGEbfAFa
+ jmp .L_last_num_blocks_is_13_uFjiwCxmGEbfAFa
+
+.L_last_num_blocks_is_11_9_uFjiwCxmGEbfAFa:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_uFjiwCxmGEbfAFa
+ ja .L_last_num_blocks_is_11_uFjiwCxmGEbfAFa
+ jmp .L_last_num_blocks_is_9_uFjiwCxmGEbfAFa
+
+.L_last_num_blocks_is_7_1_uFjiwCxmGEbfAFa:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_uFjiwCxmGEbfAFa
+ jb .L_last_num_blocks_is_3_1_uFjiwCxmGEbfAFa
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_uFjiwCxmGEbfAFa
+ je .L_last_num_blocks_is_6_uFjiwCxmGEbfAFa
+ jmp .L_last_num_blocks_is_5_uFjiwCxmGEbfAFa
+
+.L_last_num_blocks_is_3_1_uFjiwCxmGEbfAFa:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_uFjiwCxmGEbfAFa
+ je .L_last_num_blocks_is_2_uFjiwCxmGEbfAFa
+.L_last_num_blocks_is_1_uFjiwCxmGEbfAFa:
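+ /* Tail handler (the same pattern repeats for each block count): build a byte
+ mask from byte64_len_to_mask_table, encrypt the final counter block(s) while
+ folding the buffered ciphertext into GHASH, do a masked load/XOR/store of
+ the partial data, then either finish the GHASH reduction (full final block)
+ or record the partial block and byte count (via 16(%rsi) and (%rdx)) for a
+ later call. */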
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_lxdjeCteCnqypuE
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_lxdjeCteCnqypuE
+
+.L_16_blocks_overflow_lxdjeCteCnqypuE:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_lxdjeCteCnqypuE:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %xmm29,%xmm0,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_xokBAycvbkevxfE
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_xokBAycvbkevxfE
+.L_small_initial_partial_block_xokBAycvbkevxfE:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm0
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
+ vpslldq $8,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm3
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
+ vpsrldq $4,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm4,%xmm14
+
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_xokBAycvbkevxfE
+.L_small_initial_compute_done_xokBAycvbkevxfE:
+.L_after_reduction_xokBAycvbkevxfE:
+ jmp .L_last_blocks_done_uFjiwCxmGEbfAFa
+.L_last_num_blocks_is_2_uFjiwCxmGEbfAFa:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_tqAdjGAqcxebbGj
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_tqAdjGAqcxebbGj
+
+.L_16_blocks_overflow_tqAdjGAqcxebbGj:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_tqAdjGAqcxebbGj:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %ymm29,%ymm0,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_izsjBCvaDivghqe
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_izsjBCvaDivghqe
+.L_small_initial_partial_block_izsjBCvaDivghqe:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_izsjBCvaDivghqe:
+
+ orq %r8,%r8
+ je .L_after_reduction_izsjBCvaDivghqe
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_izsjBCvaDivghqe:
+ jmp .L_last_blocks_done_uFjiwCxmGEbfAFa
+.L_last_num_blocks_is_3_uFjiwCxmGEbfAFa:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_woFDjhpeDAEyeol
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_woFDjhpeDAEyeol
+
+.L_16_blocks_overflow_woFDjhpeDAEyeol:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_woFDjhpeDAEyeol:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_AqCFGymmhaacFDC
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_AqCFGymmhaacFDC
+.L_small_initial_partial_block_AqCFGymmhaacFDC:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_AqCFGymmhaacFDC:
+
+ orq %r8,%r8
+ je .L_after_reduction_AqCFGymmhaacFDC
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_AqCFGymmhaacFDC:
+ jmp .L_last_blocks_done_uFjiwCxmGEbfAFa
+.L_last_num_blocks_is_4_uFjiwCxmGEbfAFa:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_BGnDrgfdztzmBGB
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_BGnDrgfdztzmBGB
+
+.L_16_blocks_overflow_BGnDrgfdztzmBGB:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_BGnDrgfdztzmBGB:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_uClitrxBorxFyuy
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_uClitrxBorxFyuy
+.L_small_initial_partial_block_uClitrxBorxFyuy:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_uClitrxBorxFyuy:
+
+ orq %r8,%r8
+ je .L_after_reduction_uClitrxBorxFyuy
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_uClitrxBorxFyuy:
+ jmp .L_last_blocks_done_uFjiwCxmGEbfAFa
+.L_last_num_blocks_is_5_uFjiwCxmGEbfAFa:
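+ /* From five leftover blocks upward the counters are split across a full
+ 64-byte ZMM group plus a partially masked register for the remainder, the
+ mask being derived from the byte length minus 64. */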
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_wDxAmusyyammDow
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_wDxAmusyyammDow
+
+.L_16_blocks_overflow_wDxAmusyyammDow:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_wDxAmusyyammDow:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %xmm29,%xmm3,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_bosguzEFytqmFeq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_bosguzEFytqmFeq
+.L_small_initial_partial_block_bosguzEFytqmFeq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_bosguzEFytqmFeq:
+
+ orq %r8,%r8
+ je .L_after_reduction_bosguzEFytqmFeq
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_bosguzEFytqmFeq:
+ jmp .L_last_blocks_done_uFjiwCxmGEbfAFa
+.L_last_num_blocks_is_6_uFjiwCxmGEbfAFa:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_sCzAAgptixxBvip
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_sCzAAgptixxBvip
+
+.L_16_blocks_overflow_sCzAAgptixxBvip:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_sCzAAgptixxBvip:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %ymm29,%ymm3,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_FuuimCCibwFkhfx
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_FuuimCCibwFkhfx
+.L_small_initial_partial_block_FuuimCCibwFkhfx:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_FuuimCCibwFkhfx:
+
+ orq %r8,%r8
+ je .L_after_reduction_FuuimCCibwFkhfx
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_FuuimCCibwFkhfx:
+ jmp .L_last_blocks_done_uFjiwCxmGEbfAFa
+.L_last_num_blocks_is_7_uFjiwCxmGEbfAFa:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_gqtukwixiotlvjE
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_gqtukwixiotlvjE
+
+.L_16_blocks_overflow_gqtukwixiotlvjE:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_gqtukwixiotlvjE:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
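+/* %r8 now holds the length of the final block: a full 16-byte block falls
+   through (a zero is stored at (%rdx)), while a short block jumps to the
+   partial-block path, which records the leftover length at (%rdx) and saves
+   %xmm11 at 16(%rsi). */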
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_CBkCykisCgChyAc
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_CBkCykisCgChyAc
+.L_small_initial_partial_block_CBkCykisCgChyAc:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_CBkCykisCgChyAc:
+
+ orq %r8,%r8
+ je .L_after_reduction_CBkCykisCgChyAc
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_CBkCykisCgChyAc:
+ jmp .L_last_blocks_done_uFjiwCxmGEbfAFa
+.L_last_num_blocks_is_8_uFjiwCxmGEbfAFa:
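+/* Tail of 8 blocks: two full counter vectors; the second 64-byte load/store
+   still goes through %k1. */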
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_Fznlwzcrirmvwxw
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_Fznlwzcrirmvwxw
+
+.L_16_blocks_overflow_Fznlwzcrirmvwxw:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_Fznlwzcrirmvwxw:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_BszjzgFAnDlqhlr
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_BszjzgFAnDlqhlr
+.L_small_initial_partial_block_BszjzgFAnDlqhlr:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_BszjzgFAnDlqhlr:
+
+ orq %r8,%r8
+ je .L_after_reduction_BszjzgFAnDlqhlr
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_BszjzgFAnDlqhlr:
+ jmp .L_last_blocks_done_uFjiwCxmGEbfAFa
+.L_last_num_blocks_is_9_uFjiwCxmGEbfAFa:
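+/* Tail of 9 blocks: the third counter vector is a single XMM block,
+   loaded/stored under %k1. */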
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_nhcklxyaumrucBe
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_nhcklxyaumrucBe
+
+.L_16_blocks_overflow_nhcklxyaumrucBe:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_nhcklxyaumrucBe:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %xmm29,%xmm4,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_pofwkmqmhmpaDas
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_pofwkmqmhmpaDas
+.L_small_initial_partial_block_pofwkmqmhmpaDas:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_pofwkmqmhmpaDas:
+
+ orq %r8,%r8
+ je .L_after_reduction_pofwkmqmhmpaDas
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_pofwkmqmhmpaDas:
+ jmp .L_last_blocks_done_uFjiwCxmGEbfAFa
+.L_last_num_blocks_is_10_uFjiwCxmGEbfAFa:
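+/* Tail of 10 blocks: the third counter vector is a YMM pair, loaded/stored
+   under %k1. */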
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_DpcajcwBdqbwuEm
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_DpcajcwBdqbwuEm
+
+.L_16_blocks_overflow_DpcajcwBdqbwuEm:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_DpcajcwBdqbwuEm:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %ymm29,%ymm4,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_GoickdlxxlCgCmn
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_GoickdlxxlCgCmn
+.L_small_initial_partial_block_GoickdlxxlCgCmn:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_GoickdlxxlCgCmn:
+
+ orq %r8,%r8
+ je .L_after_reduction_GoickdlxxlCgCmn
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_GoickdlxxlCgCmn:
+ jmp .L_last_blocks_done_uFjiwCxmGEbfAFa
+.L_last_num_blocks_is_11_uFjiwCxmGEbfAFa:
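+/* Tail of 11 blocks: three blocks live in zmm4; the third 64-byte chunk is
+   loaded/stored under %k1. */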
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_CzDGlzuDofcmftE
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_CzDGlzuDofcmftE
+
+.L_16_blocks_overflow_CzDGlzuDofcmftE:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_CzDGlzuDofcmftE:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_AfGwErudvfGFkBd
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_AfGwErudvfGFkBd
+.L_small_initial_partial_block_AfGwErudvfGFkBd:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_AfGwErudvfGFkBd:
+
+ orq %r8,%r8
+ je .L_after_reduction_AfGwErudvfGFkBd
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_AfGwErudvfGFkBd:
+ jmp .L_last_blocks_done_uFjiwCxmGEbfAFa
+.L_last_num_blocks_is_12_uFjiwCxmGEbfAFa:
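+/* Tail of 12 blocks: three full ZMM counter vectors. */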
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_vFgtdmiGGceAuup
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_vFgtdmiGGceAuup
+
+.L_16_blocks_overflow_vFgtdmiGGceAuup:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_vFgtdmiGGceAuup:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_hAugcokFGbhzzvx
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_hAugcokFGbhzzvx
+.L_small_initial_partial_block_hAugcokFGbhzzvx:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_hAugcokFGbhzzvx:
+
+ orq %r8,%r8
+ je .L_after_reduction_hAugcokFGbhzzvx
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_hAugcokFGbhzzvx:
+ jmp .L_last_blocks_done_uFjiwCxmGEbfAFa
+.L_last_num_blocks_is_13_uFjiwCxmGEbfAFa:
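+/* Tail of 13 blocks: the fourth counter vector is a single XMM block,
+   loaded/stored under %k1. */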
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_ApsFAharcbobqcA
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_ApsFAharcbobqcA
+
+.L_16_blocks_overflow_ApsFAharcbobqcA:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_ApsFAharcbobqcA:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %xmm29,%xmm5,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_DkdftFtqeikgrDl
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_DkdftFtqeikgrDl
+.L_small_initial_partial_block_DkdftFtqeikgrDl:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_DkdftFtqeikgrDl:
+
+ orq %r8,%r8
+ je .L_after_reduction_DkdftFtqeikgrDl
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_DkdftFtqeikgrDl:
+ jmp .L_last_blocks_done_uFjiwCxmGEbfAFa
+.L_last_num_blocks_is_14_uFjiwCxmGEbfAFa:
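+/* Tail of 14 blocks: the fourth counter vector is a YMM pair, loaded/stored
+   under %k1. */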
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_amhEEFGkEmcdfyg
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_amhEEFGkEmcdfyg
+
+.L_16_blocks_overflow_amhEEFGkEmcdfyg:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_amhEEFGkEmcdfyg:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %ymm29,%ymm5,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_DsqdvjyjtgiDdjk
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_DsqdvjyjtgiDdjk
+.L_small_initial_partial_block_DsqdvjyjtgiDdjk:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_DsqdvjyjtgiDdjk:
+
+ orq %r8,%r8
+ je .L_after_reduction_DsqdvjyjtgiDdjk
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_DsqdvjyjtgiDdjk:
+ jmp .L_last_blocks_done_uFjiwCxmGEbfAFa
+.L_last_num_blocks_is_15_uFjiwCxmGEbfAFa:
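+/* Tail of 15 blocks: three blocks live in zmm5; the fourth 64-byte chunk is
+   loaded/stored under %k1. */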
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_GyCmDqABriaxjxf
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_GyCmDqABriaxjxf
+
+.L_16_blocks_overflow_GyCmDqABriaxjxf:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_GyCmDqABriaxjxf:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_pGoiupmcfezlCDb
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_pGoiupmcfezlCDb
+.L_small_initial_partial_block_pGoiupmcfezlCDb:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_pGoiupmcfezlCDb:
+
+ orq %r8,%r8
+ je .L_after_reduction_pGoiupmcfezlCDb
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_pGoiupmcfezlCDb:
+ jmp .L_last_blocks_done_uFjiwCxmGEbfAFa
+.L_last_num_blocks_is_16_uFjiwCxmGEbfAFa:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_imDahqossjyafvG
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_imDahqossjyafvG
+
+.L_16_blocks_overflow_imDahqossjyafvG:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_imDahqossjyafvG:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_stpCjmquwqkvlEu:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_stpCjmquwqkvlEu:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_stpCjmquwqkvlEu:
+ jmp .L_last_blocks_done_uFjiwCxmGEbfAFa
+.L_last_num_blocks_is_0_uFjiwCxmGEbfAFa:
+ vmovdqa64 1024(%rsp),%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1088(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1152(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1216(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_uFjiwCxmGEbfAFa:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_tFbkipsuzBAeEGF
+.L_encrypt_32_blocks_tFbkipsuzBAeEGF:
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_AGsgmucxjDjGrat
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_AGsgmucxjDjGrat
+.L_16_blocks_overflow_AGsgmucxjDjGrat:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_AGsgmucxjDjGrat:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm15,%zmm10,%zmm26
+ vpxorq %zmm12,%zmm6,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1)
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqa64 %zmm0,1280(%rsp)
+ vmovdqa64 %zmm3,1344(%rsp)
+ vmovdqa64 %zmm4,1408(%rsp)
+ vmovdqa64 %zmm5,1472(%rsp)
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_miCaCzFgEsdrxCb
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_miCaCzFgEsdrxCb
+.L_16_blocks_overflow_miCaCzFgEsdrxCb:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_miCaCzFgEsdrxCb:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 256(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 320(%rsp),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 384(%rsp),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 448(%rsp),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 256(%rcx,%r11,1),%zmm17
+ vmovdqu8 320(%rcx,%r11,1),%zmm19
+ vmovdqu8 384(%rcx,%r11,1),%zmm20
+ vmovdqu8 448(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vpternlogq $0x96,%zmm12,%zmm6,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,256(%r10,%r11,1)
+ vmovdqu8 %zmm3,320(%r10,%r11,1)
+ vmovdqu8 %zmm4,384(%r10,%r11,1)
+ vmovdqu8 %zmm5,448(%r10,%r11,1)
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqa64 %zmm0,768(%rsp)
+ vmovdqa64 %zmm3,832(%rsp)
+ vmovdqa64 %zmm4,896(%rsp)
+ vmovdqa64 %zmm5,960(%rsp)
+ vmovdqa64 1280(%rsp),%zmm13
+ vmovdqu64 512(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1344(%rsp),%zmm13
+ vmovdqu64 576(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1408(%rsp),%zmm13
+ vmovdqu64 640(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1472(%rsp),%zmm13
+ vmovdqu64 704(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+ subq $512,%r8
+ addq $512,%r11
+ movl %r8d,%r10d
+ andl $~15,%r10d
+ movl $512,%ebx
+ subl %r10d,%ebx
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_jcdFbiukBEavFGE
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_jcdFbiukBEavFGE
+ jb .L_last_num_blocks_is_7_1_jcdFbiukBEavFGE
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_jcdFbiukBEavFGE
+ jb .L_last_num_blocks_is_11_9_jcdFbiukBEavFGE
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_jcdFbiukBEavFGE
+ ja .L_last_num_blocks_is_16_jcdFbiukBEavFGE
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_jcdFbiukBEavFGE
+ jmp .L_last_num_blocks_is_13_jcdFbiukBEavFGE
+
+.L_last_num_blocks_is_11_9_jcdFbiukBEavFGE:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_jcdFbiukBEavFGE
+ ja .L_last_num_blocks_is_11_jcdFbiukBEavFGE
+ jmp .L_last_num_blocks_is_9_jcdFbiukBEavFGE
+
+.L_last_num_blocks_is_7_1_jcdFbiukBEavFGE:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_jcdFbiukBEavFGE
+ jb .L_last_num_blocks_is_3_1_jcdFbiukBEavFGE
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_jcdFbiukBEavFGE
+ je .L_last_num_blocks_is_6_jcdFbiukBEavFGE
+ jmp .L_last_num_blocks_is_5_jcdFbiukBEavFGE
+
+.L_last_num_blocks_is_3_1_jcdFbiukBEavFGE:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_jcdFbiukBEavFGE
+ je .L_last_num_blocks_is_2_jcdFbiukBEavFGE
+.L_last_num_blocks_is_1_jcdFbiukBEavFGE:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_vxxnDcnfkrwsdjp
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_vxxnDcnfkrwsdjp
+
+.L_16_blocks_overflow_vxxnDcnfkrwsdjp:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_vxxnDcnfkrwsdjp:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %xmm29,%xmm0,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_rjcmxpckvzxcizE
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_rjcmxpckvzxcizE
+.L_small_initial_partial_block_rjcmxpckvzxcizE:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm0
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
+ vpslldq $8,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm3
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
+ vpsrldq $4,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm4,%xmm14
+
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_rjcmxpckvzxcizE
+.L_small_initial_compute_done_rjcmxpckvzxcizE:
+.L_after_reduction_rjcmxpckvzxcizE:
+ jmp .L_last_blocks_done_jcdFbiukBEavFGE
+.L_last_num_blocks_is_2_jcdFbiukBEavFGE:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_uhDoynhcngzlgum
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_uhDoynhcngzlgum
+
+.L_16_blocks_overflow_uhDoynhcngzlgum:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_uhDoynhcngzlgum:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %ymm29,%ymm0,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_uukoDhouhnxbvBs
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_uukoDhouhnxbvBs
+.L_small_initial_partial_block_uukoDhouhnxbvBs:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_uukoDhouhnxbvBs:
+
+ orq %r8,%r8
+ je .L_after_reduction_uukoDhouhnxbvBs
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_uukoDhouhnxbvBs:
+ jmp .L_last_blocks_done_jcdFbiukBEavFGE
+.L_last_num_blocks_is_3_jcdFbiukBEavFGE:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_uqbvqDscdfzCyvo
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_uqbvqDscdfzCyvo
+
+.L_16_blocks_overflow_uqbvqDscdfzCyvo:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_uqbvqDscdfzCyvo:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_AzBBwGideFptDwf
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_AzBBwGideFptDwf
+.L_small_initial_partial_block_AzBBwGideFptDwf:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_AzBBwGideFptDwf:
+
+ orq %r8,%r8
+ je .L_after_reduction_AzBBwGideFptDwf
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_AzBBwGideFptDwf:
+ jmp .L_last_blocks_done_jcdFbiukBEavFGE
+.L_last_num_blocks_is_4_jcdFbiukBEavFGE:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_kyFozElpAosldpA
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_kyFozElpAosldpA
+
+.L_16_blocks_overflow_kyFozElpAosldpA:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_kyFozElpAosldpA:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_cyDyceqdwxjBzzg
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_cyDyceqdwxjBzzg
+.L_small_initial_partial_block_cyDyceqdwxjBzzg:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_cyDyceqdwxjBzzg:
+
+ orq %r8,%r8
+ je .L_after_reduction_cyDyceqdwxjBzzg
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_cyDyceqdwxjBzzg:
+ jmp .L_last_blocks_done_jcdFbiukBEavFGE
+.L_last_num_blocks_is_5_jcdFbiukBEavFGE:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_lFprftfcjilzpav
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_lFprftfcjilzpav
+
+.L_16_blocks_overflow_lFprftfcjilzpav:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_lFprftfcjilzpav:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %xmm29,%xmm3,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_pGBzEdwhzcavspd
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_pGBzEdwhzcavspd
+.L_small_initial_partial_block_pGBzEdwhzcavspd:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_pGBzEdwhzcavspd:
+
+ orq %r8,%r8
+ je .L_after_reduction_pGBzEdwhzcavspd
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_pGBzEdwhzcavspd:
+ jmp .L_last_blocks_done_jcdFbiukBEavFGE
+.L_last_num_blocks_is_6_jcdFbiukBEavFGE:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_GkzjxqDyGdedavo
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_GkzjxqDyGdedavo
+
+.L_16_blocks_overflow_GkzjxqDyGdedavo:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_GkzjxqDyGdedavo:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %ymm29,%ymm3,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_owicnDDzeheGwrB
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_owicnDDzeheGwrB
+.L_small_initial_partial_block_owicnDDzeheGwrB:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_owicnDDzeheGwrB:
+
+ orq %r8,%r8
+ je .L_after_reduction_owicnDDzeheGwrB
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_owicnDDzeheGwrB:
+ jmp .L_last_blocks_done_jcdFbiukBEavFGE
+.L_last_num_blocks_is_7_jcdFbiukBEavFGE:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_CaCztGdjulthntc
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_CaCztGdjulthntc
+
+.L_16_blocks_overflow_CaCztGdjulthntc:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_CaCztGdjulthntc:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_davwqylkhqewajl
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_davwqylkhqewajl
+.L_small_initial_partial_block_davwqylkhqewajl:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_davwqylkhqewajl:
+
+ orq %r8,%r8
+ je .L_after_reduction_davwqylkhqewajl
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_davwqylkhqewajl:
+ jmp .L_last_blocks_done_jcdFbiukBEavFGE
+.L_last_num_blocks_is_8_jcdFbiukBEavFGE:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_GbaqslwpsaFuoyz
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_GbaqslwpsaFuoyz
+
+.L_16_blocks_overflow_GbaqslwpsaFuoyz:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_GbaqslwpsaFuoyz:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_FelclvrviuByirb
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_FelclvrviuByirb
+.L_small_initial_partial_block_FelclvrviuByirb:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_FelclvrviuByirb:
+
+ orq %r8,%r8
+ je .L_after_reduction_FelclvrviuByirb
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_FelclvrviuByirb:
+ jmp .L_last_blocks_done_jcdFbiukBEavFGE
+.L_last_num_blocks_is_9_jcdFbiukBEavFGE:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_AplsctBswkCkEgg
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_AplsctBswkCkEgg
+
+.L_16_blocks_overflow_AplsctBswkCkEgg:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_AplsctBswkCkEgg:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %xmm29,%xmm4,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_jtFtADjqFyogvlv
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_jtFtADjqFyogvlv
+.L_small_initial_partial_block_jtFtADjqFyogvlv:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_jtFtADjqFyogvlv:
+
+ orq %r8,%r8
+ je .L_after_reduction_jtFtADjqFyogvlv
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_jtFtADjqFyogvlv:
+ jmp .L_last_blocks_done_jcdFbiukBEavFGE
+.L_last_num_blocks_is_10_jcdFbiukBEavFGE:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_sGofikfdvCsyufv
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_sGofikfdvCsyufv
+
+.L_16_blocks_overflow_sGofikfdvCsyufv:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_sGofikfdvCsyufv:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %ymm29,%ymm4,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_tcfdrpyrpqxjGcq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_tcfdrpyrpqxjGcq
+.L_small_initial_partial_block_tcfdrpyrpqxjGcq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_tcfdrpyrpqxjGcq:
+
+ orq %r8,%r8
+ je .L_after_reduction_tcfdrpyrpqxjGcq
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_tcfdrpyrpqxjGcq:
+ jmp .L_last_blocks_done_jcdFbiukBEavFGE
+.L_last_num_blocks_is_11_jcdFbiukBEavFGE:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_toAwkfvytGCcuzd
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_toAwkfvytGCcuzd
+
+.L_16_blocks_overflow_toAwkfvytGCcuzd:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_toAwkfvytGCcuzd:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_wlcDxsmFdsaDbFp
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_wlcDxsmFdsaDbFp
+.L_small_initial_partial_block_wlcDxsmFdsaDbFp:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_wlcDxsmFdsaDbFp:
+
+ orq %r8,%r8
+ je .L_after_reduction_wlcDxsmFdsaDbFp
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_wlcDxsmFdsaDbFp:
+ jmp .L_last_blocks_done_jcdFbiukBEavFGE
+.L_last_num_blocks_is_12_jcdFbiukBEavFGE:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_teGFdCBFbFbgpyu
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_teGFdCBFbFbgpyu
+
+.L_16_blocks_overflow_teGFdCBFbFbgpyu:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_teGFdCBFbFbgpyu:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_hapodhDjogGiCkb
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_hapodhDjogGiCkb
+.L_small_initial_partial_block_hapodhDjogGiCkb:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_hapodhDjogGiCkb:
+
+ orq %r8,%r8
+ je .L_after_reduction_hapodhDjogGiCkb
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_hapodhDjogGiCkb:
+ jmp .L_last_blocks_done_jcdFbiukBEavFGE
+.L_last_num_blocks_is_13_jcdFbiukBEavFGE:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_EcrGhzkACEdjiEA
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_EcrGhzkACEdjiEA
+
+.L_16_blocks_overflow_EcrGhzkACEdjiEA:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_EcrGhzkACEdjiEA:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %xmm29,%xmm5,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_lgpADhokDilDmjB
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_lgpADhokDilDmjB
+.L_small_initial_partial_block_lgpADhokDilDmjB:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_lgpADhokDilDmjB:
+
+ orq %r8,%r8
+ je .L_after_reduction_lgpADhokDilDmjB
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_lgpADhokDilDmjB:
+ jmp .L_last_blocks_done_jcdFbiukBEavFGE
+.L_last_num_blocks_is_14_jcdFbiukBEavFGE:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_vfAlEigAGAFFgAm
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_vfAlEigAGAFFgAm
+
+.L_16_blocks_overflow_vfAlEigAGAFFgAm:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_vfAlEigAGAFFgAm:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %ymm29,%ymm5,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_jvziCnlsAiEavam
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_jvziCnlsAiEavam
+.L_small_initial_partial_block_jvziCnlsAiEavam:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_jvziCnlsAiEavam:
+
+ orq %r8,%r8
+ je .L_after_reduction_jvziCnlsAiEavam
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_jvziCnlsAiEavam:
+ jmp .L_last_blocks_done_jcdFbiukBEavFGE
+.L_last_num_blocks_is_15_jcdFbiukBEavFGE:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_vDsgChtGCDEtEvr
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_vDsgChtGCDEtEvr
+
+.L_16_blocks_overflow_vDsgChtGCDEtEvr:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_vDsgChtGCDEtEvr:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_aaoEnbdnBGewaEG
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_aaoEnbdnBGewaEG
+.L_small_initial_partial_block_aaoEnbdnBGewaEG:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_aaoEnbdnBGewaEG:
+
+ orq %r8,%r8
+ je .L_after_reduction_aaoEnbdnBGewaEG
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_aaoEnbdnBGewaEG:
+ jmp .L_last_blocks_done_jcdFbiukBEavFGE
+.L_last_num_blocks_is_16_jcdFbiukBEavFGE:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_rGdvngzaeGtrlsf
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_rGdvngzaeGtrlsf
+
+.L_16_blocks_overflow_rGdvngzaeGtrlsf:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_rGdvngzaeGtrlsf:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_llADlmtFjlEejxe:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_llADlmtFjlEejxe:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_llADlmtFjlEejxe:
+ jmp .L_last_blocks_done_jcdFbiukBEavFGE
+.L_last_num_blocks_is_0_jcdFbiukBEavFGE:
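+/* Nothing left to encrypt: fold the 16 ciphertext blocks buffered by the
+ * previous iteration into the hash and perform the final reduction via POLY2
+ * into %xmm14. */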
+ vmovdqa64 768(%rsp),%zmm13
+ vpxorq %zmm14,%zmm13,%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 832(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpxorq %zmm10,%zmm4,%zmm26
+ vpxorq %zmm6,%zmm0,%zmm24
+ vpxorq %zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 896(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 960(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_jcdFbiukBEavFGE:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_tFbkipsuzBAeEGF
+.L_encrypt_16_blocks_tFbkipsuzBAeEGF:
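+/* Steady-state 16-block step: encrypt the next 16 counter blocks while
+ * hashing the ciphertext buffered on the stack by the previous iteration,
+ * write the new ciphertext out, and stash its byte-reflected form at
+ * 1280(%rsp) onward for the next GHASH pass. */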
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_AfdGcFddyowgCfD
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_AfdGcFddyowgCfD
+.L_16_blocks_overflow_AfdGcFddyowgCfD:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_AfdGcFddyowgCfD:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm15,%zmm10,%zmm26
+ vpxorq %zmm12,%zmm6,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1)
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqa64 %zmm0,1280(%rsp)
+ vmovdqa64 %zmm3,1344(%rsp)
+ vmovdqa64 %zmm4,1408(%rsp)
+ vmovdqa64 %zmm5,1472(%rsp)
+ vmovdqa64 1024(%rsp),%zmm13
+ vmovdqu64 256(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1088(%rsp),%zmm13
+ vmovdqu64 320(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1152(%rsp),%zmm13
+ vmovdqu64 384(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1216(%rsp),%zmm13
+ vmovdqu64 448(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ subq $256,%r8
+ addq $256,%r11
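+/* 256 bytes consumed; compute ceil(remaining/16) and branch to the matching
+ * 0..16-block tail handler. */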
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_DkxrwjzcAFtwGmv
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_DkxrwjzcAFtwGmv
+ jb .L_last_num_blocks_is_7_1_DkxrwjzcAFtwGmv
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_DkxrwjzcAFtwGmv
+ jb .L_last_num_blocks_is_11_9_DkxrwjzcAFtwGmv
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_DkxrwjzcAFtwGmv
+ ja .L_last_num_blocks_is_16_DkxrwjzcAFtwGmv
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_DkxrwjzcAFtwGmv
+ jmp .L_last_num_blocks_is_13_DkxrwjzcAFtwGmv
+
+.L_last_num_blocks_is_11_9_DkxrwjzcAFtwGmv:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_DkxrwjzcAFtwGmv
+ ja .L_last_num_blocks_is_11_DkxrwjzcAFtwGmv
+ jmp .L_last_num_blocks_is_9_DkxrwjzcAFtwGmv
+
+.L_last_num_blocks_is_7_1_DkxrwjzcAFtwGmv:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_DkxrwjzcAFtwGmv
+ jb .L_last_num_blocks_is_3_1_DkxrwjzcAFtwGmv
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_DkxrwjzcAFtwGmv
+ je .L_last_num_blocks_is_6_DkxrwjzcAFtwGmv
+ jmp .L_last_num_blocks_is_5_DkxrwjzcAFtwGmv
+
+.L_last_num_blocks_is_3_1_DkxrwjzcAFtwGmv:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_DkxrwjzcAFtwGmv
+ je .L_last_num_blocks_is_2_DkxrwjzcAFtwGmv
+.L_last_num_blocks_is_1_DkxrwjzcAFtwGmv:
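+/* Single remaining block: encrypt one counter block with xmm-width AES rounds
+ * while completing the GHASH of the previous 16 blocks, then perform a masked
+ * load/XOR/store of the final 1..16 bytes. */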
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_AeBdutzBBGkrhww
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_AeBdutzBBGkrhww
+
+.L_16_blocks_overflow_AeBdutzBBGkrhww:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_AeBdutzBBGkrhww:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %xmm31,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %xmm29,%xmm0,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_sanDChDEAsbDbDy
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_sanDChDEAsbDbDy
+.L_small_initial_partial_block_sanDChDEAsbDbDy:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_sanDChDEAsbDbDy
+.L_small_initial_compute_done_sanDChDEAsbDbDy:
+.L_after_reduction_sanDChDEAsbDbDy:
+ jmp .L_last_blocks_done_DkxrwjzcAFtwGmv
+.L_last_num_blocks_is_2_DkxrwjzcAFtwGmv:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_zEobAyflaqodkxt
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_zEobAyflaqodkxt
+
+.L_16_blocks_overflow_zEobAyflaqodkxt:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_zEobAyflaqodkxt:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %ymm31,%ymm0,%ymm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %ymm29,%ymm0,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_btzmvhkGEADbAkx
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_btzmvhkGEADbAkx
+.L_small_initial_partial_block_btzmvhkGEADbAkx:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_btzmvhkGEADbAkx:
+
+ orq %r8,%r8
+ je .L_after_reduction_btzmvhkGEADbAkx
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_btzmvhkGEADbAkx:
+ jmp .L_last_blocks_done_DkxrwjzcAFtwGmv
+.L_last_num_blocks_is_3_DkxrwjzcAFtwGmv:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_gcfAxoFzqodzGEz
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_gcfAxoFzqodzGEz
+
+.L_16_blocks_overflow_gcfAxoFzqodzGEz:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_gcfAxoFzqodzGEz:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_EasBgBicpEglkiw
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_EasBgBicpEglkiw
+.L_small_initial_partial_block_EasBgBicpEglkiw:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_EasBgBicpEglkiw:
+
+ orq %r8,%r8
+ je .L_after_reduction_EasBgBicpEglkiw
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_EasBgBicpEglkiw:
+ jmp .L_last_blocks_done_DkxrwjzcAFtwGmv
+.L_last_num_blocks_is_4_DkxrwjzcAFtwGmv:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_manbGbfyvfFsrnl
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_manbGbfyvfFsrnl
+
+.L_16_blocks_overflow_manbGbfyvfFsrnl:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_manbGbfyvfFsrnl:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_kwtpvxfGBCymBsb
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_kwtpvxfGBCymBsb
+.L_small_initial_partial_block_kwtpvxfGBCymBsb:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_kwtpvxfGBCymBsb:
+
+ orq %r8,%r8
+ je .L_after_reduction_kwtpvxfGBCymBsb
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_kwtpvxfGBCymBsb:
+ jmp .L_last_blocks_done_DkxrwjzcAFtwGmv
+.L_last_num_blocks_is_5_DkxrwjzcAFtwGmv:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_fjElnuxjdEdFEct
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_fjElnuxjdEdFEct
+
+.L_16_blocks_overflow_fjElnuxjdEdFEct:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_fjElnuxjdEdFEct:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %xmm29,%xmm3,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_DbgCAmgvxscuoqv
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_DbgCAmgvxscuoqv
+.L_small_initial_partial_block_DbgCAmgvxscuoqv:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_DbgCAmgvxscuoqv:
+
+ orq %r8,%r8
+ je .L_after_reduction_DbgCAmgvxscuoqv
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_DbgCAmgvxscuoqv:
+ jmp .L_last_blocks_done_DkxrwjzcAFtwGmv
+.L_last_num_blocks_is_6_DkxrwjzcAFtwGmv:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_tfrvDdzahijbwmB
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_tfrvDdzahijbwmB
+
+.L_16_blocks_overflow_tfrvDdzahijbwmB:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_tfrvDdzahijbwmB:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %ymm29,%ymm3,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_uEnwhzkdGwAplec
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_uEnwhzkdGwAplec
+.L_small_initial_partial_block_uEnwhzkdGwAplec:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_uEnwhzkdGwAplec:
+
+ orq %r8,%r8
+ je .L_after_reduction_uEnwhzkdGwAplec
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_uEnwhzkdGwAplec:
+ jmp .L_last_blocks_done_DkxrwjzcAFtwGmv
+.L_last_num_blocks_is_7_DkxrwjzcAFtwGmv:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_qidtflFxFddzhgg
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_qidtflFxFddzhgg
+
+.L_16_blocks_overflow_qidtflFxFddzhgg:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_qidtflFxFddzhgg:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_qvicAgCgBiisxsr
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_qvicAgCgBiisxsr
+.L_small_initial_partial_block_qvicAgCgBiisxsr:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_qvicAgCgBiisxsr:
+
+ orq %r8,%r8
+ je .L_after_reduction_qvicAgCgBiisxsr
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_qvicAgCgBiisxsr:
+ jmp .L_last_blocks_done_DkxrwjzcAFtwGmv
+.L_last_num_blocks_is_8_DkxrwjzcAFtwGmv:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_luzsesiwggypeey
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_luzsesiwggypeey
+
+.L_16_blocks_overflow_luzsesiwggypeey:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_luzsesiwggypeey:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_dhgyBxajscbfima
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_dhgyBxajscbfima
+.L_small_initial_partial_block_dhgyBxajscbfima:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_dhgyBxajscbfima:
+
+ orq %r8,%r8
+ je .L_after_reduction_dhgyBxajscbfima
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_dhgyBxajscbfima:
+ jmp .L_last_blocks_done_DkxrwjzcAFtwGmv
+.L_last_num_blocks_is_9_DkxrwjzcAFtwGmv:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_EkueqaGdhDjCdgp
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_EkueqaGdhDjCdgp
+
+.L_16_blocks_overflow_EkueqaGdhDjCdgp:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_EkueqaGdhDjCdgp:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %xmm29,%xmm4,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_heqAoqbbuAkcyrx
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_heqAoqbbuAkcyrx
+.L_small_initial_partial_block_heqAoqbbuAkcyrx:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_heqAoqbbuAkcyrx:
+
+ orq %r8,%r8
+ je .L_after_reduction_heqAoqbbuAkcyrx
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_heqAoqbbuAkcyrx:
+ jmp .L_last_blocks_done_DkxrwjzcAFtwGmv
+.L_last_num_blocks_is_10_DkxrwjzcAFtwGmv:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_wvgCfboudsrmujp
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_wvgCfboudsrmujp
+
+.L_16_blocks_overflow_wvgCfboudsrmujp:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_wvgCfboudsrmujp:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %ymm29,%ymm4,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_yxeqEqghwAplnqh
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_yxeqEqghwAplnqh
+.L_small_initial_partial_block_yxeqEqghwAplnqh:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_yxeqEqghwAplnqh:
+
+ orq %r8,%r8
+ je .L_after_reduction_yxeqEqghwAplnqh
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_yxeqEqghwAplnqh:
+ jmp .L_last_blocks_done_DkxrwjzcAFtwGmv
+.L_last_num_blocks_is_11_DkxrwjzcAFtwGmv:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_cwemdvzqaqrBmvF
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_cwemdvzqaqrBmvF
+
+.L_16_blocks_overflow_cwemdvzqaqrBmvF:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_cwemdvzqaqrBmvF:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_tngolGfEmxmwAAg
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_tngolGfEmxmwAAg
+.L_small_initial_partial_block_tngolGfEmxmwAAg:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_tngolGfEmxmwAAg:
+
+ orq %r8,%r8
+ je .L_after_reduction_tngolGfEmxmwAAg
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_tngolGfEmxmwAAg:
+ jmp .L_last_blocks_done_DkxrwjzcAFtwGmv
+.L_last_num_blocks_is_12_DkxrwjzcAFtwGmv:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_viscCxhaitpgcDa
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_viscCxhaitpgcDa
+
+.L_16_blocks_overflow_viscCxhaitpgcDa:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_viscCxhaitpgcDa:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_AEGqAevCpluaCEe
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_AEGqAevCpluaCEe
+.L_small_initial_partial_block_AEGqAevCpluaCEe:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_AEGqAevCpluaCEe:
+
+ orq %r8,%r8
+ je .L_after_reduction_AEGqAevCpluaCEe
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_AEGqAevCpluaCEe:
+ jmp .L_last_blocks_done_DkxrwjzcAFtwGmv
+.L_last_num_blocks_is_13_DkxrwjzcAFtwGmv:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_aswqypGGFyocuvD
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_aswqypGGFyocuvD
+
+.L_16_blocks_overflow_aswqypGGFyocuvD:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_aswqypGGFyocuvD:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %xmm29,%xmm5,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ddibpDBalvcbdjr
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ddibpDBalvcbdjr
+.L_small_initial_partial_block_ddibpDBalvcbdjr:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ddibpDBalvcbdjr:
+
+ orq %r8,%r8
+ je .L_after_reduction_ddibpDBalvcbdjr
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ddibpDBalvcbdjr:
+ jmp .L_last_blocks_done_DkxrwjzcAFtwGmv
+.L_last_num_blocks_is_14_DkxrwjzcAFtwGmv:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_uDoedupEeCpfBar
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_uDoedupEeCpfBar
+
+.L_16_blocks_overflow_uDoedupEeCpfBar:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_uDoedupEeCpfBar:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %ymm29,%ymm5,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_AilxjDdBvvoizqE
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_AilxjDdBvvoizqE
+.L_small_initial_partial_block_AilxjDdBvvoizqE:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_AilxjDdBvvoizqE:
+
+ orq %r8,%r8
+ je .L_after_reduction_AilxjDdBvvoizqE
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_AilxjDdBvvoizqE:
+ jmp .L_last_blocks_done_DkxrwjzcAFtwGmv
+.L_last_num_blocks_is_15_DkxrwjzcAFtwGmv:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_qsiCcemvFCbgltw
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_qsiCcemvFCbgltw
+
+.L_16_blocks_overflow_qsiCcemvFCbgltw:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_qsiCcemvFCbgltw:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_uvFingxredipaxs
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_uvFingxredipaxs
+.L_small_initial_partial_block_uvFingxredipaxs:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_uvFingxredipaxs:
+
+ orq %r8,%r8
+ je .L_after_reduction_uvFingxredipaxs
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_uvFingxredipaxs:
+ jmp .L_last_blocks_done_DkxrwjzcAFtwGmv
+.L_last_num_blocks_is_16_DkxrwjzcAFtwGmv:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_pAbgwDdgnghCfey
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_pAbgwDdgnghCfey
+
+.L_16_blocks_overflow_pAbgwDdgnghCfey:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_pAbgwDdgnghCfey:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_fFkawEbFoBxjEyl:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_fFkawEbFoBxjEyl:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_fFkawEbFoBxjEyl:
+ jmp .L_last_blocks_done_DkxrwjzcAFtwGmv
+.L_last_num_blocks_is_0_DkxrwjzcAFtwGmv:
+ vmovdqa64 1280(%rsp),%zmm13
+ vmovdqu64 512(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1344(%rsp),%zmm13
+ vmovdqu64 576(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1408(%rsp),%zmm13
+ vmovdqu64 640(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1472(%rsp),%zmm13
+ vmovdqu64 704(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_DkxrwjzcAFtwGmv:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_tFbkipsuzBAeEGF
+
+.L_message_below_32_blocks_tFbkipsuzBAeEGF:
+
+
+ subq $256,%r8
+ addq $256,%r11
+ movl %r8d,%r10d
+ testq %r14,%r14
+ jnz .L_skip_hkeys_precomputation_lpEjyDrFbrgBuyj
+ vmovdqu64 640(%rsp),%zmm3
+
+
+ vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
+
+ vmovdqu64 576(%rsp),%zmm4
+ vmovdqu64 512(%rsp),%zmm5
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,448(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,384(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,320(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,256(%rsp)
+.L_skip_hkeys_precomputation_lpEjyDrFbrgBuyj:
+ movq $1,%r14
+ andl $~15,%r10d
+ movl $512,%ebx
+ subl %r10d,%ebx
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_wmGtzaxjkAduAzk
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_wmGtzaxjkAduAzk
+ jb .L_last_num_blocks_is_7_1_wmGtzaxjkAduAzk
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_wmGtzaxjkAduAzk
+ jb .L_last_num_blocks_is_11_9_wmGtzaxjkAduAzk
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_wmGtzaxjkAduAzk
+ ja .L_last_num_blocks_is_16_wmGtzaxjkAduAzk
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_wmGtzaxjkAduAzk
+ jmp .L_last_num_blocks_is_13_wmGtzaxjkAduAzk
+
+.L_last_num_blocks_is_11_9_wmGtzaxjkAduAzk:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_wmGtzaxjkAduAzk
+ ja .L_last_num_blocks_is_11_wmGtzaxjkAduAzk
+ jmp .L_last_num_blocks_is_9_wmGtzaxjkAduAzk
+
+.L_last_num_blocks_is_7_1_wmGtzaxjkAduAzk:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_wmGtzaxjkAduAzk
+ jb .L_last_num_blocks_is_3_1_wmGtzaxjkAduAzk
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_wmGtzaxjkAduAzk
+ je .L_last_num_blocks_is_6_wmGtzaxjkAduAzk
+ jmp .L_last_num_blocks_is_5_wmGtzaxjkAduAzk
+
+.L_last_num_blocks_is_3_1_wmGtzaxjkAduAzk:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_wmGtzaxjkAduAzk
+ je .L_last_num_blocks_is_2_wmGtzaxjkAduAzk
+.L_last_num_blocks_is_1_wmGtzaxjkAduAzk:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_zAppBdlpFnqjcjn
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_zAppBdlpFnqjcjn
+
+.L_16_blocks_overflow_zAppBdlpFnqjcjn:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_zAppBdlpFnqjcjn:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %xmm29,%xmm0,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ohletviGGDnsqsh
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ohletviGGDnsqsh
+.L_small_initial_partial_block_ohletviGGDnsqsh:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm0
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
+ vpslldq $8,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm3
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
+ vpsrldq $4,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm4,%xmm14
+
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_ohletviGGDnsqsh
+.L_small_initial_compute_done_ohletviGGDnsqsh:
+.L_after_reduction_ohletviGGDnsqsh:
+ jmp .L_last_blocks_done_wmGtzaxjkAduAzk
+.L_last_num_blocks_is_2_wmGtzaxjkAduAzk:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_bApGhpvksEbgnlq
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_bApGhpvksEbgnlq
+
+.L_16_blocks_overflow_bApGhpvksEbgnlq:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_bApGhpvksEbgnlq:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %ymm29,%ymm0,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_atfqpoawbrCaGCo
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_atfqpoawbrCaGCo
+.L_small_initial_partial_block_atfqpoawbrCaGCo:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_atfqpoawbrCaGCo:
+
+ orq %r8,%r8
+ je .L_after_reduction_atfqpoawbrCaGCo
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_atfqpoawbrCaGCo:
+ jmp .L_last_blocks_done_wmGtzaxjkAduAzk
+.L_last_num_blocks_is_3_wmGtzaxjkAduAzk:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_ngmcavmrDqtqduc
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_ngmcavmrDqtqduc
+
+.L_16_blocks_overflow_ngmcavmrDqtqduc:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_ngmcavmrDqtqduc:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_EgjBqgvkBgauzsF
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_EgjBqgvkBgauzsF
+.L_small_initial_partial_block_EgjBqgvkBgauzsF:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_EgjBqgvkBgauzsF:
+
+ orq %r8,%r8
+ je .L_after_reduction_EgjBqgvkBgauzsF
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_EgjBqgvkBgauzsF:
+ jmp .L_last_blocks_done_wmGtzaxjkAduAzk
+.L_last_num_blocks_is_4_wmGtzaxjkAduAzk:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_oDoDxdeeEEpoaof
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_oDoDxdeeEEpoaof
+
+.L_16_blocks_overflow_oDoDxdeeEEpoaof:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_oDoDxdeeEEpoaof:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_akFyBqpssGEhllv
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_akFyBqpssGEhllv
+.L_small_initial_partial_block_akFyBqpssGEhllv:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_akFyBqpssGEhllv:
+
+ orq %r8,%r8
+ je .L_after_reduction_akFyBqpssGEhllv
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_akFyBqpssGEhllv:
+ jmp .L_last_blocks_done_wmGtzaxjkAduAzk
+.L_last_num_blocks_is_5_wmGtzaxjkAduAzk:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_vwvElrjpjpxAvis
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_vwvElrjpjpxAvis
+
+.L_16_blocks_overflow_vwvElrjpjpxAvis:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_vwvElrjpjpxAvis:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %xmm29,%xmm3,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_DFFzfAbyBGFnoDn
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_DFFzfAbyBGFnoDn
+.L_small_initial_partial_block_DFFzfAbyBGFnoDn:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_DFFzfAbyBGFnoDn:
+
+ orq %r8,%r8
+ je .L_after_reduction_DFFzfAbyBGFnoDn
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_DFFzfAbyBGFnoDn:
+ jmp .L_last_blocks_done_wmGtzaxjkAduAzk
+.L_last_num_blocks_is_6_wmGtzaxjkAduAzk:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_vyDvhDFpixkDdnk
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_vyDvhDFpixkDdnk
+
+.L_16_blocks_overflow_vyDvhDFpixkDdnk:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_vyDvhDFpixkDdnk:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %ymm29,%ymm3,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_FEocggExrFlAoic
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_FEocggExrFlAoic
+.L_small_initial_partial_block_FEocggExrFlAoic:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_FEocggExrFlAoic:
+
+ orq %r8,%r8
+ je .L_after_reduction_FEocggExrFlAoic
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_FEocggExrFlAoic:
+ jmp .L_last_blocks_done_wmGtzaxjkAduAzk
+.L_last_num_blocks_is_7_wmGtzaxjkAduAzk:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_fvtxctukrBFoshm
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_fvtxctukrBFoshm
+
+.L_16_blocks_overflow_fvtxctukrBFoshm:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_fvtxctukrBFoshm:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_zsgnBgnADqqaFdG
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_zsgnBgnADqqaFdG
+.L_small_initial_partial_block_zsgnBgnADqqaFdG:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_zsgnBgnADqqaFdG:
+
+ orq %r8,%r8
+ je .L_after_reduction_zsgnBgnADqqaFdG
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_zsgnBgnADqqaFdG:
+ jmp .L_last_blocks_done_wmGtzaxjkAduAzk
+.L_last_num_blocks_is_8_wmGtzaxjkAduAzk:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_ACyFnxEijEcdofC
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_ACyFnxEijEcdofC
+
+.L_16_blocks_overflow_ACyFnxEijEcdofC:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_ACyFnxEijEcdofC:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_pinsyEqvsAdoiak
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_pinsyEqvsAdoiak
+.L_small_initial_partial_block_pinsyEqvsAdoiak:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_pinsyEqvsAdoiak:
+
+ orq %r8,%r8
+ je .L_after_reduction_pinsyEqvsAdoiak
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_pinsyEqvsAdoiak:
+ jmp .L_last_blocks_done_wmGtzaxjkAduAzk
+.L_last_num_blocks_is_9_wmGtzaxjkAduAzk:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_AhlgEzovddtvDon
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_AhlgEzovddtvDon
+
+.L_16_blocks_overflow_AhlgEzovddtvDon:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_AhlgEzovddtvDon:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %xmm29,%xmm4,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_dgkfebGqcuDCjgt
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_dgkfebGqcuDCjgt
+.L_small_initial_partial_block_dgkfebGqcuDCjgt:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_dgkfebGqcuDCjgt:
+
+ orq %r8,%r8
+ je .L_after_reduction_dgkfebGqcuDCjgt
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_dgkfebGqcuDCjgt:
+ jmp .L_last_blocks_done_wmGtzaxjkAduAzk
+.L_last_num_blocks_is_10_wmGtzaxjkAduAzk:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_AcoEnlwuyyjhDuq
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_AcoEnlwuyyjhDuq
+
+.L_16_blocks_overflow_AcoEnlwuyyjhDuq:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_AcoEnlwuyyjhDuq:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %ymm29,%ymm4,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_upsmGyaxeoyuGwq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_upsmGyaxeoyuGwq
+.L_small_initial_partial_block_upsmGyaxeoyuGwq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_upsmGyaxeoyuGwq:
+
+ orq %r8,%r8
+ je .L_after_reduction_upsmGyaxeoyuGwq
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_upsmGyaxeoyuGwq:
+ jmp .L_last_blocks_done_wmGtzaxjkAduAzk
+.L_last_num_blocks_is_11_wmGtzaxjkAduAzk:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_coDokyrbzujjnFG
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_coDokyrbzujjnFG
+
+.L_16_blocks_overflow_coDokyrbzujjnFG:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_coDokyrbzujjnFG:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_dtFFjiEElouyrlF
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_dtFFjiEElouyrlF
+.L_small_initial_partial_block_dtFFjiEElouyrlF:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_dtFFjiEElouyrlF:
+
+ orq %r8,%r8
+ je .L_after_reduction_dtFFjiEElouyrlF
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_dtFFjiEElouyrlF:
+ jmp .L_last_blocks_done_wmGtzaxjkAduAzk
+.L_last_num_blocks_is_12_wmGtzaxjkAduAzk:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_uvhijsplaEEmlke
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_uvhijsplaEEmlke
+
+.L_16_blocks_overflow_uvhijsplaEEmlke:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_uvhijsplaEEmlke:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_sArmCAuDwnDnahw
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_sArmCAuDwnDnahw
+.L_small_initial_partial_block_sArmCAuDwnDnahw:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_sArmCAuDwnDnahw:
+
+ orq %r8,%r8
+ je .L_after_reduction_sArmCAuDwnDnahw
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_sArmCAuDwnDnahw:
+ jmp .L_last_blocks_done_wmGtzaxjkAduAzk
+.L_last_num_blocks_is_13_wmGtzaxjkAduAzk:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_dCqAGwyhtFDDhuf
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_dCqAGwyhtFDDhuf
+
+.L_16_blocks_overflow_dCqAGwyhtFDDhuf:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_dCqAGwyhtFDDhuf:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %xmm29,%xmm5,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_AoFriGggjmCqdFe
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_AoFriGggjmCqdFe
+.L_small_initial_partial_block_AoFriGggjmCqdFe:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_AoFriGggjmCqdFe:
+
+ orq %r8,%r8
+ je .L_after_reduction_AoFriGggjmCqdFe
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_AoFriGggjmCqdFe:
+ jmp .L_last_blocks_done_wmGtzaxjkAduAzk
+.L_last_num_blocks_is_14_wmGtzaxjkAduAzk:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_eymtigzEympdfbq
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_eymtigzEympdfbq
+
+.L_16_blocks_overflow_eymtigzEympdfbq:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_eymtigzEympdfbq:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %ymm29,%ymm5,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_psAhdEAgnjgwhnp
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_psAhdEAgnjgwhnp
+.L_small_initial_partial_block_psAhdEAgnjgwhnp:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_psAhdEAgnjgwhnp:
+
+ orq %r8,%r8
+ je .L_after_reduction_psAhdEAgnjgwhnp
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_psAhdEAgnjgwhnp:
+ jmp .L_last_blocks_done_wmGtzaxjkAduAzk
+.L_last_num_blocks_is_15_wmGtzaxjkAduAzk:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_qGavfpFFnvaCwAd
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_qGavfpFFnvaCwAd
+
+.L_16_blocks_overflow_qGavfpFFnvaCwAd:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_qGavfpFFnvaCwAd:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_DBkpyuBbpopmDCv
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_DBkpyuBbpopmDCv
+.L_small_initial_partial_block_DBkpyuBbpopmDCv:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_DBkpyuBbpopmDCv:
+
+ orq %r8,%r8
+ je .L_after_reduction_DBkpyuBbpopmDCv
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_DBkpyuBbpopmDCv:
+ jmp .L_last_blocks_done_wmGtzaxjkAduAzk
+.L_last_num_blocks_is_16_wmGtzaxjkAduAzk:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_jfFqqEmsqrheBbh
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_jfFqqEmsqrheBbh
+
+.L_16_blocks_overflow_jfFqqEmsqrheBbh:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_jfFqqEmsqrheBbh:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_CEafoEfoaioCrtB:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_CEafoEfoaioCrtB:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_CEafoEfoaioCrtB:
+ jmp .L_last_blocks_done_wmGtzaxjkAduAzk
+.L_last_num_blocks_is_0_wmGtzaxjkAduAzk:
+ vmovdqa64 768(%rsp),%zmm13
+ vpxorq %zmm14,%zmm13,%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 832(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpxorq %zmm10,%zmm4,%zmm26
+ vpxorq %zmm6,%zmm0,%zmm24
+ vpxorq %zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 896(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 960(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_wmGtzaxjkAduAzk:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_tFbkipsuzBAeEGF
+
+.L_message_below_equal_16_blocks_tFbkipsuzBAeEGF:
+
+
+ movl %r8d,%r12d
+ addl $15,%r12d
+ shrl $4,%r12d
+ cmpq $8,%r12
+ je .L_small_initial_num_blocks_is_8_tpcppgjkDAAGbmz
+ jl .L_small_initial_num_blocks_is_7_1_tpcppgjkDAAGbmz
+
+
+ cmpq $12,%r12
+ je .L_small_initial_num_blocks_is_12_tpcppgjkDAAGbmz
+ jl .L_small_initial_num_blocks_is_11_9_tpcppgjkDAAGbmz
+
+
+ cmpq $16,%r12
+ je .L_small_initial_num_blocks_is_16_tpcppgjkDAAGbmz
+ cmpq $15,%r12
+ je .L_small_initial_num_blocks_is_15_tpcppgjkDAAGbmz
+ cmpq $14,%r12
+ je .L_small_initial_num_blocks_is_14_tpcppgjkDAAGbmz
+ jmp .L_small_initial_num_blocks_is_13_tpcppgjkDAAGbmz
+
+.L_small_initial_num_blocks_is_11_9_tpcppgjkDAAGbmz:
+
+ cmpq $11,%r12
+ je .L_small_initial_num_blocks_is_11_tpcppgjkDAAGbmz
+ cmpq $10,%r12
+ je .L_small_initial_num_blocks_is_10_tpcppgjkDAAGbmz
+ jmp .L_small_initial_num_blocks_is_9_tpcppgjkDAAGbmz
+
+.L_small_initial_num_blocks_is_7_1_tpcppgjkDAAGbmz:
+ cmpq $4,%r12
+ je .L_small_initial_num_blocks_is_4_tpcppgjkDAAGbmz
+ jl .L_small_initial_num_blocks_is_3_1_tpcppgjkDAAGbmz
+
+ cmpq $7,%r12
+ je .L_small_initial_num_blocks_is_7_tpcppgjkDAAGbmz
+ cmpq $6,%r12
+ je .L_small_initial_num_blocks_is_6_tpcppgjkDAAGbmz
+ jmp .L_small_initial_num_blocks_is_5_tpcppgjkDAAGbmz
+
+.L_small_initial_num_blocks_is_3_1_tpcppgjkDAAGbmz:
+
+ cmpq $3,%r12
+ je .L_small_initial_num_blocks_is_3_tpcppgjkDAAGbmz
+ cmpq $2,%r12
+ je .L_small_initial_num_blocks_is_2_tpcppgjkDAAGbmz
+
+
+
+
+
+.L_small_initial_num_blocks_is_1_tpcppgjkDAAGbmz:
+ vmovdqa64 SHUF_MASK(%rip),%xmm29
+ vpaddd ONE(%rip),%xmm2,%xmm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vpshufb %xmm29,%xmm0,%xmm0
+ vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %xmm15,%xmm0,%xmm0
+ vpxorq %xmm6,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %xmm29,%xmm0,%xmm6
+ vextracti32x4 $0,%zmm6,%xmm13
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_vkGpbehGialtrzj
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_vkGpbehGialtrzj
+.L_small_initial_partial_block_vkGpbehGialtrzj:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm13,%xmm14,%xmm14
+
+ jmp .L_after_reduction_vkGpbehGialtrzj
+.L_small_initial_compute_done_vkGpbehGialtrzj:
+.L_after_reduction_vkGpbehGialtrzj:
+ jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz
+.L_small_initial_num_blocks_is_2_tpcppgjkDAAGbmz:
+ vmovdqa64 SHUF_MASK(%rip),%ymm29
+ vshufi64x2 $0,%ymm2,%ymm2,%ymm0
+ vpaddd ddq_add_1234(%rip),%ymm0,%ymm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vpshufb %ymm29,%ymm0,%ymm0
+ vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %ymm15,%ymm0,%ymm0
+ vpxorq %ymm6,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %ymm29,%ymm0,%ymm6
+ vextracti32x4 $1,%zmm6,%xmm13
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_yrCuttqEucBxwFi
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_yrCuttqEucBxwFi
+.L_small_initial_partial_block_yrCuttqEucBxwFi:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_yrCuttqEucBxwFi:
+
+ orq %r8,%r8
+ je .L_after_reduction_yrCuttqEucBxwFi
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_yrCuttqEucBxwFi:
+ jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz
+.L_small_initial_num_blocks_is_3_tpcppgjkDAAGbmz:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vpxorq %zmm6,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vextracti32x4 $2,%zmm6,%xmm13
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_kgsCrgatEoGephk
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_kgsCrgatEoGephk
+.L_small_initial_partial_block_kgsCrgatEoGephk:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_kgsCrgatEoGephk:
+
+ orq %r8,%r8
+ je .L_after_reduction_kgsCrgatEoGephk
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_kgsCrgatEoGephk:
+ jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz
+.L_small_initial_num_blocks_is_4_tpcppgjkDAAGbmz:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vpxorq %zmm6,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vextracti32x4 $3,%zmm6,%xmm13
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_flxrhfiogcrnqye
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_flxrhfiogcrnqye
+.L_small_initial_partial_block_flxrhfiogcrnqye:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_flxrhfiogcrnqye:
+
+ orq %r8,%r8
+ je .L_after_reduction_flxrhfiogcrnqye
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_flxrhfiogcrnqye:
+ jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz
+.L_small_initial_num_blocks_is_5_tpcppgjkDAAGbmz:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %xmm15,%xmm3,%xmm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %xmm7,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %xmm29,%xmm3,%xmm7
+ vextracti32x4 $0,%zmm7,%xmm13
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_gFzmwxijGDfbEEt
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_gFzmwxijGDfbEEt
+.L_small_initial_partial_block_gFzmwxijGDfbEEt:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_gFzmwxijGDfbEEt:
+
+ orq %r8,%r8
+ je .L_after_reduction_gFzmwxijGDfbEEt
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_gFzmwxijGDfbEEt:
+ jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz
+.L_small_initial_num_blocks_is_6_tpcppgjkDAAGbmz:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %ymm15,%ymm3,%ymm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %ymm7,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %ymm29,%ymm3,%ymm7
+ vextracti32x4 $1,%zmm7,%xmm13
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ywvaiFFsGziikok
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ywvaiFFsGziikok
+.L_small_initial_partial_block_ywvaiFFsGziikok:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ywvaiFFsGziikok:
+
+ orq %r8,%r8
+ je .L_after_reduction_ywvaiFFsGziikok
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_ywvaiFFsGziikok:
+ jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz
+.L_small_initial_num_blocks_is_7_tpcppgjkDAAGbmz:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vextracti32x4 $2,%zmm7,%xmm13
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_vjjxFhBDbbgteCx
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_vjjxFhBDbbgteCx
+.L_small_initial_partial_block_vjjxFhBDbbgteCx:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_vjjxFhBDbbgteCx:
+
+ orq %r8,%r8
+ je .L_after_reduction_vjjxFhBDbbgteCx
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_vjjxFhBDbbgteCx:
+ jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz
+.L_small_initial_num_blocks_is_8_tpcppgjkDAAGbmz:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vextracti32x4 $3,%zmm7,%xmm13
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_jvbFniEeBiBFBmv
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_jvbFniEeBiBFBmv
+.L_small_initial_partial_block_jvbFniEeBiBFBmv:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_jvbFniEeBiBFBmv:
+
+ orq %r8,%r8
+ je .L_after_reduction_jvbFniEeBiBFBmv
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_jvbFniEeBiBFBmv:
+ jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz
+.L_small_initial_num_blocks_is_9_tpcppgjkDAAGbmz:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %xmm15,%xmm4,%xmm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %xmm10,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %xmm29,%xmm4,%xmm10
+ vextracti32x4 $0,%zmm10,%xmm13
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_zyfCoCjsyFFnpwn
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_zyfCoCjsyFFnpwn
+.L_small_initial_partial_block_zyfCoCjsyFFnpwn:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_zyfCoCjsyFFnpwn:
+
+ orq %r8,%r8
+ je .L_after_reduction_zyfCoCjsyFFnpwn
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_zyfCoCjsyFFnpwn:
+ jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz
+.L_small_initial_num_blocks_is_10_tpcppgjkDAAGbmz:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %ymm15,%ymm4,%ymm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %ymm10,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %ymm29,%ymm4,%ymm10
+ vextracti32x4 $1,%zmm10,%xmm13
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_GlGwjupayCEmAmk
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_GlGwjupayCEmAmk
+.L_small_initial_partial_block_GlGwjupayCEmAmk:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_GlGwjupayCEmAmk:
+
+ orq %r8,%r8
+ je .L_after_reduction_GlGwjupayCEmAmk
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_GlGwjupayCEmAmk:
+ jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz
+.L_small_initial_num_blocks_is_11_tpcppgjkDAAGbmz:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %zmm29,%zmm4,%zmm10
+ vextracti32x4 $2,%zmm10,%xmm13
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_AedaxoBdGfervsb
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_AedaxoBdGfervsb
+.L_small_initial_partial_block_AedaxoBdGfervsb:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_AedaxoBdGfervsb:
+
+ orq %r8,%r8
+ je .L_after_reduction_AedaxoBdGfervsb
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_AedaxoBdGfervsb:
+ jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz
+.L_small_initial_num_blocks_is_12_tpcppgjkDAAGbmz:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %zmm29,%zmm4,%zmm10
+ vextracti32x4 $3,%zmm10,%xmm13
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_zfkGparhhvDqahn
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 160(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_zfkGparhhvDqahn
+.L_small_initial_partial_block_zfkGparhhvDqahn:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_zfkGparhhvDqahn:
+
+ orq %r8,%r8
+ je .L_after_reduction_zfkGparhhvDqahn
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_zfkGparhhvDqahn:
+ jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz
+.L_small_initial_num_blocks_is_13_tpcppgjkDAAGbmz:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %xmm15,%xmm5,%xmm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %xmm11,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %zmm29,%zmm4,%zmm10
+ vpshufb %xmm29,%xmm5,%xmm11
+ vextracti32x4 $0,%zmm11,%xmm13
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_uDsrwxuwAvaluno
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 144(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_uDsrwxuwAvaluno
+.L_small_initial_partial_block_uDsrwxuwAvaluno:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 160(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_uDsrwxuwAvaluno:
+
+ orq %r8,%r8
+ je .L_after_reduction_uDsrwxuwAvaluno
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_uDsrwxuwAvaluno:
+ jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz
+.L_small_initial_num_blocks_is_14_tpcppgjkDAAGbmz:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %ymm15,%ymm5,%ymm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %ymm11,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %zmm29,%zmm4,%zmm10
+ vpshufb %ymm29,%ymm5,%ymm11
+ vextracti32x4 $1,%zmm11,%xmm13
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_awnsCplrcfgEbDA
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 128(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_awnsCplrcfgEbDA
+.L_small_initial_partial_block_awnsCplrcfgEbDA:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 144(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_awnsCplrcfgEbDA:
+
+ orq %r8,%r8
+ je .L_after_reduction_awnsCplrcfgEbDA
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_awnsCplrcfgEbDA:
+ jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz
+.L_small_initial_num_blocks_is_15_tpcppgjkDAAGbmz:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %zmm15,%zmm5,%zmm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %zmm11,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %zmm29,%zmm4,%zmm10
+ vpshufb %zmm29,%zmm5,%zmm11
+ vextracti32x4 $2,%zmm11,%xmm13
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_hgEBfdDtdFvGqjb
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 112(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_hgEBfdDtdFvGqjb
+.L_small_initial_partial_block_hgEBfdDtdFvGqjb:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 128(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_hgEBfdDtdFvGqjb:
+
+ orq %r8,%r8
+ je .L_after_reduction_hgEBfdDtdFvGqjb
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_hgEBfdDtdFvGqjb:
+ jmp .L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz
+.L_small_initial_num_blocks_is_16_tpcppgjkDAAGbmz:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %zmm15,%zmm5,%zmm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %zmm11,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %zmm29,%zmm4,%zmm10
+ vpshufb %zmm29,%zmm5,%zmm11
+ vextracti32x4 $3,%zmm11,%xmm13
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_mbufndcrlyapBCF:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 112(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_mbufndcrlyapBCF:
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_mbufndcrlyapBCF:
+.L_small_initial_blocks_encrypted_tpcppgjkDAAGbmz:
+.L_ghash_done_tFbkipsuzBAeEGF:
+ vmovdqu64 %xmm2,0(%rsi)
+ vmovdqu64 %xmm14,64(%rsi)
+.L_enc_dec_done_tFbkipsuzBAeEGF:
+ jmp .Lexit_gcm_encrypt
+.align 32
+.Laes_gcm_encrypt_256_avx512:
+ orq %r8,%r8
+ je .L_enc_dec_done_eawnuBpGmxcBoDC
+ xorq %r14,%r14
+ vmovdqu64 64(%rsi),%xmm14
+
+ movq (%rdx),%r11
+ orq %r11,%r11
+ je .L_partial_block_done_yomlCiqlqyhGbxA
+ movl $16,%r10d
+ leaq byte_len_to_mask_table(%rip),%r12
+ cmpq %r10,%r8
+ cmovcq %r8,%r10
+ kmovw (%r12,%r10,2),%k1
+ vmovdqu8 (%rcx),%xmm0{%k1}{z}
+
+ vmovdqu64 16(%rsi),%xmm3
+ vmovdqu64 336(%rsi),%xmm4
+
+
+
+ leaq SHIFT_MASK(%rip),%r12
+ addq %r11,%r12
+ vmovdqu64 (%r12),%xmm5
+ vpshufb %xmm5,%xmm3,%xmm3
+ vpxorq %xmm0,%xmm3,%xmm3
+
+
+ leaq (%r8,%r11,1),%r13
+ subq $16,%r13
+ jge .L_no_extra_mask_yomlCiqlqyhGbxA
+ subq %r13,%r12
+.L_no_extra_mask_yomlCiqlqyhGbxA:
+
+
+
+ vmovdqu64 16(%r12),%xmm0
+ vpand %xmm0,%xmm3,%xmm3
+ vpshufb SHUF_MASK(%rip),%xmm3,%xmm3
+ vpshufb %xmm5,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm14,%xmm14
+ cmpq $0,%r13
+ jl .L_partial_incomplete_yomlCiqlqyhGbxA
+
+ vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7
+ vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10
+ vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11
+ vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14
+ vpxorq %xmm11,%xmm14,%xmm14
+
+ vpsrldq $8,%xmm14,%xmm11
+ vpslldq $8,%xmm14,%xmm14
+ vpxorq %xmm11,%xmm7,%xmm7
+ vpxorq %xmm10,%xmm14,%xmm14
+
+
+
+ vmovdqu64 POLY2(%rip),%xmm11
+
+ vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10
+ vpslldq $8,%xmm10,%xmm10
+ vpxorq %xmm10,%xmm14,%xmm14
+
+
+
+ vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10
+ vpsrldq $4,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+
+ vpternlogq $0x96,%xmm10,%xmm7,%xmm14
+
+ movq $0,(%rdx)
+
+ movq %r11,%r12
+ movq $16,%r11
+ subq %r12,%r11
+ jmp .L_enc_dec_done_yomlCiqlqyhGbxA
+
+.L_partial_incomplete_yomlCiqlqyhGbxA:
+ addq %r8,(%rdx)
+ movq %r8,%r11
+
+.L_enc_dec_done_yomlCiqlqyhGbxA:
+
+
+ leaq byte_len_to_mask_table(%rip),%r12
+ kmovw (%r12,%r11,2),%k1
+ vmovdqu64 %xmm14,64(%rsi)
+
+ vpshufb SHUF_MASK(%rip),%xmm3,%xmm3
+ vpshufb %xmm5,%xmm3,%xmm3
+ movq %r9,%r12
+ vmovdqu8 %xmm3,(%r12){%k1}
+.L_partial_block_done_yomlCiqlqyhGbxA:
+ vmovdqu64 0(%rsi),%xmm2
+ subq %r11,%r8
+ je .L_enc_dec_done_eawnuBpGmxcBoDC
+ cmpq $256,%r8
+ jbe .L_message_below_equal_16_blocks_eawnuBpGmxcBoDC
+
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vmovdqa64 ddq_addbe_4444(%rip),%zmm27
+ vmovdqa64 ddq_addbe_1234(%rip),%zmm28
+
+
+
+
+
+
+ vmovd %xmm2,%r15d
+ andl $255,%r15d
+
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpshufb %zmm29,%zmm2,%zmm2
+
+
+
+ cmpb $240,%r15b
+ jae .L_next_16_overflow_iqGewgDgqvuhkra
+ vpaddd %zmm28,%zmm2,%zmm7
+ vpaddd %zmm27,%zmm7,%zmm10
+ vpaddd %zmm27,%zmm10,%zmm11
+ vpaddd %zmm27,%zmm11,%zmm12
+ jmp .L_next_16_ok_iqGewgDgqvuhkra
+.L_next_16_overflow_iqGewgDgqvuhkra:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm12
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
+ vpaddd %zmm12,%zmm7,%zmm10
+ vpaddd %zmm12,%zmm10,%zmm11
+ vpaddd %zmm12,%zmm11,%zmm12
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vpshufb %zmm29,%zmm12,%zmm12
+.L_next_16_ok_iqGewgDgqvuhkra:
+ vshufi64x2 $255,%zmm12,%zmm12,%zmm2
+ addb $16,%r15b
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm0
+ vmovdqu8 64(%rcx,%r11,1),%zmm3
+ vmovdqu8 128(%rcx,%r11,1),%zmm4
+ vmovdqu8 192(%rcx,%r11,1),%zmm5
+
+
+ vbroadcastf64x2 0(%rdi),%zmm6
+ vpxorq %zmm6,%zmm7,%zmm7
+ vpxorq %zmm6,%zmm10,%zmm10
+ vpxorq %zmm6,%zmm11,%zmm11
+ vpxorq %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 16(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 32(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 48(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 64(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 80(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 96(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 112(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 128(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 144(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 160(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 176(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 192(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 208(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 224(%rdi),%zmm6
+ vaesenclast %zmm6,%zmm7,%zmm7
+ vaesenclast %zmm6,%zmm10,%zmm10
+ vaesenclast %zmm6,%zmm11,%zmm11
+ vaesenclast %zmm6,%zmm12,%zmm12
+
+
+ vpxorq %zmm0,%zmm7,%zmm7
+ vpxorq %zmm3,%zmm10,%zmm10
+ vpxorq %zmm4,%zmm11,%zmm11
+ vpxorq %zmm5,%zmm12,%zmm12
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm7,0(%r10,%r11,1)
+ vmovdqu8 %zmm10,64(%r10,%r11,1)
+ vmovdqu8 %zmm11,128(%r10,%r11,1)
+ vmovdqu8 %zmm12,192(%r10,%r11,1)
+
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vpshufb %zmm29,%zmm12,%zmm12
+ vmovdqa64 %zmm7,768(%rsp)
+ vmovdqa64 %zmm10,832(%rsp)
+ vmovdqa64 %zmm11,896(%rsp)
+ vmovdqa64 %zmm12,960(%rsp)
+ testq %r14,%r14
+ jnz .L_skip_hkeys_precomputation_alwniGiGuuwbdou
+
+ vmovdqu64 288(%rsi),%zmm0
+ vmovdqu64 %zmm0,704(%rsp)
+
+ vmovdqu64 224(%rsi),%zmm3
+ vmovdqu64 %zmm3,640(%rsp)
+
+
+ vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
+
+ vmovdqu64 160(%rsi),%zmm4
+ vmovdqu64 %zmm4,576(%rsp)
+
+ vmovdqu64 96(%rsi),%zmm5
+ vmovdqu64 %zmm5,512(%rsp)
+.L_skip_hkeys_precomputation_alwniGiGuuwbdou:
+ cmpq $512,%r8
+ jb .L_message_below_32_blocks_eawnuBpGmxcBoDC
+
+
+
+ cmpb $240,%r15b
+ jae .L_next_16_overflow_wkhDhbijnuGGCmD
+ vpaddd %zmm28,%zmm2,%zmm7
+ vpaddd %zmm27,%zmm7,%zmm10
+ vpaddd %zmm27,%zmm10,%zmm11
+ vpaddd %zmm27,%zmm11,%zmm12
+ jmp .L_next_16_ok_wkhDhbijnuGGCmD
+.L_next_16_overflow_wkhDhbijnuGGCmD:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm12
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
+ vpaddd %zmm12,%zmm7,%zmm10
+ vpaddd %zmm12,%zmm10,%zmm11
+ vpaddd %zmm12,%zmm11,%zmm12
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vpshufb %zmm29,%zmm12,%zmm12
+.L_next_16_ok_wkhDhbijnuGGCmD:
+ vshufi64x2 $255,%zmm12,%zmm12,%zmm2
+ addb $16,%r15b
+
+ vmovdqu8 256(%rcx,%r11,1),%zmm0
+ vmovdqu8 320(%rcx,%r11,1),%zmm3
+ vmovdqu8 384(%rcx,%r11,1),%zmm4
+ vmovdqu8 448(%rcx,%r11,1),%zmm5
+
+
+ vbroadcastf64x2 0(%rdi),%zmm6
+ vpxorq %zmm6,%zmm7,%zmm7
+ vpxorq %zmm6,%zmm10,%zmm10
+ vpxorq %zmm6,%zmm11,%zmm11
+ vpxorq %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 16(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 32(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 48(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 64(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 80(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 96(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 112(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 128(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 144(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 160(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 176(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 192(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 208(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 224(%rdi),%zmm6
+ vaesenclast %zmm6,%zmm7,%zmm7
+ vaesenclast %zmm6,%zmm10,%zmm10
+ vaesenclast %zmm6,%zmm11,%zmm11
+ vaesenclast %zmm6,%zmm12,%zmm12
+
+
+ vpxorq %zmm0,%zmm7,%zmm7
+ vpxorq %zmm3,%zmm10,%zmm10
+ vpxorq %zmm4,%zmm11,%zmm11
+ vpxorq %zmm5,%zmm12,%zmm12
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm7,256(%r10,%r11,1)
+ vmovdqu8 %zmm10,320(%r10,%r11,1)
+ vmovdqu8 %zmm11,384(%r10,%r11,1)
+ vmovdqu8 %zmm12,448(%r10,%r11,1)
+
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vpshufb %zmm29,%zmm12,%zmm12
+ vmovdqa64 %zmm7,1024(%rsp)
+ vmovdqa64 %zmm10,1088(%rsp)
+ vmovdqa64 %zmm11,1152(%rsp)
+ vmovdqa64 %zmm12,1216(%rsp)
+ testq %r14,%r14
+ jnz .L_skip_hkeys_precomputation_xuEcimfukbaBqDu
+ vmovdqu64 640(%rsp),%zmm3
+
+
+ vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
+
+ vmovdqu64 576(%rsp),%zmm4
+ vmovdqu64 512(%rsp),%zmm5
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,448(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,384(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,320(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,256(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,192(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,128(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,64(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,0(%rsp)
+.L_skip_hkeys_precomputation_xuEcimfukbaBqDu:
+ movq $1,%r14
+ addq $512,%r11
+ subq $512,%r8
+
+ cmpq $768,%r8
+ jb .L_no_more_big_nblocks_eawnuBpGmxcBoDC
+.L_encrypt_big_nblocks_eawnuBpGmxcBoDC:
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_hsjyfxApibhdaao
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_hsjyfxApibhdaao
+.L_16_blocks_overflow_hsjyfxApibhdaao:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_hsjyfxApibhdaao:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm15,%zmm10,%zmm26
+ vpxorq %zmm12,%zmm6,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1)
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqa64 %zmm0,1280(%rsp)
+ vmovdqa64 %zmm3,1344(%rsp)
+ vmovdqa64 %zmm4,1408(%rsp)
+ vmovdqa64 %zmm5,1472(%rsp)
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_FyafAtAzhgGauwk
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_FyafAtAzhgGauwk
+.L_16_blocks_overflow_FyafAtAzhgGauwk:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_FyafAtAzhgGauwk:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 256(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 320(%rsp),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 384(%rsp),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 448(%rsp),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 256(%rcx,%r11,1),%zmm17
+ vmovdqu8 320(%rcx,%r11,1),%zmm19
+ vmovdqu8 384(%rcx,%r11,1),%zmm20
+ vmovdqu8 448(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vpternlogq $0x96,%zmm12,%zmm6,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,256(%r10,%r11,1)
+ vmovdqu8 %zmm3,320(%r10,%r11,1)
+ vmovdqu8 %zmm4,384(%r10,%r11,1)
+ vmovdqu8 %zmm5,448(%r10,%r11,1)
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqa64 %zmm0,768(%rsp)
+ vmovdqa64 %zmm3,832(%rsp)
+ vmovdqa64 %zmm4,896(%rsp)
+ vmovdqa64 %zmm5,960(%rsp)
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_mshygnywvbAbxuk
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_mshygnywvbAbxuk
+.L_16_blocks_overflow_mshygnywvbAbxuk:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_mshygnywvbAbxuk:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 512(%rcx,%r11,1),%zmm17
+ vmovdqu8 576(%rcx,%r11,1),%zmm19
+ vmovdqu8 640(%rcx,%r11,1),%zmm20
+ vmovdqu8 704(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+
+
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpternlogq $0x96,%zmm15,%zmm12,%zmm6
+ vpxorq %zmm24,%zmm6,%zmm6
+ vpternlogq $0x96,%zmm10,%zmm13,%zmm7
+ vpxorq %zmm25,%zmm7,%zmm7
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vextracti64x4 $1,%zmm6,%ymm12
+ vpxorq %ymm12,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm12
+ vpxorq %xmm12,%xmm6,%xmm6
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm6
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,512(%r10,%r11,1)
+ vmovdqu8 %zmm3,576(%r10,%r11,1)
+ vmovdqu8 %zmm4,640(%r10,%r11,1)
+ vmovdqu8 %zmm5,704(%r10,%r11,1)
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqa64 %zmm0,1024(%rsp)
+ vmovdqa64 %zmm3,1088(%rsp)
+ vmovdqa64 %zmm4,1152(%rsp)
+ vmovdqa64 %zmm5,1216(%rsp)
+ vmovdqa64 %zmm6,%zmm14
+
+ addq $768,%r11
+ subq $768,%r8
+ cmpq $768,%r8
+ jae .L_encrypt_big_nblocks_eawnuBpGmxcBoDC
+
+.L_no_more_big_nblocks_eawnuBpGmxcBoDC:
+
+ cmpq $512,%r8
+ jae .L_encrypt_32_blocks_eawnuBpGmxcBoDC
+
+ cmpq $256,%r8
+ jae .L_encrypt_16_blocks_eawnuBpGmxcBoDC
+.L_encrypt_0_blocks_ghash_32_eawnuBpGmxcBoDC:
+ movl %r8d,%r10d
+ andl $~15,%r10d
+ movl $256,%ebx
+ subl %r10d,%ebx
+ vmovdqa64 768(%rsp),%zmm13
+ vpxorq %zmm14,%zmm13,%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 832(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpxorq %zmm10,%zmm4,%zmm26
+ vpxorq %zmm6,%zmm0,%zmm24
+ vpxorq %zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 896(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 960(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ addl $256,%ebx
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_CAikcjdGDugFfth
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_CAikcjdGDugFfth
+ jb .L_last_num_blocks_is_7_1_CAikcjdGDugFfth
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_CAikcjdGDugFfth
+ jb .L_last_num_blocks_is_11_9_CAikcjdGDugFfth
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_CAikcjdGDugFfth
+ ja .L_last_num_blocks_is_16_CAikcjdGDugFfth
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_CAikcjdGDugFfth
+ jmp .L_last_num_blocks_is_13_CAikcjdGDugFfth
+
+.L_last_num_blocks_is_11_9_CAikcjdGDugFfth:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_CAikcjdGDugFfth
+ ja .L_last_num_blocks_is_11_CAikcjdGDugFfth
+ jmp .L_last_num_blocks_is_9_CAikcjdGDugFfth
+
+.L_last_num_blocks_is_7_1_CAikcjdGDugFfth:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_CAikcjdGDugFfth
+ jb .L_last_num_blocks_is_3_1_CAikcjdGDugFfth
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_CAikcjdGDugFfth
+ je .L_last_num_blocks_is_6_CAikcjdGDugFfth
+ jmp .L_last_num_blocks_is_5_CAikcjdGDugFfth
+
+.L_last_num_blocks_is_3_1_CAikcjdGDugFfth:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_CAikcjdGDugFfth
+ je .L_last_num_blocks_is_2_CAikcjdGDugFfth
+.L_last_num_blocks_is_1_CAikcjdGDugFfth:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_xFvljgxvqrrjiEx
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_xFvljgxvqrrjiEx
+
+.L_16_blocks_overflow_xFvljgxvqrrjiEx:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_xFvljgxvqrrjiEx:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %xmm29,%xmm0,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_qxurhxfinuxAakr
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_qxurhxfinuxAakr
+.L_small_initial_partial_block_qxurhxfinuxAakr:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm0
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
+ vpslldq $8,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm3
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
+ vpsrldq $4,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm4,%xmm14
+
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_qxurhxfinuxAakr
+.L_small_initial_compute_done_qxurhxfinuxAakr:
+.L_after_reduction_qxurhxfinuxAakr:
+ jmp .L_last_blocks_done_CAikcjdGDugFfth
+.L_last_num_blocks_is_2_CAikcjdGDugFfth:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_jkwkgdBwnfqtmoz
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_jkwkgdBwnfqtmoz
+
+.L_16_blocks_overflow_jkwkgdBwnfqtmoz:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_jkwkgdBwnfqtmoz:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %ymm29,%ymm0,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_FuEgfclAfodbltt
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_FuEgfclAfodbltt
+.L_small_initial_partial_block_FuEgfclAfodbltt:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_FuEgfclAfodbltt:
+
+ orq %r8,%r8
+ je .L_after_reduction_FuEgfclAfodbltt
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_FuEgfclAfodbltt:
+ jmp .L_last_blocks_done_CAikcjdGDugFfth
+.L_last_num_blocks_is_3_CAikcjdGDugFfth:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_rlpicECjalEogkA
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_rlpicECjalEogkA
+
+.L_16_blocks_overflow_rlpicECjalEogkA:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_rlpicECjalEogkA:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_CuzDDhbEvttwEEk
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_CuzDDhbEvttwEEk
+.L_small_initial_partial_block_CuzDDhbEvttwEEk:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_CuzDDhbEvttwEEk:
+
+ orq %r8,%r8
+ je .L_after_reduction_CuzDDhbEvttwEEk
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_CuzDDhbEvttwEEk:
+ jmp .L_last_blocks_done_CAikcjdGDugFfth
+.L_last_num_blocks_is_4_CAikcjdGDugFfth:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_gqkAClvbnegzAmA
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_gqkAClvbnegzAmA
+
+.L_16_blocks_overflow_gqkAClvbnegzAmA:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_gqkAClvbnegzAmA:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_xcnzwhtrnbgDqfy
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_xcnzwhtrnbgDqfy
+.L_small_initial_partial_block_xcnzwhtrnbgDqfy:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_xcnzwhtrnbgDqfy:
+
+ orq %r8,%r8
+ je .L_after_reduction_xcnzwhtrnbgDqfy
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_xcnzwhtrnbgDqfy:
+ jmp .L_last_blocks_done_CAikcjdGDugFfth
+.L_last_num_blocks_is_5_CAikcjdGDugFfth:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_FklAbbifjuDAcpD
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_FklAbbifjuDAcpD
+
+.L_16_blocks_overflow_FklAbbifjuDAcpD:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_FklAbbifjuDAcpD:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %xmm29,%xmm3,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_oxoctmohDgCBefA
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_oxoctmohDgCBefA
+.L_small_initial_partial_block_oxoctmohDgCBefA:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_oxoctmohDgCBefA:
+
+ orq %r8,%r8
+ je .L_after_reduction_oxoctmohDgCBefA
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_oxoctmohDgCBefA:
+ jmp .L_last_blocks_done_CAikcjdGDugFfth
+.L_last_num_blocks_is_6_CAikcjdGDugFfth:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_odCCAydbBFAapzd
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_odCCAydbBFAapzd
+
+.L_16_blocks_overflow_odCCAydbBFAapzd:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_odCCAydbBFAapzd:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %ymm29,%ymm3,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_qlwikcksldoilrG
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_qlwikcksldoilrG
+.L_small_initial_partial_block_qlwikcksldoilrG:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_qlwikcksldoilrG:
+
+ orq %r8,%r8
+ je .L_after_reduction_qlwikcksldoilrG
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_qlwikcksldoilrG:
+ jmp .L_last_blocks_done_CAikcjdGDugFfth
+.L_last_num_blocks_is_7_CAikcjdGDugFfth:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_mjwDlmhvzElddng
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_mjwDlmhvzElddng
+
+.L_16_blocks_overflow_mjwDlmhvzElddng:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_mjwDlmhvzElddng:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_unqgfDFcvabkGta
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_unqgfDFcvabkGta
+.L_small_initial_partial_block_unqgfDFcvabkGta:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_unqgfDFcvabkGta:
+
+ orq %r8,%r8
+ je .L_after_reduction_unqgfDFcvabkGta
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_unqgfDFcvabkGta:
+ jmp .L_last_blocks_done_CAikcjdGDugFfth
+.L_last_num_blocks_is_8_CAikcjdGDugFfth:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_EinBcyEEyChknsj
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_EinBcyEEyChknsj
+
+.L_16_blocks_overflow_EinBcyEEyChknsj:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_EinBcyEEyChknsj:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ejuhaaqjamhcjqF
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ejuhaaqjamhcjqF
+.L_small_initial_partial_block_ejuhaaqjamhcjqF:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ejuhaaqjamhcjqF:
+
+ orq %r8,%r8
+ je .L_after_reduction_ejuhaaqjamhcjqF
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ejuhaaqjamhcjqF:
+ jmp .L_last_blocks_done_CAikcjdGDugFfth
+.L_last_num_blocks_is_9_CAikcjdGDugFfth:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_mhxEmCxxjyDqdDo
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_mhxEmCxxjyDqdDo
+
+.L_16_blocks_overflow_mhxEmCxxjyDqdDo:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_mhxEmCxxjyDqdDo:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %xmm29,%xmm4,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_zdofzxhsAexptkx
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_zdofzxhsAexptkx
+.L_small_initial_partial_block_zdofzxhsAexptkx:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_zdofzxhsAexptkx:
+
+ orq %r8,%r8
+ je .L_after_reduction_zdofzxhsAexptkx
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_zdofzxhsAexptkx:
+ jmp .L_last_blocks_done_CAikcjdGDugFfth
+.L_last_num_blocks_is_10_CAikcjdGDugFfth:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_rvskGvkumwEhhsc
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_rvskGvkumwEhhsc
+
+.L_16_blocks_overflow_rvskGvkumwEhhsc:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_rvskGvkumwEhhsc:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %ymm29,%ymm4,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_gngjmGDkBquyveG
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_gngjmGDkBquyveG
+.L_small_initial_partial_block_gngjmGDkBquyveG:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_gngjmGDkBquyveG:
+
+ orq %r8,%r8
+ je .L_after_reduction_gngjmGDkBquyveG
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_gngjmGDkBquyveG:
+ jmp .L_last_blocks_done_CAikcjdGDugFfth
+.L_last_num_blocks_is_11_CAikcjdGDugFfth:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_Dtnnktpbavbarsp
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_Dtnnktpbavbarsp
+
+.L_16_blocks_overflow_Dtnnktpbavbarsp:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_Dtnnktpbavbarsp:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_xfvylkhgAonGlpn
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_xfvylkhgAonGlpn
+.L_small_initial_partial_block_xfvylkhgAonGlpn:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_xfvylkhgAonGlpn:
+
+ orq %r8,%r8
+ je .L_after_reduction_xfvylkhgAonGlpn
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_xfvylkhgAonGlpn:
+ jmp .L_last_blocks_done_CAikcjdGDugFfth
+.L_last_num_blocks_is_12_CAikcjdGDugFfth:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_bpklztjgEEdhFxz
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_bpklztjgEEdhFxz
+
+.L_16_blocks_overflow_bpklztjgEEdhFxz:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_bpklztjgEEdhFxz:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_dgtbwzqgvnDyDmt
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_dgtbwzqgvnDyDmt
+.L_small_initial_partial_block_dgtbwzqgvnDyDmt:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_dgtbwzqgvnDyDmt:
+
+ orq %r8,%r8
+ je .L_after_reduction_dgtbwzqgvnDyDmt
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_dgtbwzqgvnDyDmt:
+ jmp .L_last_blocks_done_CAikcjdGDugFfth
+.L_last_num_blocks_is_13_CAikcjdGDugFfth:
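+ /* Tail: 13 counter blocks remain (three full ZMM groups plus one XMM block).
+ The final 16-byte block is loaded and stored through mask %k1 since it may be
+ partial; the AES rounds are interleaved with the GHASH of the 16 ciphertext
+ blocks saved on the stack by the previous pass. */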
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_BBkhDhGlvcaehas
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_BBkhDhGlvcaehas
+
+.L_16_blocks_overflow_BBkhDhGlvcaehas:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_BBkhDhGlvcaehas:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %xmm29,%xmm5,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_euhapEbhfhxemzw
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_euhapEbhfhxemzw
+.L_small_initial_partial_block_euhapEbhfhxemzw:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_euhapEbhfhxemzw:
+
+ orq %r8,%r8
+ je .L_after_reduction_euhapEbhfhxemzw
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_euhapEbhfhxemzw:
+ jmp .L_last_blocks_done_CAikcjdGDugFfth
+.L_last_num_blocks_is_14_CAikcjdGDugFfth:
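+ /* Tail: 14 counter blocks remain (three full ZMM groups plus one YMM pair);
+ the final YMM load/store goes through mask %k1 for a possibly partial last
+ block. */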
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_wFmlAewyxkiABzu
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_wFmlAewyxkiABzu
+
+.L_16_blocks_overflow_wFmlAewyxkiABzu:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_wFmlAewyxkiABzu:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %ymm29,%ymm5,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_xleiaowmorzhxfq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_xleiaowmorzhxfq
+.L_small_initial_partial_block_xleiaowmorzhxfq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_xleiaowmorzhxfq:
+
+ orq %r8,%r8
+ je .L_after_reduction_xleiaowmorzhxfq
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_xleiaowmorzhxfq:
+ jmp .L_last_blocks_done_CAikcjdGDugFfth
+.L_last_num_blocks_is_15_CAikcjdGDugFfth:
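+ /* Tail: 15 counter blocks remain (three full ZMM groups plus a masked fourth
+ group of three blocks); the fourth group is loaded and stored through %k1. */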
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_fwmFnlmCbhngvtq
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_fwmFnlmCbhngvtq
+
+.L_16_blocks_overflow_fwmFnlmCbhngvtq:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_fwmFnlmCbhngvtq:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_Cwwewmiesghaixp
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_Cwwewmiesghaixp
+.L_small_initial_partial_block_Cwwewmiesghaixp:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_Cwwewmiesghaixp:
+
+ orq %r8,%r8
+ je .L_after_reduction_Cwwewmiesghaixp
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_Cwwewmiesghaixp:
+ jmp .L_last_blocks_done_CAikcjdGDugFfth
+.L_last_num_blocks_is_16_CAikcjdGDugFfth:
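+ /* Tail: 16 counter blocks remain; the fourth ZMM group is masked by %k1
+ because the 16th block may be partial. */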
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_xEdGzjmGszadGFy
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_xEdGzjmGszadGFy
+
+.L_16_blocks_overflow_xEdGzjmGszadGFy:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_xEdGzjmGszadGFy:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_fphazgGgmEuxiEi:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_fphazgGgmEuxiEi:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_fphazgGgmEuxiEi:
+ jmp .L_last_blocks_done_CAikcjdGDugFfth
+.L_last_num_blocks_is_0_CAikcjdGDugFfth:
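+ /* Tail: no whole blocks remain; only the deferred GHASH of the 16 ciphertext
+ blocks saved on the stack is computed and reduced into %xmm14. */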
+ vmovdqa64 1024(%rsp),%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1088(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1152(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1216(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
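+ /* Fold the four accumulator lanes together and reduce modulo the GHASH
+ polynomial (POLY2) down to the 128-bit running tag in %xmm14. */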
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_CAikcjdGDugFfth:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_eawnuBpGmxcBoDC
+.L_encrypt_32_blocks_eawnuBpGmxcBoDC:
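+ /* Full 512-byte chunk: two passes of 16 counter blocks each. Each pass checks
+ whether the low counter byte would wrap, runs the AES rounds with round keys
+ broadcast from the schedule, folds in the GHASH of previously saved ciphertext,
+ and parks the byte-reflected new ciphertext on the stack for later hashing. */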
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_fxEfrxCahjuywkw
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_fxEfrxCahjuywkw
+.L_16_blocks_overflow_fxEfrxCahjuywkw:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_fxEfrxCahjuywkw:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm15,%zmm10,%zmm26
+ vpxorq %zmm12,%zmm6,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1)
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqa64 %zmm0,1280(%rsp)
+ vmovdqa64 %zmm3,1344(%rsp)
+ vmovdqa64 %zmm4,1408(%rsp)
+ vmovdqa64 %zmm5,1472(%rsp)
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_iwxfgjgfFyEczhg
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_iwxfgjgfFyEczhg
+.L_16_blocks_overflow_iwxfgjgfFyEczhg:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_iwxfgjgfFyEczhg:
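+ /* Second 16-block pass of the 32-block body; its GHASH operands come from the
+ other half of the stack save area. */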
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 256(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 320(%rsp),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 384(%rsp),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 448(%rsp),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 256(%rcx,%r11,1),%zmm17
+ vmovdqu8 320(%rcx,%r11,1),%zmm19
+ vmovdqu8 384(%rcx,%r11,1),%zmm20
+ vmovdqu8 448(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vpternlogq $0x96,%zmm12,%zmm6,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,256(%r10,%r11,1)
+ vmovdqu8 %zmm3,320(%r10,%r11,1)
+ vmovdqu8 %zmm4,384(%r10,%r11,1)
+ vmovdqu8 %zmm5,448(%r10,%r11,1)
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqa64 %zmm0,768(%rsp)
+ vmovdqa64 %zmm3,832(%rsp)
+ vmovdqa64 %zmm4,896(%rsp)
+ vmovdqa64 %zmm5,960(%rsp)
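+ /* GHASH the 16 ciphertext blocks saved by the first pass and reduce the
+ accumulators to the running tag in %xmm14 before dispatching the tail. */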
+ vmovdqa64 1280(%rsp),%zmm13
+ vmovdqu64 512(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1344(%rsp),%zmm13
+ vmovdqu64 576(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1408(%rsp),%zmm13
+ vmovdqu64 640(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1472(%rsp),%zmm13
+ vmovdqu64 704(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
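+ /* 512 bytes consumed: advance the data offset, recompute the remaining
+ length, and branch below to the handler for the number of leftover 16-byte
+ blocks. */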
+ subq $512,%r8
+ addq $512,%r11
+ movl %r8d,%r10d
+ andl $~15,%r10d
+ movl $512,%ebx
+ subl %r10d,%ebx
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_muvbsvrgtnhDwuC
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_muvbsvrgtnhDwuC
+ jb .L_last_num_blocks_is_7_1_muvbsvrgtnhDwuC
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_muvbsvrgtnhDwuC
+ jb .L_last_num_blocks_is_11_9_muvbsvrgtnhDwuC
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_muvbsvrgtnhDwuC
+ ja .L_last_num_blocks_is_16_muvbsvrgtnhDwuC
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_muvbsvrgtnhDwuC
+ jmp .L_last_num_blocks_is_13_muvbsvrgtnhDwuC
+
+.L_last_num_blocks_is_11_9_muvbsvrgtnhDwuC:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_muvbsvrgtnhDwuC
+ ja .L_last_num_blocks_is_11_muvbsvrgtnhDwuC
+ jmp .L_last_num_blocks_is_9_muvbsvrgtnhDwuC
+
+.L_last_num_blocks_is_7_1_muvbsvrgtnhDwuC:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_muvbsvrgtnhDwuC
+ jb .L_last_num_blocks_is_3_1_muvbsvrgtnhDwuC
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_muvbsvrgtnhDwuC
+ je .L_last_num_blocks_is_6_muvbsvrgtnhDwuC
+ jmp .L_last_num_blocks_is_5_muvbsvrgtnhDwuC
+
+.L_last_num_blocks_is_3_1_muvbsvrgtnhDwuC:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_muvbsvrgtnhDwuC
+ je .L_last_num_blocks_is_2_muvbsvrgtnhDwuC
+.L_last_num_blocks_is_1_muvbsvrgtnhDwuC:
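+ /* Tail: a single (possibly partial) block, encrypted in %xmm0 with the
+ load/store masked by %k1. */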
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_sCioAEgxkAkBsms
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_sCioAEgxkAkBsms
+
+.L_16_blocks_overflow_sCioAEgxkAkBsms:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_sCioAEgxkAkBsms:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %xmm29,%xmm0,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_iuEEnvAblnyuBEp
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_iuEEnvAblnyuBEp
+.L_small_initial_partial_block_iuEEnvAblnyuBEp:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm0
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
+ vpslldq $8,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm3
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
+ vpsrldq $4,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm4,%xmm14
+
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_iuEEnvAblnyuBEp
+.L_small_initial_compute_done_iuEEnvAblnyuBEp:
+.L_after_reduction_iuEEnvAblnyuBEp:
+ jmp .L_last_blocks_done_muvbsvrgtnhDwuC
+.L_last_num_blocks_is_2_muvbsvrgtnhDwuC:
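+ /* Tail: two blocks handled in %ymm0; the second block may be partial, so the
+ load/store is masked by %k1. */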
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_syraAlmuhpzefuz
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_syraAlmuhpzefuz
+
+.L_16_blocks_overflow_syraAlmuhpzefuz:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_syraAlmuhpzefuz:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %ymm29,%ymm0,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_wklxqcsAiCzEeze
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_wklxqcsAiCzEeze
+.L_small_initial_partial_block_wklxqcsAiCzEeze:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_wklxqcsAiCzEeze:
+
+ orq %r8,%r8
+ je .L_after_reduction_wklxqcsAiCzEeze
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_wklxqcsAiCzEeze:
+ jmp .L_last_blocks_done_muvbsvrgtnhDwuC
+.L_last_num_blocks_is_3_muvbsvrgtnhDwuC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_iccrdFDrrokpmyB
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_iccrdFDrrokpmyB
+
+.L_16_blocks_overflow_iccrdFDrrokpmyB:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_iccrdFDrrokpmyB:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ohaugBufhhdgdDo
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ohaugBufhhdgdDo
+.L_small_initial_partial_block_ohaugBufhhdgdDo:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ohaugBufhhdgdDo:
+
+ orq %r8,%r8
+ je .L_after_reduction_ohaugBufhhdgdDo
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ohaugBufhhdgdDo:
+ jmp .L_last_blocks_done_muvbsvrgtnhDwuC
+.L_last_num_blocks_is_4_muvbsvrgtnhDwuC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_jkieEplbtgwkEgk
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_jkieEplbtgwkEgk
+
+.L_16_blocks_overflow_jkieEplbtgwkEgk:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_jkieEplbtgwkEgk:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_omkzepGnFhlDsok
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_omkzepGnFhlDsok
+.L_small_initial_partial_block_omkzepGnFhlDsok:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_omkzepGnFhlDsok:
+
+ orq %r8,%r8
+ je .L_after_reduction_omkzepGnFhlDsok
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_omkzepGnFhlDsok:
+ jmp .L_last_blocks_done_muvbsvrgtnhDwuC
+.L_last_num_blocks_is_5_muvbsvrgtnhDwuC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_vtnqanBpwpcCkvb
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_vtnqanBpwpcCkvb
+
+.L_16_blocks_overflow_vtnqanBpwpcCkvb:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_vtnqanBpwpcCkvb:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %xmm29,%xmm3,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_DiateEzAgclciak
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_DiateEzAgclciak
+.L_small_initial_partial_block_DiateEzAgclciak:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_DiateEzAgclciak:
+
+ orq %r8,%r8
+ je .L_after_reduction_DiateEzAgclciak
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_DiateEzAgclciak:
+ jmp .L_last_blocks_done_muvbsvrgtnhDwuC
+.L_last_num_blocks_is_6_muvbsvrgtnhDwuC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_oakjAwsnClAznod
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_oakjAwsnClAznod
+
+.L_16_blocks_overflow_oakjAwsnClAznod:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_oakjAwsnClAznod:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %ymm29,%ymm3,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_oqCwqiEfmwxEduu
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_oqCwqiEfmwxEduu
+.L_small_initial_partial_block_oqCwqiEfmwxEduu:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_oqCwqiEfmwxEduu:
+
+ orq %r8,%r8
+ je .L_after_reduction_oqCwqiEfmwxEduu
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_oqCwqiEfmwxEduu:
+ jmp .L_last_blocks_done_muvbsvrgtnhDwuC
+.L_last_num_blocks_is_7_muvbsvrgtnhDwuC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_lhrubptnEwwxvoi
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_lhrubptnEwwxvoi
+
+.L_16_blocks_overflow_lhrubptnEwwxvoi:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_lhrubptnEwwxvoi:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_lyGDbaegdAnFgEy
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_lyGDbaegdAnFgEy
+.L_small_initial_partial_block_lyGDbaegdAnFgEy:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_lyGDbaegdAnFgEy:
+
+ orq %r8,%r8
+ je .L_after_reduction_lyGDbaegdAnFgEy
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_lyGDbaegdAnFgEy:
+ jmp .L_last_blocks_done_muvbsvrgtnhDwuC
+.L_last_num_blocks_is_8_muvbsvrgtnhDwuC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_umvkbciEsdgFrgg
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_umvkbciEsdgFrgg
+
+.L_16_blocks_overflow_umvkbciEsdgFrgg:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_umvkbciEsdgFrgg:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ogfGBxxhhoalgtB
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ogfGBxxhhoalgtB
+.L_small_initial_partial_block_ogfGBxxhhoalgtB:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ogfGBxxhhoalgtB:
+
+ orq %r8,%r8
+ je .L_after_reduction_ogfGBxxhhoalgtB
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ogfGBxxhhoalgtB:
+ jmp .L_last_blocks_done_muvbsvrgtnhDwuC
+.L_last_num_blocks_is_9_muvbsvrgtnhDwuC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_wFkatvuEtupbkGb
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_wFkatvuEtupbkGb
+
+.L_16_blocks_overflow_wFkatvuEtupbkGb:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_wFkatvuEtupbkGb:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %xmm29,%xmm4,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_wkiizpjcpbzfFyj
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_wkiizpjcpbzfFyj
+.L_small_initial_partial_block_wkiizpjcpbzfFyj:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_wkiizpjcpbzfFyj:
+
+ orq %r8,%r8
+ je .L_after_reduction_wkiizpjcpbzfFyj
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_wkiizpjcpbzfFyj:
+ jmp .L_last_blocks_done_muvbsvrgtnhDwuC
+.L_last_num_blocks_is_10_muvbsvrgtnhDwuC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_ircelvtBaeuiwvC
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_ircelvtBaeuiwvC
+
+.L_16_blocks_overflow_ircelvtBaeuiwvC:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_ircelvtBaeuiwvC:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %ymm29,%ymm4,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_pDtuuFvFlvjvrCz
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_pDtuuFvFlvjvrCz
+.L_small_initial_partial_block_pDtuuFvFlvjvrCz:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_pDtuuFvFlvjvrCz:
+
+ orq %r8,%r8
+ je .L_after_reduction_pDtuuFvFlvjvrCz
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_pDtuuFvFlvjvrCz:
+ jmp .L_last_blocks_done_muvbsvrgtnhDwuC
+.L_last_num_blocks_is_11_muvbsvrgtnhDwuC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_GozdsctAidzEqxd
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_GozdsctAidzEqxd
+
+.L_16_blocks_overflow_GozdsctAidzEqxd:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_GozdsctAidzEqxd:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_yrocgFvryFBiech
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_yrocgFvryFBiech
+.L_small_initial_partial_block_yrocgFvryFBiech:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_yrocgFvryFBiech:
+
+ orq %r8,%r8
+ je .L_after_reduction_yrocgFvryFBiech
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_yrocgFvryFBiech:
+ jmp .L_last_blocks_done_muvbsvrgtnhDwuC
+.L_last_num_blocks_is_12_muvbsvrgtnhDwuC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_kgvcyifhjuAglsm
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_kgvcyifhjuAglsm
+
+.L_16_blocks_overflow_kgvcyifhjuAglsm:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_kgvcyifhjuAglsm:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_oclBtelgDoBblti
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_oclBtelgDoBblti
+.L_small_initial_partial_block_oclBtelgDoBblti:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_oclBtelgDoBblti:
+
+ orq %r8,%r8
+ je .L_after_reduction_oclBtelgDoBblti
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_oclBtelgDoBblti:
+ jmp .L_last_blocks_done_muvbsvrgtnhDwuC
+.L_last_num_blocks_is_13_muvbsvrgtnhDwuC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_GgsgulfrbGGFGGc
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_GgsgulfrbGGFGGc
+
+.L_16_blocks_overflow_GgsgulfrbGGFGGc:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_GgsgulfrbGGFGGc:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %xmm29,%xmm5,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_bvEBvhpbxzwvDrk
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_bvEBvhpbxzwvDrk
+.L_small_initial_partial_block_bvEBvhpbxzwvDrk:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_bvEBvhpbxzwvDrk:
+
+ orq %r8,%r8
+ je .L_after_reduction_bvEBvhpbxzwvDrk
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_bvEBvhpbxzwvDrk:
+ jmp .L_last_blocks_done_muvbsvrgtnhDwuC
+.L_last_num_blocks_is_14_muvbsvrgtnhDwuC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_vejDBlGzdxbDGDE
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_vejDBlGzdxbDGDE
+
+.L_16_blocks_overflow_vejDBlGzdxbDGDE:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_vejDBlGzdxbDGDE:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %ymm29,%ymm5,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_lvCGeChuoEvfnul
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_lvCGeChuoEvfnul
+.L_small_initial_partial_block_lvCGeChuoEvfnul:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_lvCGeChuoEvfnul:
+
+ orq %r8,%r8
+ je .L_after_reduction_lvCGeChuoEvfnul
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_lvCGeChuoEvfnul:
+ jmp .L_last_blocks_done_muvbsvrgtnhDwuC
+.L_last_num_blocks_is_15_muvbsvrgtnhDwuC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_ytioEdspdkiwstn
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_ytioEdspdkiwstn
+
+.L_16_blocks_overflow_ytioEdspdkiwstn:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_ytioEdspdkiwstn:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_fxpoudCxsjlwBmb
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_fxpoudCxsjlwBmb
+.L_small_initial_partial_block_fxpoudCxsjlwBmb:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_fxpoudCxsjlwBmb:
+
+ orq %r8,%r8
+ je .L_after_reduction_fxpoudCxsjlwBmb
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_fxpoudCxsjlwBmb:
+ jmp .L_last_blocks_done_muvbsvrgtnhDwuC
+.L_last_num_blocks_is_16_muvbsvrgtnhDwuC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_ijwokgwDeCteCll
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_ijwokgwDeCteCll
+
+.L_16_blocks_overflow_ijwokgwDeCteCll:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_ijwokgwDeCteCll:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_rCCuFewyfDAEddb:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_rCCuFewyfDAEddb:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_rCCuFewyfDAEddb:
+ jmp .L_last_blocks_done_muvbsvrgtnhDwuC
+.L_last_num_blocks_is_0_muvbsvrgtnhDwuC:
+ vmovdqa64 768(%rsp),%zmm13
+ vpxorq %zmm14,%zmm13,%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 832(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpxorq %zmm10,%zmm4,%zmm26
+ vpxorq %zmm6,%zmm0,%zmm24
+ vpxorq %zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 896(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 960(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_muvbsvrgtnhDwuC:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_eawnuBpGmxcBoDC
+.L_encrypt_16_blocks_eawnuBpGmxcBoDC:
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_nAxplcgfimbFyBh
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_nAxplcgfimbFyBh
+.L_16_blocks_overflow_nAxplcgfimbFyBh:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_nAxplcgfimbFyBh:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm15,%zmm10,%zmm26
+ vpxorq %zmm12,%zmm6,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1)
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqa64 %zmm0,1280(%rsp)
+ vmovdqa64 %zmm3,1344(%rsp)
+ vmovdqa64 %zmm4,1408(%rsp)
+ vmovdqa64 %zmm5,1472(%rsp)
+ vmovdqa64 1024(%rsp),%zmm13
+ vmovdqu64 256(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1088(%rsp),%zmm13
+ vmovdqu64 320(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1152(%rsp),%zmm13
+ vmovdqu64 384(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1216(%rsp),%zmm13
+ vmovdqu64 448(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ subq $256,%r8
+ addq $256,%r11
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_gFFyhgntvwxgCvF
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_gFFyhgntvwxgCvF
+ jb .L_last_num_blocks_is_7_1_gFFyhgntvwxgCvF
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_gFFyhgntvwxgCvF
+ jb .L_last_num_blocks_is_11_9_gFFyhgntvwxgCvF
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_gFFyhgntvwxgCvF
+ ja .L_last_num_blocks_is_16_gFFyhgntvwxgCvF
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_gFFyhgntvwxgCvF
+ jmp .L_last_num_blocks_is_13_gFFyhgntvwxgCvF
+
+.L_last_num_blocks_is_11_9_gFFyhgntvwxgCvF:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_gFFyhgntvwxgCvF
+ ja .L_last_num_blocks_is_11_gFFyhgntvwxgCvF
+ jmp .L_last_num_blocks_is_9_gFFyhgntvwxgCvF
+
+.L_last_num_blocks_is_7_1_gFFyhgntvwxgCvF:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_gFFyhgntvwxgCvF
+ jb .L_last_num_blocks_is_3_1_gFFyhgntvwxgCvF
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_gFFyhgntvwxgCvF
+ je .L_last_num_blocks_is_6_gFFyhgntvwxgCvF
+ jmp .L_last_num_blocks_is_5_gFFyhgntvwxgCvF
+
+.L_last_num_blocks_is_3_1_gFFyhgntvwxgCvF:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_gFFyhgntvwxgCvF
+ je .L_last_num_blocks_is_2_gFFyhgntvwxgCvF
+.L_last_num_blocks_is_1_gFFyhgntvwxgCvF:
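+/* One (possibly partial) block remains: a byte mask for the remaining */
+/* length drives the masked load/store, a single counter block is run */
+/* through the AES rounds, and the byte-reflected result is folded into */
+/* the GHASH accumulator. */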
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_edqyFiqozsDenuz
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_edqyFiqozsDenuz
+
+.L_16_blocks_overflow_edqyFiqozsDenuz:
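+/* The pending block numbers would carry out of the counter's low byte, so */
+/* the counter is byte-reflected, incremented as a little-endian integer, */
+/* and reflected back before use. */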
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_edqyFiqozsDenuz:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %xmm31,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %xmm29,%xmm0,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_hxBDgFwdGwbthCy
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_hxBDgFwdGwbthCy
+.L_small_initial_partial_block_hxBDgFwdGwbthCy:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_hxBDgFwdGwbthCy
+.L_small_initial_compute_done_hxBDgFwdGwbthCy:
+.L_after_reduction_hxBDgFwdGwbthCy:
+ jmp .L_last_blocks_done_gFFyhgntvwxgCvF
+.L_last_num_blocks_is_2_gFFyhgntvwxgCvF:
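+/* The handlers for 2 through 16 remaining blocks repeat the single-block */
+/* pattern, widening the counter registers from xmm to ymm to zmm and */
+/* adding further 64-byte lanes as the block count grows. */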
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_uyuBmtkqzsrxAjG
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_uyuBmtkqzsrxAjG
+
+.L_16_blocks_overflow_uyuBmtkqzsrxAjG:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_uyuBmtkqzsrxAjG:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %ymm31,%ymm0,%ymm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %ymm29,%ymm0,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_DnwnjmmqBtjmtxy
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_DnwnjmmqBtjmtxy
+.L_small_initial_partial_block_DnwnjmmqBtjmtxy:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_DnwnjmmqBtjmtxy:
+
+ orq %r8,%r8
+ je .L_after_reduction_DnwnjmmqBtjmtxy
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_DnwnjmmqBtjmtxy:
+ jmp .L_last_blocks_done_gFFyhgntvwxgCvF
+.L_last_num_blocks_is_3_gFFyhgntvwxgCvF:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_mayxFbwAyisdwiE
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_mayxFbwAyisdwiE
+
+.L_16_blocks_overflow_mayxFbwAyisdwiE:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_mayxFbwAyisdwiE:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_sFnrdciEorxGldB
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_sFnrdciEorxGldB
+.L_small_initial_partial_block_sFnrdciEorxGldB:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_sFnrdciEorxGldB:
+
+ orq %r8,%r8
+ je .L_after_reduction_sFnrdciEorxGldB
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_sFnrdciEorxGldB:
+ jmp .L_last_blocks_done_gFFyhgntvwxgCvF
+.L_last_num_blocks_is_4_gFFyhgntvwxgCvF:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_cahBhluzDpDniBC
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_cahBhluzDpDniBC
+
+.L_16_blocks_overflow_cahBhluzDpDniBC:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_cahBhluzDpDniBC:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_flBuFDkGEouCjry
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_flBuFDkGEouCjry
+.L_small_initial_partial_block_flBuFDkGEouCjry:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_flBuFDkGEouCjry:
+
+ orq %r8,%r8
+ je .L_after_reduction_flBuFDkGEouCjry
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_flBuFDkGEouCjry:
+ jmp .L_last_blocks_done_gFFyhgntvwxgCvF
+.L_last_num_blocks_is_5_gFFyhgntvwxgCvF:
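+/* From five blocks on, the first 64-byte lane is stored unmasked and a */
+/* second counter register carries the remaining blocks; the store mask is */
+/* indexed by the length minus the 64 unmasked bytes. */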
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_dogBbFBCkktqmfE
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_dogBbFBCkktqmfE
+
+.L_16_blocks_overflow_dogBbFBCkktqmfE:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_dogBbFBCkktqmfE:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %xmm29,%xmm3,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_BcpothbedDEfeoC
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_BcpothbedDEfeoC
+.L_small_initial_partial_block_BcpothbedDEfeoC:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_BcpothbedDEfeoC:
+
+ orq %r8,%r8
+ je .L_after_reduction_BcpothbedDEfeoC
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_BcpothbedDEfeoC:
+ jmp .L_last_blocks_done_gFFyhgntvwxgCvF
+.L_last_num_blocks_is_6_gFFyhgntvwxgCvF:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_oGartozfntEBpal
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_oGartozfntEBpal
+
+.L_16_blocks_overflow_oGartozfntEBpal:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_oGartozfntEBpal:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %ymm29,%ymm3,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_rwznrbbsqxwaCko
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_rwznrbbsqxwaCko
+.L_small_initial_partial_block_rwznrbbsqxwaCko:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_rwznrbbsqxwaCko:
+
+ orq %r8,%r8
+ je .L_after_reduction_rwznrbbsqxwaCko
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_rwznrbbsqxwaCko:
+ jmp .L_last_blocks_done_gFFyhgntvwxgCvF
+.L_last_num_blocks_is_7_gFFyhgntvwxgCvF:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_EBiardhujGzcrlk
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_EBiardhujGzcrlk
+
+.L_16_blocks_overflow_EBiardhujGzcrlk:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_EBiardhujGzcrlk:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_tnvletidFAfbEDF
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_tnvletidFAfbEDF
+.L_small_initial_partial_block_tnvletidFAfbEDF:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_tnvletidFAfbEDF:
+
+ orq %r8,%r8
+ je .L_after_reduction_tnvletidFAfbEDF
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_tnvletidFAfbEDF:
+ jmp .L_last_blocks_done_gFFyhgntvwxgCvF
+.L_last_num_blocks_is_8_gFFyhgntvwxgCvF:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_iumqnFogzhcrGGw
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_iumqnFogzhcrGGw
+
+.L_16_blocks_overflow_iumqnFogzhcrGGw:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_iumqnFogzhcrGGw:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_qEzaCAhsCAiFoFG
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_qEzaCAhsCAiFoFG
+.L_small_initial_partial_block_qEzaCAhsCAiFoFG:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_qEzaCAhsCAiFoFG:
+
+ orq %r8,%r8
+ je .L_after_reduction_qEzaCAhsCAiFoFG
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_qEzaCAhsCAiFoFG:
+ jmp .L_last_blocks_done_gFFyhgntvwxgCvF
+.L_last_num_blocks_is_9_gFFyhgntvwxgCvF:
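+/* Nine or more blocks: two 64-byte lanes are stored unmasked and a third */
+/* counter register covers the rest, with the mask index taken from the */
+/* length minus 128. */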
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_uerldGeDtdqniAd
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_uerldGeDtdqniAd
+
+.L_16_blocks_overflow_uerldGeDtdqniAd:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_uerldGeDtdqniAd:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %xmm29,%xmm4,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_aaFGCaaBiGmkrxE
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_aaFGCaaBiGmkrxE
+.L_small_initial_partial_block_aaFGCaaBiGmkrxE:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_aaFGCaaBiGmkrxE:
+
+ orq %r8,%r8
+ je .L_after_reduction_aaFGCaaBiGmkrxE
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_aaFGCaaBiGmkrxE:
+ jmp .L_last_blocks_done_gFFyhgntvwxgCvF
+.L_last_num_blocks_is_10_gFFyhgntvwxgCvF:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_Aozpqcpomafvkzu
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_Aozpqcpomafvkzu
+
+.L_16_blocks_overflow_Aozpqcpomafvkzu:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_Aozpqcpomafvkzu:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %ymm29,%ymm4,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_dahhcFmAhdipFgB
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_dahhcFmAhdipFgB
+.L_small_initial_partial_block_dahhcFmAhdipFgB:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_dahhcFmAhdipFgB:
+
+ orq %r8,%r8
+ je .L_after_reduction_dahhcFmAhdipFgB
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_dahhcFmAhdipFgB:
+ jmp .L_last_blocks_done_gFFyhgntvwxgCvF
+.L_last_num_blocks_is_11_gFFyhgntvwxgCvF:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_EgocqAvvFflyEjg
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_EgocqAvvFflyEjg
+
+.L_16_blocks_overflow_EgocqAvvFflyEjg:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_EgocqAvvFflyEjg:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_BgCerdsyeobnbbs
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_BgCerdsyeobnbbs
+.L_small_initial_partial_block_BgCerdsyeobnbbs:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_BgCerdsyeobnbbs:
+
+ orq %r8,%r8
+ je .L_after_reduction_BgCerdsyeobnbbs
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_BgCerdsyeobnbbs:
+ jmp .L_last_blocks_done_gFFyhgntvwxgCvF
+.L_last_num_blocks_is_12_gFFyhgntvwxgCvF:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_cydmoiBEzigfGjF
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_cydmoiBEzigfGjF
+
+.L_16_blocks_overflow_cydmoiBEzigfGjF:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_cydmoiBEzigfGjF:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_cDdypaAhkmGvFrB
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_cDdypaAhkmGvFrB
+.L_small_initial_partial_block_cDdypaAhkmGvFrB:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_cDdypaAhkmGvFrB:
+
+ orq %r8,%r8
+ je .L_after_reduction_cDdypaAhkmGvFrB
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_cDdypaAhkmGvFrB:
+ jmp .L_last_blocks_done_gFFyhgntvwxgCvF
+.L_last_num_blocks_is_13_gFFyhgntvwxgCvF:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_cGnAhGixtCoyetC
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_cGnAhGixtCoyetC
+
+.L_16_blocks_overflow_cGnAhGixtCoyetC:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_cGnAhGixtCoyetC:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %xmm29,%xmm5,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_FeGcnwBvApiyeqj
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_FeGcnwBvApiyeqj
+.L_small_initial_partial_block_FeGcnwBvApiyeqj:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_FeGcnwBvApiyeqj:
+
+ orq %r8,%r8
+ je .L_after_reduction_FeGcnwBvApiyeqj
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_FeGcnwBvApiyeqj:
+ jmp .L_last_blocks_done_gFFyhgntvwxgCvF
+.L_last_num_blocks_is_14_gFFyhgntvwxgCvF:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_iftBfEFqGGBvyjm
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_iftBfEFqGGBvyjm
+
+.L_16_blocks_overflow_iftBfEFqGGBvyjm:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_iftBfEFqGGBvyjm:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %ymm29,%ymm5,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_oihhuqgdwBFgleb
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_oihhuqgdwBFgleb
+.L_small_initial_partial_block_oihhuqgdwBFgleb:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_oihhuqgdwBFgleb:
+
+ orq %r8,%r8
+ je .L_after_reduction_oihhuqgdwBFgleb
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_oihhuqgdwBFgleb:
+ jmp .L_last_blocks_done_gFFyhgntvwxgCvF
+.L_last_num_blocks_is_15_gFFyhgntvwxgCvF:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_fvupeAvimjnmGoe
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_fvupeAvimjnmGoe
+
+.L_16_blocks_overflow_fvupeAvimjnmGoe:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_fvupeAvimjnmGoe:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_rrptnxnCqernCsp
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_rrptnxnCqernCsp
+.L_small_initial_partial_block_rrptnxnCqernCsp:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_rrptnxnCqernCsp:
+
+ orq %r8,%r8
+ je .L_after_reduction_rrptnxnCqernCsp
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_rrptnxnCqernCsp:
+ jmp .L_last_blocks_done_gFFyhgntvwxgCvF
+.L_last_num_blocks_is_16_gFFyhgntvwxgCvF:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_wGkryszirehgiqf
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_wGkryszirehgiqf
+
+.L_16_blocks_overflow_wGkryszirehgiqf:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_wGkryszirehgiqf:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_ylCxcFDbnxrlyjy:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ylCxcFDbnxrlyjy:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ylCxcFDbnxrlyjy:
+ jmp .L_last_blocks_done_gFFyhgntvwxgCvF
+.L_last_num_blocks_is_0_gFFyhgntvwxgCvF:
+ vmovdqa64 1280(%rsp),%zmm13
+ vmovdqu64 512(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1344(%rsp),%zmm13
+ vmovdqu64 576(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1408(%rsp),%zmm13
+ vmovdqu64 640(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1472(%rsp),%zmm13
+ vmovdqu64 704(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_gFFyhgntvwxgCvF:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_eawnuBpGmxcBoDC
+
+.L_message_below_32_blocks_eawnuBpGmxcBoDC:
+
+
+ subq $256,%r8
+ addq $256,%r11
+ movl %r8d,%r10d
+ testq %r14,%r14
+ jnz .L_skip_hkeys_precomputation_cyGhsoclCDuqust
+ vmovdqu64 640(%rsp),%zmm3
+
+
+ vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
+
+ vmovdqu64 576(%rsp),%zmm4
+ vmovdqu64 512(%rsp),%zmm5
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,448(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,384(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,320(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,256(%rsp)
+.L_skip_hkeys_precomputation_cyGhsoclCDuqust:
+ movq $1,%r14
+ andl $~15,%r10d
+ movl $512,%ebx
+ subl %r10d,%ebx
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_gmjFjaoGnEhAquD
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_gmjFjaoGnEhAquD
+ jb .L_last_num_blocks_is_7_1_gmjFjaoGnEhAquD
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_gmjFjaoGnEhAquD
+ jb .L_last_num_blocks_is_11_9_gmjFjaoGnEhAquD
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_gmjFjaoGnEhAquD
+ ja .L_last_num_blocks_is_16_gmjFjaoGnEhAquD
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_gmjFjaoGnEhAquD
+ jmp .L_last_num_blocks_is_13_gmjFjaoGnEhAquD
+
+.L_last_num_blocks_is_11_9_gmjFjaoGnEhAquD:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_gmjFjaoGnEhAquD
+ ja .L_last_num_blocks_is_11_gmjFjaoGnEhAquD
+ jmp .L_last_num_blocks_is_9_gmjFjaoGnEhAquD
+
+.L_last_num_blocks_is_7_1_gmjFjaoGnEhAquD:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_gmjFjaoGnEhAquD
+ jb .L_last_num_blocks_is_3_1_gmjFjaoGnEhAquD
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_gmjFjaoGnEhAquD
+ je .L_last_num_blocks_is_6_gmjFjaoGnEhAquD
+ jmp .L_last_num_blocks_is_5_gmjFjaoGnEhAquD
+
+.L_last_num_blocks_is_3_1_gmjFjaoGnEhAquD:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_gmjFjaoGnEhAquD
+ je .L_last_num_blocks_is_2_gmjFjaoGnEhAquD
+.L_last_num_blocks_is_1_gmjFjaoGnEhAquD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_lmprlxqohayAaff
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_lmprlxqohayAaff
+
+.L_16_blocks_overflow_lmprlxqohayAaff:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_lmprlxqohayAaff:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %xmm29,%xmm0,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ycnbantiDaoGCva
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ycnbantiDaoGCva
+.L_small_initial_partial_block_ycnbantiDaoGCva:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm0
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
+ vpslldq $8,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm3
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
+ vpsrldq $4,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm4,%xmm14
+
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_ycnbantiDaoGCva
+.L_small_initial_compute_done_ycnbantiDaoGCva:
+.L_after_reduction_ycnbantiDaoGCva:
+ jmp .L_last_blocks_done_gmjFjaoGnEhAquD
+.L_last_num_blocks_is_2_gmjFjaoGnEhAquD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_FmnmcFgtBcispji
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_FmnmcFgtBcispji
+
+.L_16_blocks_overflow_FmnmcFgtBcispji:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_FmnmcFgtBcispji:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %ymm29,%ymm0,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_AtjvciobwAfsBgo
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_AtjvciobwAfsBgo
+.L_small_initial_partial_block_AtjvciobwAfsBgo:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_AtjvciobwAfsBgo:
+
+ orq %r8,%r8
+ je .L_after_reduction_AtjvciobwAfsBgo
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_AtjvciobwAfsBgo:
+ jmp .L_last_blocks_done_gmjFjaoGnEhAquD
+.L_last_num_blocks_is_3_gmjFjaoGnEhAquD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_tgAkxvFFocitubl
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_tgAkxvFFocitubl
+
+.L_16_blocks_overflow_tgAkxvFFocitubl:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_tgAkxvFFocitubl:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_siwDojaimuxlcux
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_siwDojaimuxlcux
+.L_small_initial_partial_block_siwDojaimuxlcux:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_siwDojaimuxlcux:
+
+ orq %r8,%r8
+ je .L_after_reduction_siwDojaimuxlcux
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_siwDojaimuxlcux:
+ jmp .L_last_blocks_done_gmjFjaoGnEhAquD
+.L_last_num_blocks_is_4_gmjFjaoGnEhAquD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_AaBBmAybFatffyg
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_AaBBmAybFatffyg
+
+.L_16_blocks_overflow_AaBBmAybFatffyg:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_AaBBmAybFatffyg:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_xhaBeCiyfAeqaBf
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_xhaBeCiyfAeqaBf
+.L_small_initial_partial_block_xhaBeCiyfAeqaBf:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_xhaBeCiyfAeqaBf:
+
+ orq %r8,%r8
+ je .L_after_reduction_xhaBeCiyfAeqaBf
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_xhaBeCiyfAeqaBf:
+ jmp .L_last_blocks_done_gmjFjaoGnEhAquD
+.L_last_num_blocks_is_5_gmjFjaoGnEhAquD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_akmmkrkgrAtqDyf
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_akmmkrkgrAtqDyf
+
+.L_16_blocks_overflow_akmmkrkgrAtqDyf:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_akmmkrkgrAtqDyf:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %xmm29,%xmm3,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_xqhfeyAhltlBsyF
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_xqhfeyAhltlBsyF
+.L_small_initial_partial_block_xqhfeyAhltlBsyF:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_xqhfeyAhltlBsyF:
+
+ orq %r8,%r8
+ je .L_after_reduction_xqhfeyAhltlBsyF
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_xqhfeyAhltlBsyF:
+ jmp .L_last_blocks_done_gmjFjaoGnEhAquD
+.L_last_num_blocks_is_6_gmjFjaoGnEhAquD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_vuckCplCqacsnkw
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_vuckCplCqacsnkw
+
+.L_16_blocks_overflow_vuckCplCqacsnkw:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_vuckCplCqacsnkw:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %ymm29,%ymm3,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ruAuuqlioaFhuzd
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ruAuuqlioaFhuzd
+.L_small_initial_partial_block_ruAuuqlioaFhuzd:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ruAuuqlioaFhuzd:
+
+ orq %r8,%r8
+ je .L_after_reduction_ruAuuqlioaFhuzd
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ruAuuqlioaFhuzd:
+ jmp .L_last_blocks_done_gmjFjaoGnEhAquD
+.L_last_num_blocks_is_7_gmjFjaoGnEhAquD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_vxwemaBiapgApmr
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_vxwemaBiapgApmr
+
+.L_16_blocks_overflow_vxwemaBiapgApmr:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_vxwemaBiapgApmr:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_wdpAcmnbkmzzufl
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_wdpAcmnbkmzzufl
+.L_small_initial_partial_block_wdpAcmnbkmzzufl:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_wdpAcmnbkmzzufl:
+
+ orq %r8,%r8
+ je .L_after_reduction_wdpAcmnbkmzzufl
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_wdpAcmnbkmzzufl:
+ jmp .L_last_blocks_done_gmjFjaoGnEhAquD
+.L_last_num_blocks_is_8_gmjFjaoGnEhAquD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_kuexuhgEceqggje
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_kuexuhgEceqggje
+
+.L_16_blocks_overflow_kuexuhgEceqggje:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_kuexuhgEceqggje:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_tvzmBcComjdtAzn
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_tvzmBcComjdtAzn
+.L_small_initial_partial_block_tvzmBcComjdtAzn:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_tvzmBcComjdtAzn:
+
+ orq %r8,%r8
+ je .L_after_reduction_tvzmBcComjdtAzn
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_tvzmBcComjdtAzn:
+ jmp .L_last_blocks_done_gmjFjaoGnEhAquD
+.L_last_num_blocks_is_9_gmjFjaoGnEhAquD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_npAFwfijqmcuehu
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_npAFwfijqmcuehu
+
+.L_16_blocks_overflow_npAFwfijqmcuehu:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_npAFwfijqmcuehu:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %xmm29,%xmm4,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_gxddwsBBhjrmGda
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_gxddwsBBhjrmGda
+.L_small_initial_partial_block_gxddwsBBhjrmGda:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_gxddwsBBhjrmGda:
+
+ orq %r8,%r8
+ je .L_after_reduction_gxddwsBBhjrmGda
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_gxddwsBBhjrmGda:
+ jmp .L_last_blocks_done_gmjFjaoGnEhAquD
+.L_last_num_blocks_is_10_gmjFjaoGnEhAquD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_hvAwbmhkGhGravm
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_hvAwbmhkGhGravm
+
+.L_16_blocks_overflow_hvAwbmhkGhGravm:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_hvAwbmhkGhGravm:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %ymm29,%ymm4,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_bjwDcmjtGlgmwEb
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_bjwDcmjtGlgmwEb
+.L_small_initial_partial_block_bjwDcmjtGlgmwEb:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_bjwDcmjtGlgmwEb:
+
+ orq %r8,%r8
+ je .L_after_reduction_bjwDcmjtGlgmwEb
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_bjwDcmjtGlgmwEb:
+ jmp .L_last_blocks_done_gmjFjaoGnEhAquD
+.L_last_num_blocks_is_11_gmjFjaoGnEhAquD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_BhqdCBAEnwmDwhl
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_BhqdCBAEnwmDwhl
+
+.L_16_blocks_overflow_BhqdCBAEnwmDwhl:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_BhqdCBAEnwmDwhl:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ipuaxhAChCElalm
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ipuaxhAChCElalm
+.L_small_initial_partial_block_ipuaxhAChCElalm:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ipuaxhAChCElalm:
+
+ orq %r8,%r8
+ je .L_after_reduction_ipuaxhAChCElalm
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ipuaxhAChCElalm:
+ jmp .L_last_blocks_done_gmjFjaoGnEhAquD
+.L_last_num_blocks_is_12_gmjFjaoGnEhAquD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_ckykbBijvpyDxDm
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_ckykbBijvpyDxDm
+
+.L_16_blocks_overflow_ckykbBijvpyDxDm:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_ckykbBijvpyDxDm:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_mkzFsudzBDhjcvh
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_mkzFsudzBDhjcvh
+.L_small_initial_partial_block_mkzFsudzBDhjcvh:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_mkzFsudzBDhjcvh:
+
+ orq %r8,%r8
+ je .L_after_reduction_mkzFsudzBDhjcvh
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_mkzFsudzBDhjcvh:
+ jmp .L_last_blocks_done_gmjFjaoGnEhAquD
+.L_last_num_blocks_is_13_gmjFjaoGnEhAquD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_DjGBFpAkClvxnAD
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_DjGBFpAkClvxnAD
+
+.L_16_blocks_overflow_DjGBFpAkClvxnAD:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_DjGBFpAkClvxnAD:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %xmm29,%xmm5,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_lygCkeDknmvaExs
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_lygCkeDknmvaExs
+.L_small_initial_partial_block_lygCkeDknmvaExs:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_lygCkeDknmvaExs:
+
+ orq %r8,%r8
+ je .L_after_reduction_lygCkeDknmvaExs
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_lygCkeDknmvaExs:
+ jmp .L_last_blocks_done_gmjFjaoGnEhAquD
+.L_last_num_blocks_is_14_gmjFjaoGnEhAquD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_mxbEwfimcnwvdax
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_mxbEwfimcnwvdax
+
+.L_16_blocks_overflow_mxbEwfimcnwvdax:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_mxbEwfimcnwvdax:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %ymm29,%ymm5,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_bdGmCjdgnqqlltq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_bdGmCjdgnqqlltq
+.L_small_initial_partial_block_bdGmCjdgnqqlltq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_bdGmCjdgnqqlltq:
+
+ orq %r8,%r8
+ je .L_after_reduction_bdGmCjdgnqqlltq
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_bdGmCjdgnqqlltq:
+ jmp .L_last_blocks_done_gmjFjaoGnEhAquD
+.L_last_num_blocks_is_15_gmjFjaoGnEhAquD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_zgjqhDpFicvrFBk
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_zgjqhDpFicvrFBk
+
+.L_16_blocks_overflow_zgjqhDpFicvrFBk:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_zgjqhDpFicvrFBk:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_DiAChhgwveonFpA
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
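+/*
+ * Reduce the folded 256-bit GHASH product (%xmm0 high, %xmm3 low) modulo the
+ * field polynomial using the POLY2 constant; the result lands in %xmm14.
+ */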
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_DiAChhgwveonFpA
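+/*
+ * Partial final block: the residual byte count is saved at (%rdx) and the
+ * last ciphertext block at 16(%rsi), apparently so its GHASH can be finished
+ * once the remaining bytes are known; the hash-key powers used below start
+ * one slot higher (128(%rsi)) than in the full-block path (112(%rsi)).
+ */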
+.L_small_initial_partial_block_DiAChhgwveonFpA:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_DiAChhgwveonFpA:
+
+ orq %r8,%r8
+ je .L_after_reduction_DiAChhgwveonFpA
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_DiAChhgwveonFpA:
+ jmp .L_last_blocks_done_gmjFjaoGnEhAquD
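+/*
+ * Tail of exactly 16 blocks: build four ZMM counter vectors (the
+ * .L_16_blocks_overflow_* variant copes with a wrapping low counter byte),
+ * run them through all 15 round keys at 0..224(%rdi) while folding 16
+ * buffered GHASH blocks from the stack frame into the hash, and mask the
+ * last 64-byte load/store with %k1.
+ */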
+.L_last_num_blocks_is_16_gmjFjaoGnEhAquD:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_yyltxtltrzdqBtp
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_yyltxtltrzdqBtp
+
+.L_16_blocks_overflow_yyltxtltrzdqBtp:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_yyltxtltrzdqBtp:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm17
+ vpshufb %zmm29,%zmm3,%zmm19
+ vpshufb %zmm29,%zmm4,%zmm20
+ vpshufb %zmm29,%zmm5,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_GsrEfbqkvAdwclh:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_GsrEfbqkvAdwclh:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_GsrEfbqkvAdwclh:
+ jmp .L_last_blocks_done_gmjFjaoGnEhAquD
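+/*
+ * No whole blocks remain to encrypt: GHASH the 16 blocks buffered at
+ * 768..960(%rsp) against what appear to be hash-key powers selected by %rbx,
+ * then reduce into %xmm14.
+ */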
+.L_last_num_blocks_is_0_gmjFjaoGnEhAquD:
+ vmovdqa64 768(%rsp),%zmm13
+ vpxorq %zmm14,%zmm13,%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 832(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpxorq %zmm10,%zmm4,%zmm26
+ vpxorq %zmm6,%zmm0,%zmm24
+ vpxorq %zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 896(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 960(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_gmjFjaoGnEhAquD:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_eawnuBpGmxcBoDC
+
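+/*
+ * The whole message is at most 16 blocks (<= 256 bytes): %r12 = ceil(len/16),
+ * then dispatch through a small comparison tree to one of sixteen
+ * specialized encrypt+GHASH paths.
+ */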
+.L_message_below_equal_16_blocks_eawnuBpGmxcBoDC:
+
+
+ movl %r8d,%r12d
+ addl $15,%r12d
+ shrl $4,%r12d
+ cmpq $8,%r12
+ je .L_small_initial_num_blocks_is_8_hbqugjruGfgczBp
+ jl .L_small_initial_num_blocks_is_7_1_hbqugjruGfgczBp
+
+
+ cmpq $12,%r12
+ je .L_small_initial_num_blocks_is_12_hbqugjruGfgczBp
+ jl .L_small_initial_num_blocks_is_11_9_hbqugjruGfgczBp
+
+
+ cmpq $16,%r12
+ je .L_small_initial_num_blocks_is_16_hbqugjruGfgczBp
+ cmpq $15,%r12
+ je .L_small_initial_num_blocks_is_15_hbqugjruGfgczBp
+ cmpq $14,%r12
+ je .L_small_initial_num_blocks_is_14_hbqugjruGfgczBp
+ jmp .L_small_initial_num_blocks_is_13_hbqugjruGfgczBp
+
+.L_small_initial_num_blocks_is_11_9_hbqugjruGfgczBp:
+
+ cmpq $11,%r12
+ je .L_small_initial_num_blocks_is_11_hbqugjruGfgczBp
+ cmpq $10,%r12
+ je .L_small_initial_num_blocks_is_10_hbqugjruGfgczBp
+ jmp .L_small_initial_num_blocks_is_9_hbqugjruGfgczBp
+
+.L_small_initial_num_blocks_is_7_1_hbqugjruGfgczBp:
+ cmpq $4,%r12
+ je .L_small_initial_num_blocks_is_4_hbqugjruGfgczBp
+ jl .L_small_initial_num_blocks_is_3_1_hbqugjruGfgczBp
+
+ cmpq $7,%r12
+ je .L_small_initial_num_blocks_is_7_hbqugjruGfgczBp
+ cmpq $6,%r12
+ je .L_small_initial_num_blocks_is_6_hbqugjruGfgczBp
+ jmp .L_small_initial_num_blocks_is_5_hbqugjruGfgczBp
+
+.L_small_initial_num_blocks_is_3_1_hbqugjruGfgczBp:
+
+ cmpq $3,%r12
+ je .L_small_initial_num_blocks_is_3_hbqugjruGfgczBp
+ cmpq $2,%r12
+ je .L_small_initial_num_blocks_is_2_hbqugjruGfgczBp
+
+
+
+
+
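+/*
+ * Single block (<= 16 bytes): encrypt one counter block with the 15 round
+ * keys at 0..224(%rdi), masking the load and store with %k1 from
+ * byte64_len_to_mask_table; a partial block is saved via (%rdx)/16(%rsi) and
+ * only XORed into the GHASH accumulator.
+ */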
+.L_small_initial_num_blocks_is_1_hbqugjruGfgczBp:
+ vmovdqa64 SHUF_MASK(%rip),%xmm29
+ vpaddd ONE(%rip),%xmm2,%xmm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vpshufb %xmm29,%xmm0,%xmm0
+ vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %xmm15,%xmm0,%xmm0
+ vpxorq %xmm6,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %xmm29,%xmm0,%xmm6
+ vextracti32x4 $0,%zmm6,%xmm13
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_iFmDdgrbxxlznyd
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_iFmDdgrbxxlznyd
+.L_small_initial_partial_block_iFmDdgrbxxlznyd:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm13,%xmm14,%xmm14
+
+ jmp .L_after_reduction_iFmDdgrbxxlznyd
+.L_small_initial_compute_done_iFmDdgrbxxlznyd:
+.L_after_reduction_iFmDdgrbxxlznyd:
+ jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp
+.L_small_initial_num_blocks_is_2_hbqugjruGfgczBp:
+ vmovdqa64 SHUF_MASK(%rip),%ymm29
+ vshufi64x2 $0,%ymm2,%ymm2,%ymm0
+ vpaddd ddq_add_1234(%rip),%ymm0,%ymm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vpshufb %ymm29,%ymm0,%ymm0
+ vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %ymm15,%ymm0,%ymm0
+ vpxorq %ymm6,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %ymm29,%ymm0,%ymm6
+ vextracti32x4 $1,%zmm6,%xmm13
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_EsCbfxikCrkamtE
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_EsCbfxikCrkamtE
+.L_small_initial_partial_block_EsCbfxikCrkamtE:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_EsCbfxikCrkamtE:
+
+ orq %r8,%r8
+ je .L_after_reduction_EsCbfxikCrkamtE
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_EsCbfxikCrkamtE:
+ jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp
+.L_small_initial_num_blocks_is_3_hbqugjruGfgczBp:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vpxorq %zmm6,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vextracti32x4 $2,%zmm6,%xmm13
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_tBEoFGBxxBysmml
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_tBEoFGBxxBysmml
+.L_small_initial_partial_block_tBEoFGBxxBysmml:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_tBEoFGBxxBysmml:
+
+ orq %r8,%r8
+ je .L_after_reduction_tBEoFGBxxBysmml
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_tBEoFGBxxBysmml:
+ jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp
+.L_small_initial_num_blocks_is_4_hbqugjruGfgczBp:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vpxorq %zmm6,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vextracti32x4 $3,%zmm6,%xmm13
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_dDrxftiGhnzzsCu
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_dDrxftiGhnzzsCu
+.L_small_initial_partial_block_dDrxftiGhnzzsCu:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_dDrxftiGhnzzsCu:
+
+ orq %r8,%r8
+ je .L_after_reduction_dDrxftiGhnzzsCu
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_dDrxftiGhnzzsCu:
+ jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp
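+/*
+ * Five blocks: a second counter vector comes into play; the first 64 bytes
+ * are handled unmasked and only the trailing vector uses the %k1 byte mask
+ * (hence the subq $64 before the mask-table lookup).
+ */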
+.L_small_initial_num_blocks_is_5_hbqugjruGfgczBp:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %xmm15,%xmm3,%xmm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %xmm7,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %xmm29,%xmm3,%xmm7
+ vextracti32x4 $0,%zmm7,%xmm13
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_tgluGdkfFDhsixe
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_tgluGdkfFDhsixe
+.L_small_initial_partial_block_tgluGdkfFDhsixe:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_tgluGdkfFDhsixe:
+
+ orq %r8,%r8
+ je .L_after_reduction_tgluGdkfFDhsixe
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_tgluGdkfFDhsixe:
+ jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp
+.L_small_initial_num_blocks_is_6_hbqugjruGfgczBp:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %ymm15,%ymm3,%ymm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %ymm7,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %ymm29,%ymm3,%ymm7
+ vextracti32x4 $1,%zmm7,%xmm13
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_cDptiniAjeCvsaA
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_cDptiniAjeCvsaA
+.L_small_initial_partial_block_cDptiniAjeCvsaA:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_cDptiniAjeCvsaA:
+
+ orq %r8,%r8
+ je .L_after_reduction_cDptiniAjeCvsaA
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_cDptiniAjeCvsaA:
+ jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp
+.L_small_initial_num_blocks_is_7_hbqugjruGfgczBp:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vextracti32x4 $2,%zmm7,%xmm13
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_CkuomECEjoqBFyr
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_CkuomECEjoqBFyr
+.L_small_initial_partial_block_CkuomECEjoqBFyr:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_CkuomECEjoqBFyr:
+
+ orq %r8,%r8
+ je .L_after_reduction_CkuomECEjoqBFyr
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_CkuomECEjoqBFyr:
+ jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp
+.L_small_initial_num_blocks_is_8_hbqugjruGfgczBp:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vextracti32x4 $3,%zmm7,%xmm13
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_jetFsEuskrjwged
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_jetFsEuskrjwged
+.L_small_initial_partial_block_jetFsEuskrjwged:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_jetFsEuskrjwged:
+
+ orq %r8,%r8
+ je .L_after_reduction_jetFsEuskrjwged
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_jetFsEuskrjwged:
+ jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp
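+/*
+ * Nine blocks add a third counter vector (ddq_add_8888 on top of the first);
+ * two full 64-byte vectors precede the masked one, so the mask-table index
+ * is biased by 128.
+ */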
+.L_small_initial_num_blocks_is_9_hbqugjruGfgczBp:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %xmm15,%xmm4,%xmm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %xmm10,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %xmm29,%xmm4,%xmm10
+ vextracti32x4 $0,%zmm10,%xmm13
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_djtvlDCcmtClCqd
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_djtvlDCcmtClCqd
+.L_small_initial_partial_block_djtvlDCcmtClCqd:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_djtvlDCcmtClCqd:
+
+ orq %r8,%r8
+ je .L_after_reduction_djtvlDCcmtClCqd
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_djtvlDCcmtClCqd:
+ jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp
+.L_small_initial_num_blocks_is_10_hbqugjruGfgczBp:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %ymm15,%ymm4,%ymm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %ymm10,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %ymm29,%ymm4,%ymm10
+ vextracti32x4 $1,%zmm10,%xmm13
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_aptugwefEgbpisD
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_aptugwefEgbpisD
+.L_small_initial_partial_block_aptugwefEgbpisD:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_aptugwefEgbpisD:
+
+ orq %r8,%r8
+ je .L_after_reduction_aptugwefEgbpisD
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_aptugwefEgbpisD:
+ jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp
+.L_small_initial_num_blocks_is_11_hbqugjruGfgczBp:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %zmm29,%zmm4,%zmm10
+ vextracti32x4 $2,%zmm10,%xmm13
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_BboqcvvuFoyragm
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_BboqcvvuFoyragm
+.L_small_initial_partial_block_BboqcvvuFoyragm:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_BboqcvvuFoyragm:
+
+ orq %r8,%r8
+ je .L_after_reduction_BboqcvvuFoyragm
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_BboqcvvuFoyragm:
+ jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp
+.L_small_initial_num_blocks_is_12_hbqugjruGfgczBp:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %zmm29,%zmm4,%zmm10
+ vextracti32x4 $3,%zmm10,%xmm13
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_yzpAqvxjrjtpbge
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 160(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_yzpAqvxjrjtpbge
+.L_small_initial_partial_block_yzpAqvxjrjtpbge:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_yzpAqvxjrjtpbge:
+
+ orq %r8,%r8
+ je .L_after_reduction_yzpAqvxjrjtpbge
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_yzpAqvxjrjtpbge:
+ jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp
+.L_small_initial_num_blocks_is_13_hbqugjruGfgczBp:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %xmm15,%xmm5,%xmm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %xmm11,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %zmm29,%zmm4,%zmm10
+ vpshufb %xmm29,%xmm5,%xmm11
+ vextracti32x4 $0,%zmm11,%xmm13
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_jjkyzlqDAbpoEdw
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 144(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_jjkyzlqDAbpoEdw
+.L_small_initial_partial_block_jjkyzlqDAbpoEdw:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 160(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_jjkyzlqDAbpoEdw:
+
+ orq %r8,%r8
+ je .L_after_reduction_jjkyzlqDAbpoEdw
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_jjkyzlqDAbpoEdw:
+ jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp
+.L_small_initial_num_blocks_is_14_hbqugjruGfgczBp:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %ymm15,%ymm5,%ymm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %ymm11,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %zmm29,%zmm4,%zmm10
+ vpshufb %ymm29,%ymm5,%ymm11
+ vextracti32x4 $1,%zmm11,%xmm13
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_GlbsvkxecbisEEg
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 128(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_GlbsvkxecbisEEg
+.L_small_initial_partial_block_GlbsvkxecbisEEg:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 144(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_GlbsvkxecbisEEg:
+
+ orq %r8,%r8
+ je .L_after_reduction_GlbsvkxecbisEEg
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_GlbsvkxecbisEEg:
+ jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp
+.L_small_initial_num_blocks_is_15_hbqugjruGfgczBp:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %zmm15,%zmm5,%zmm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %zmm11,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %zmm29,%zmm4,%zmm10
+ vpshufb %zmm29,%zmm5,%zmm11
+ vextracti32x4 $2,%zmm11,%xmm13
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_BFutaboihmcgqcA
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 112(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_BFutaboihmcgqcA
+.L_small_initial_partial_block_BFutaboihmcgqcA:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 128(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_BFutaboihmcgqcA:
+
+ orq %r8,%r8
+ je .L_after_reduction_BFutaboihmcgqcA
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_BFutaboihmcgqcA:
+ jmp .L_small_initial_blocks_encrypted_hbqugjruGfgczBp
+.L_small_initial_num_blocks_is_16_hbqugjruGfgczBp:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %zmm15,%zmm5,%zmm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %zmm11,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm0,%zmm6
+ vpshufb %zmm29,%zmm3,%zmm7
+ vpshufb %zmm29,%zmm4,%zmm10
+ vpshufb %zmm29,%zmm5,%zmm11
+ vextracti32x4 $3,%zmm11,%xmm13
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_AxxoDBglqjscnzw:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 112(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_AxxoDBglqjscnzw:
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_AxxoDBglqjscnzw:
+.L_small_initial_blocks_encrypted_hbqugjruGfgczBp:
+.L_ghash_done_eawnuBpGmxcBoDC:
+ vmovdqu64 %xmm2,0(%rsi)
+ vmovdqu64 %xmm14,64(%rsi)
+.L_enc_dec_done_eawnuBpGmxcBoDC:
+ jmp .Lexit_gcm_encrypt
+.Lexit_gcm_encrypt:
+ cmpq $256,%r8
+ jbe .Lskip_hkeys_cleanup_FwyhaGceDljchpo
+ vpxor %xmm0,%xmm0,%xmm0
+ vmovdqa64 %zmm0,0(%rsp)
+ vmovdqa64 %zmm0,64(%rsp)
+ vmovdqa64 %zmm0,128(%rsp)
+ vmovdqa64 %zmm0,192(%rsp)
+ vmovdqa64 %zmm0,256(%rsp)
+ vmovdqa64 %zmm0,320(%rsp)
+ vmovdqa64 %zmm0,384(%rsp)
+ vmovdqa64 %zmm0,448(%rsp)
+ vmovdqa64 %zmm0,512(%rsp)
+ vmovdqa64 %zmm0,576(%rsp)
+ vmovdqa64 %zmm0,640(%rsp)
+ vmovdqa64 %zmm0,704(%rsp)
+.Lskip_hkeys_cleanup_FwyhaGceDljchpo:
+ vzeroupper
+ leaq (%rbp),%rsp
+.cfi_def_cfa_register %rsp
+ popq %r15
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r15
+ popq %r14
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r14
+ popq %r13
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r13
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ popq %rbp
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbp
+ popq %rbx
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbx
+ .byte 0xf3,0xc3
+.Lencrypt_seh_end:
+.cfi_endproc
+.size ossl_aes_gcm_encrypt_avx512, .-ossl_aes_gcm_encrypt_avx512
+.globl ossl_aes_gcm_decrypt_avx512
+.type ossl_aes_gcm_decrypt_avx512,@function
+.align 32
+ossl_aes_gcm_decrypt_avx512:
+.cfi_startproc
+.Ldecrypt_seh_begin:
+.byte 243,15,30,250
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+.Ldecrypt_seh_push_rbx:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+.Ldecrypt_seh_push_rbp:
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+.Ldecrypt_seh_push_r12:
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+.Ldecrypt_seh_push_r13:
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+.Ldecrypt_seh_push_r14:
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Ldecrypt_seh_push_r15:
+
+
+
+
+
+
+
+
+
+
+ leaq 0(%rsp),%rbp
+.cfi_def_cfa_register %rbp
+.Ldecrypt_seh_setfp:
+
+.Ldecrypt_seh_prolog_end:
+ subq $1588,%rsp
+ andq $(-64),%rsp
+
+
+ movl 240(%rdi),%eax
+ cmpl $9,%eax
+ je .Laes_gcm_decrypt_128_avx512
+ cmpl $11,%eax
+ je .Laes_gcm_decrypt_192_avx512
+ cmpl $13,%eax
+ je .Laes_gcm_decrypt_256_avx512
+ xorl %eax,%eax
+ jmp .Lexit_gcm_decrypt
+.align 32
+.Laes_gcm_decrypt_128_avx512:
+ orq %r8,%r8
+ je .L_enc_dec_done_brADimEeCnCcDmv
+ xorq %r14,%r14
+ vmovdqu64 64(%rsi),%xmm14
+
+ movq (%rdx),%r11
+ orq %r11,%r11
+ je .L_partial_block_done_bsCeAyqpAAwsgvv
+ movl $16,%r10d
+ leaq byte_len_to_mask_table(%rip),%r12
+ cmpq %r10,%r8
+ cmovcq %r8,%r10
+ kmovw (%r12,%r10,2),%k1
+ vmovdqu8 (%rcx),%xmm0{%k1}{z}
+
+ vmovdqu64 16(%rsi),%xmm3
+ vmovdqu64 336(%rsi),%xmm4
+
+
+
+ leaq SHIFT_MASK(%rip),%r12
+ addq %r11,%r12
+ vmovdqu64 (%r12),%xmm5
+ vpshufb %xmm5,%xmm3,%xmm3
+
+ vmovdqa64 %xmm0,%xmm6
+ vpxorq %xmm0,%xmm3,%xmm3
+
+
+ leaq (%r8,%r11,1),%r13
+ subq $16,%r13
+ jge .L_no_extra_mask_bsCeAyqpAAwsgvv
+ subq %r13,%r12
+.L_no_extra_mask_bsCeAyqpAAwsgvv:
+
+
+
+ vmovdqu64 16(%r12),%xmm0
+ vpand %xmm0,%xmm3,%xmm3
+ vpand %xmm0,%xmm6,%xmm6
+ vpshufb SHUF_MASK(%rip),%xmm6,%xmm6
+ vpshufb %xmm5,%xmm6,%xmm6
+ vpxorq %xmm6,%xmm14,%xmm14
+ cmpq $0,%r13
+ jl .L_partial_incomplete_bsCeAyqpAAwsgvv
+
+ vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7
+ vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10
+ vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11
+ vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14
+ vpxorq %xmm11,%xmm14,%xmm14
+
+ vpsrldq $8,%xmm14,%xmm11
+ vpslldq $8,%xmm14,%xmm14
+ vpxorq %xmm11,%xmm7,%xmm7
+ vpxorq %xmm10,%xmm14,%xmm14
+
+
+
+ vmovdqu64 POLY2(%rip),%xmm11
+
+ vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10
+ vpslldq $8,%xmm10,%xmm10
+ vpxorq %xmm10,%xmm14,%xmm14
+
+
+
+ vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10
+ vpsrldq $4,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+
+ vpternlogq $0x96,%xmm10,%xmm7,%xmm14
+
+ movq $0,(%rdx)
+
+ movq %r11,%r12
+ movq $16,%r11
+ subq %r12,%r11
+ jmp .L_enc_dec_done_bsCeAyqpAAwsgvv
+
+.L_partial_incomplete_bsCeAyqpAAwsgvv:
+ addq %r8,(%rdx)
+ movq %r8,%r11
+
+.L_enc_dec_done_bsCeAyqpAAwsgvv:
+
+
+ leaq byte_len_to_mask_table(%rip),%r12
+ kmovw (%r12,%r11,2),%k1
+ vmovdqu64 %xmm14,64(%rsi)
+ movq %r9,%r12
+ vmovdqu8 %xmm3,(%r12){%k1}
+.L_partial_block_done_bsCeAyqpAAwsgvv:
+ vmovdqu64 0(%rsi),%xmm2
+ subq %r11,%r8
+ je .L_enc_dec_done_brADimEeCnCcDmv
+ cmpq $256,%r8
+ jbe .L_message_below_equal_16_blocks_brADimEeCnCcDmv
+
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vmovdqa64 ddq_addbe_4444(%rip),%zmm27
+ vmovdqa64 ddq_addbe_1234(%rip),%zmm28
+
+
+
+
+
+
+ vmovd %xmm2,%r15d
+ andl $255,%r15d
+
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpshufb %zmm29,%zmm2,%zmm2
+
+
+
+ cmpb $240,%r15b
+ jae .L_next_16_overflow_eghvmbEDtcnDnAu
+ vpaddd %zmm28,%zmm2,%zmm7
+ vpaddd %zmm27,%zmm7,%zmm10
+ vpaddd %zmm27,%zmm10,%zmm11
+ vpaddd %zmm27,%zmm11,%zmm12
+ jmp .L_next_16_ok_eghvmbEDtcnDnAu
+.L_next_16_overflow_eghvmbEDtcnDnAu:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm12
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
+ vpaddd %zmm12,%zmm7,%zmm10
+ vpaddd %zmm12,%zmm10,%zmm11
+ vpaddd %zmm12,%zmm11,%zmm12
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vpshufb %zmm29,%zmm12,%zmm12
+.L_next_16_ok_eghvmbEDtcnDnAu:
+ vshufi64x2 $255,%zmm12,%zmm12,%zmm2
+ addb $16,%r15b
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm0
+ vmovdqu8 64(%rcx,%r11,1),%zmm3
+ vmovdqu8 128(%rcx,%r11,1),%zmm4
+ vmovdqu8 192(%rcx,%r11,1),%zmm5
+
+
+ vbroadcastf64x2 0(%rdi),%zmm6
+ vpxorq %zmm6,%zmm7,%zmm7
+ vpxorq %zmm6,%zmm10,%zmm10
+ vpxorq %zmm6,%zmm11,%zmm11
+ vpxorq %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 16(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 32(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 48(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 64(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 80(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 96(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 112(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 128(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 144(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 160(%rdi),%zmm6
+ vaesenclast %zmm6,%zmm7,%zmm7
+ vaesenclast %zmm6,%zmm10,%zmm10
+ vaesenclast %zmm6,%zmm11,%zmm11
+ vaesenclast %zmm6,%zmm12,%zmm12
+
+
+ vpxorq %zmm0,%zmm7,%zmm7
+ vpxorq %zmm3,%zmm10,%zmm10
+ vpxorq %zmm4,%zmm11,%zmm11
+ vpxorq %zmm5,%zmm12,%zmm12
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm7,0(%r10,%r11,1)
+ vmovdqu8 %zmm10,64(%r10,%r11,1)
+ vmovdqu8 %zmm11,128(%r10,%r11,1)
+ vmovdqu8 %zmm12,192(%r10,%r11,1)
+
+ vpshufb %zmm29,%zmm0,%zmm7
+ vpshufb %zmm29,%zmm3,%zmm10
+ vpshufb %zmm29,%zmm4,%zmm11
+ vpshufb %zmm29,%zmm5,%zmm12
+ vmovdqa64 %zmm7,768(%rsp)
+ vmovdqa64 %zmm10,832(%rsp)
+ vmovdqa64 %zmm11,896(%rsp)
+ vmovdqa64 %zmm12,960(%rsp)
+ testq %r14,%r14
+ jnz .L_skip_hkeys_precomputation_plwezswvdFDdDBp
+
+ vmovdqu64 288(%rsi),%zmm0
+ vmovdqu64 %zmm0,704(%rsp)
+
+ vmovdqu64 224(%rsi),%zmm3
+ vmovdqu64 %zmm3,640(%rsp)
+
+
+ vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
+
+ vmovdqu64 160(%rsi),%zmm4
+ vmovdqu64 %zmm4,576(%rsp)
+
+ vmovdqu64 96(%rsi),%zmm5
+ vmovdqu64 %zmm5,512(%rsp)
+.L_skip_hkeys_precomputation_plwezswvdFDdDBp:
+ cmpq $512,%r8
+ jb .L_message_below_32_blocks_brADimEeCnCcDmv
+
+
+
+ cmpb $240,%r15b
+ jae .L_next_16_overflow_yieysttglezqCBf
+ vpaddd %zmm28,%zmm2,%zmm7
+ vpaddd %zmm27,%zmm7,%zmm10
+ vpaddd %zmm27,%zmm10,%zmm11
+ vpaddd %zmm27,%zmm11,%zmm12
+ jmp .L_next_16_ok_yieysttglezqCBf
+.L_next_16_overflow_yieysttglezqCBf:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm12
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
+ vpaddd %zmm12,%zmm7,%zmm10
+ vpaddd %zmm12,%zmm10,%zmm11
+ vpaddd %zmm12,%zmm11,%zmm12
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vpshufb %zmm29,%zmm12,%zmm12
+.L_next_16_ok_yieysttglezqCBf:
+ vshufi64x2 $255,%zmm12,%zmm12,%zmm2
+ addb $16,%r15b
+
+ vmovdqu8 256(%rcx,%r11,1),%zmm0
+ vmovdqu8 320(%rcx,%r11,1),%zmm3
+ vmovdqu8 384(%rcx,%r11,1),%zmm4
+ vmovdqu8 448(%rcx,%r11,1),%zmm5
+
+
+ vbroadcastf64x2 0(%rdi),%zmm6
+ vpxorq %zmm6,%zmm7,%zmm7
+ vpxorq %zmm6,%zmm10,%zmm10
+ vpxorq %zmm6,%zmm11,%zmm11
+ vpxorq %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 16(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 32(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 48(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 64(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 80(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 96(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 112(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 128(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 144(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 160(%rdi),%zmm6
+ vaesenclast %zmm6,%zmm7,%zmm7
+ vaesenclast %zmm6,%zmm10,%zmm10
+ vaesenclast %zmm6,%zmm11,%zmm11
+ vaesenclast %zmm6,%zmm12,%zmm12
+
+
+ vpxorq %zmm0,%zmm7,%zmm7
+ vpxorq %zmm3,%zmm10,%zmm10
+ vpxorq %zmm4,%zmm11,%zmm11
+ vpxorq %zmm5,%zmm12,%zmm12
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm7,256(%r10,%r11,1)
+ vmovdqu8 %zmm10,320(%r10,%r11,1)
+ vmovdqu8 %zmm11,384(%r10,%r11,1)
+ vmovdqu8 %zmm12,448(%r10,%r11,1)
+
+ vpshufb %zmm29,%zmm0,%zmm7
+ vpshufb %zmm29,%zmm3,%zmm10
+ vpshufb %zmm29,%zmm4,%zmm11
+ vpshufb %zmm29,%zmm5,%zmm12
+ vmovdqa64 %zmm7,1024(%rsp)
+ vmovdqa64 %zmm10,1088(%rsp)
+ vmovdqa64 %zmm11,1152(%rsp)
+ vmovdqa64 %zmm12,1216(%rsp)
+ testq %r14,%r14
+ jnz .L_skip_hkeys_precomputation_cqhgcscctsdbGkB
+ vmovdqu64 640(%rsp),%zmm3
+
+
+ vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
+
+ vmovdqu64 576(%rsp),%zmm4
+ vmovdqu64 512(%rsp),%zmm5
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,448(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,384(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,320(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,256(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,192(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,128(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,64(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,0(%rsp)
+.L_skip_hkeys_precomputation_cqhgcscctsdbGkB:
+ movq $1,%r14
+ addq $512,%r11
+ subq $512,%r8
+
+ cmpq $768,%r8
+ jb .L_no_more_big_nblocks_brADimEeCnCcDmv
+.L_encrypt_big_nblocks_brADimEeCnCcDmv:
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_jeuDwtvAfvGmCgt
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_jeuDwtvAfvGmCgt
+.L_16_blocks_overflow_jeuDwtvAfvGmCgt:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_jeuDwtvAfvGmCgt:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm15,%zmm10,%zmm26
+ vpxorq %zmm12,%zmm6,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1)
+ vpshufb %zmm29,%zmm17,%zmm0
+ vpshufb %zmm29,%zmm19,%zmm3
+ vpshufb %zmm29,%zmm20,%zmm4
+ vpshufb %zmm29,%zmm21,%zmm5
+ vmovdqa64 %zmm0,1280(%rsp)
+ vmovdqa64 %zmm3,1344(%rsp)
+ vmovdqa64 %zmm4,1408(%rsp)
+ vmovdqa64 %zmm5,1472(%rsp)
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_hGznvbxlbulnqGf
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_hGznvbxlbulnqGf
+.L_16_blocks_overflow_hGznvbxlbulnqGf:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_hGznvbxlbulnqGf:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 256(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 320(%rsp),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 384(%rsp),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 448(%rsp),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 256(%rcx,%r11,1),%zmm17
+ vmovdqu8 320(%rcx,%r11,1),%zmm19
+ vmovdqu8 384(%rcx,%r11,1),%zmm20
+ vmovdqu8 448(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vpternlogq $0x96,%zmm12,%zmm6,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,256(%r10,%r11,1)
+ vmovdqu8 %zmm3,320(%r10,%r11,1)
+ vmovdqu8 %zmm4,384(%r10,%r11,1)
+ vmovdqu8 %zmm5,448(%r10,%r11,1)
+ vpshufb %zmm29,%zmm17,%zmm0
+ vpshufb %zmm29,%zmm19,%zmm3
+ vpshufb %zmm29,%zmm20,%zmm4
+ vpshufb %zmm29,%zmm21,%zmm5
+ vmovdqa64 %zmm0,768(%rsp)
+ vmovdqa64 %zmm3,832(%rsp)
+ vmovdqa64 %zmm4,896(%rsp)
+ vmovdqa64 %zmm5,960(%rsp)
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_hikcfykasilniFs
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_hikcfykasilniFs
+.L_16_blocks_overflow_hikcfykasilniFs:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_hikcfykasilniFs:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 512(%rcx,%r11,1),%zmm17
+ vmovdqu8 576(%rcx,%r11,1),%zmm19
+ vmovdqu8 640(%rcx,%r11,1),%zmm20
+ vmovdqu8 704(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+
+
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpternlogq $0x96,%zmm15,%zmm12,%zmm6
+ vpxorq %zmm24,%zmm6,%zmm6
+ vpternlogq $0x96,%zmm10,%zmm13,%zmm7
+ vpxorq %zmm25,%zmm7,%zmm7
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vextracti64x4 $1,%zmm6,%ymm12
+ vpxorq %ymm12,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm12
+ vpxorq %xmm12,%xmm6,%xmm6
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm6
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,512(%r10,%r11,1)
+ vmovdqu8 %zmm3,576(%r10,%r11,1)
+ vmovdqu8 %zmm4,640(%r10,%r11,1)
+ vmovdqu8 %zmm5,704(%r10,%r11,1)
+ vpshufb %zmm29,%zmm17,%zmm0
+ vpshufb %zmm29,%zmm19,%zmm3
+ vpshufb %zmm29,%zmm20,%zmm4
+ vpshufb %zmm29,%zmm21,%zmm5
+ vmovdqa64 %zmm0,1024(%rsp)
+ vmovdqa64 %zmm3,1088(%rsp)
+ vmovdqa64 %zmm4,1152(%rsp)
+ vmovdqa64 %zmm5,1216(%rsp)
+ vmovdqa64 %zmm6,%zmm14
+
+ addq $768,%r11
+ subq $768,%r8
+ cmpq $768,%r8
+ jae .L_encrypt_big_nblocks_brADimEeCnCcDmv
+
+.L_no_more_big_nblocks_brADimEeCnCcDmv:
+
+ cmpq $512,%r8
+ jae .L_encrypt_32_blocks_brADimEeCnCcDmv
+
+ cmpq $256,%r8
+ jae .L_encrypt_16_blocks_brADimEeCnCcDmv
+.L_encrypt_0_blocks_ghash_32_brADimEeCnCcDmv:
+ movl %r8d,%r10d
+ andl $~15,%r10d
+ movl $256,%ebx
+ subl %r10d,%ebx
+ vmovdqa64 768(%rsp),%zmm13
+ vpxorq %zmm14,%zmm13,%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 832(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpxorq %zmm10,%zmm4,%zmm26
+ vpxorq %zmm6,%zmm0,%zmm24
+ vpxorq %zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 896(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 960(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ addl $256,%ebx
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_xyDAiCmaAhzpydl
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_xyDAiCmaAhzpydl
+ jb .L_last_num_blocks_is_7_1_xyDAiCmaAhzpydl
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_xyDAiCmaAhzpydl
+ jb .L_last_num_blocks_is_11_9_xyDAiCmaAhzpydl
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_xyDAiCmaAhzpydl
+ ja .L_last_num_blocks_is_16_xyDAiCmaAhzpydl
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_xyDAiCmaAhzpydl
+ jmp .L_last_num_blocks_is_13_xyDAiCmaAhzpydl
+
+.L_last_num_blocks_is_11_9_xyDAiCmaAhzpydl:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_xyDAiCmaAhzpydl
+ ja .L_last_num_blocks_is_11_xyDAiCmaAhzpydl
+ jmp .L_last_num_blocks_is_9_xyDAiCmaAhzpydl
+
+.L_last_num_blocks_is_7_1_xyDAiCmaAhzpydl:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_xyDAiCmaAhzpydl
+ jb .L_last_num_blocks_is_3_1_xyDAiCmaAhzpydl
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_xyDAiCmaAhzpydl
+ je .L_last_num_blocks_is_6_xyDAiCmaAhzpydl
+ jmp .L_last_num_blocks_is_5_xyDAiCmaAhzpydl
+
+.L_last_num_blocks_is_3_1_xyDAiCmaAhzpydl:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_xyDAiCmaAhzpydl
+ je .L_last_num_blocks_is_2_xyDAiCmaAhzpydl
+.L_last_num_blocks_is_1_xyDAiCmaAhzpydl:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_fyDzBrphsGjubgG
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_fyDzBrphsGjubgG
+
+.L_16_blocks_overflow_fyDzBrphsGjubgG:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_fyDzBrphsGjubgG:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %xmm29,%xmm17,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_vtxqFwAgrdnllzF
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_vtxqFwAgrdnllzF
+.L_small_initial_partial_block_vtxqFwAgrdnllzF:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm0
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
+ vpslldq $8,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm3
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
+ vpsrldq $4,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm4,%xmm14
+
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_vtxqFwAgrdnllzF
+.L_small_initial_compute_done_vtxqFwAgrdnllzF:
+.L_after_reduction_vtxqFwAgrdnllzF:
+ jmp .L_last_blocks_done_xyDAiCmaAhzpydl
+.L_last_num_blocks_is_2_xyDAiCmaAhzpydl:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_BugDrclgtxGysBC
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_BugDrclgtxGysBC
+
+.L_16_blocks_overflow_BugDrclgtxGysBC:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_BugDrclgtxGysBC:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %ymm29,%ymm17,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_dwpAvxknFwdDaDi
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_dwpAvxknFwdDaDi
+.L_small_initial_partial_block_dwpAvxknFwdDaDi:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_dwpAvxknFwdDaDi:
+
+ orq %r8,%r8
+ je .L_after_reduction_dwpAvxknFwdDaDi
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_dwpAvxknFwdDaDi:
+ jmp .L_last_blocks_done_xyDAiCmaAhzpydl
+.L_last_num_blocks_is_3_xyDAiCmaAhzpydl:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_xznshBaaivCChih
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_xznshBaaivCChih
+
+.L_16_blocks_overflow_xznshBaaivCChih:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_xznshBaaivCChih:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ltvboeEneeszwsu
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ltvboeEneeszwsu
+.L_small_initial_partial_block_ltvboeEneeszwsu:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ltvboeEneeszwsu:
+
+ orq %r8,%r8
+ je .L_after_reduction_ltvboeEneeszwsu
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ltvboeEneeszwsu:
+ jmp .L_last_blocks_done_xyDAiCmaAhzpydl
+.L_last_num_blocks_is_4_xyDAiCmaAhzpydl:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_ofErewxunpEhuze
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_ofErewxunpEhuze
+
+.L_16_blocks_overflow_ofErewxunpEhuze:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_ofErewxunpEhuze:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_mdwrrkghGswontC
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_mdwrrkghGswontC
+.L_small_initial_partial_block_mdwrrkghGswontC:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_mdwrrkghGswontC:
+
+ orq %r8,%r8
+ je .L_after_reduction_mdwrrkghGswontC
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_mdwrrkghGswontC:
+ jmp .L_last_blocks_done_xyDAiCmaAhzpydl
+.L_last_num_blocks_is_5_xyDAiCmaAhzpydl:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_vlFDjDvkCmipDjj
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_vlFDjDvkCmipDjj
+
+.L_16_blocks_overflow_vlFDjDvkCmipDjj:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_vlFDjDvkCmipDjj:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %xmm29,%xmm19,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_vyyfueCAnBpziso
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_vyyfueCAnBpziso
+.L_small_initial_partial_block_vyyfueCAnBpziso:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_vyyfueCAnBpziso:
+
+ orq %r8,%r8
+ je .L_after_reduction_vyyfueCAnBpziso
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_vyyfueCAnBpziso:
+ jmp .L_last_blocks_done_xyDAiCmaAhzpydl
+.L_last_num_blocks_is_6_xyDAiCmaAhzpydl:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_swonEtcpnChuzwe
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_swonEtcpnChuzwe
+
+.L_16_blocks_overflow_swonEtcpnChuzwe:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_swonEtcpnChuzwe:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %ymm29,%ymm19,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_aEryhnaxCjcvalc
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_aEryhnaxCjcvalc
+.L_small_initial_partial_block_aEryhnaxCjcvalc:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_aEryhnaxCjcvalc:
+
+ orq %r8,%r8
+ je .L_after_reduction_aEryhnaxCjcvalc
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_aEryhnaxCjcvalc:
+ jmp .L_last_blocks_done_xyDAiCmaAhzpydl
+.L_last_num_blocks_is_7_xyDAiCmaAhzpydl:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_EGhejzspzceoDrz
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_EGhejzspzceoDrz
+
+.L_16_blocks_overflow_EGhejzspzceoDrz:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_EGhejzspzceoDrz:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_lcrbhrsFEemAseF
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_lcrbhrsFEemAseF
+.L_small_initial_partial_block_lcrbhrsFEemAseF:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_lcrbhrsFEemAseF:
+
+ orq %r8,%r8
+ je .L_after_reduction_lcrbhrsFEemAseF
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_lcrbhrsFEemAseF:
+ jmp .L_last_blocks_done_xyDAiCmaAhzpydl
+.L_last_num_blocks_is_8_xyDAiCmaAhzpydl:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_bwyfeoBaojvbAgd
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_bwyfeoBaojvbAgd
+
+.L_16_blocks_overflow_bwyfeoBaojvbAgd:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_bwyfeoBaojvbAgd:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_osycqepyfDlatEs
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_osycqepyfDlatEs
+.L_small_initial_partial_block_osycqepyfDlatEs:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_osycqepyfDlatEs:
+
+ orq %r8,%r8
+ je .L_after_reduction_osycqepyfDlatEs
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_osycqepyfDlatEs:
+ jmp .L_last_blocks_done_xyDAiCmaAhzpydl
+.L_last_num_blocks_is_9_xyDAiCmaAhzpydl:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_BaoGkpEpCdeyrev
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_BaoGkpEpCdeyrev
+
+.L_16_blocks_overflow_BaoGkpEpCdeyrev:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_BaoGkpEpCdeyrev:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %xmm29,%xmm20,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ilsvshcinsdmttt
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ilsvshcinsdmttt
+.L_small_initial_partial_block_ilsvshcinsdmttt:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ilsvshcinsdmttt:
+
+ orq %r8,%r8
+ je .L_after_reduction_ilsvshcinsdmttt
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ilsvshcinsdmttt:
+ jmp .L_last_blocks_done_xyDAiCmaAhzpydl
+.L_last_num_blocks_is_10_xyDAiCmaAhzpydl:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_sAtxBaaxwaffire
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_sAtxBaaxwaffire
+
+.L_16_blocks_overflow_sAtxBaaxwaffire:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_sAtxBaaxwaffire:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %ymm29,%ymm20,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_mAgwqklangGkxiD
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_mAgwqklangGkxiD
+.L_small_initial_partial_block_mAgwqklangGkxiD:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_mAgwqklangGkxiD:
+
+ orq %r8,%r8
+ je .L_after_reduction_mAgwqklangGkxiD
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_mAgwqklangGkxiD:
+ jmp .L_last_blocks_done_xyDAiCmaAhzpydl
+.L_last_num_blocks_is_11_xyDAiCmaAhzpydl:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_ditvbyzmFxiaFex
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_ditvbyzmFxiaFex
+
+.L_16_blocks_overflow_ditvbyzmFxiaFex:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_ditvbyzmFxiaFex:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_hnpDdEkCCcoeFCy
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_hnpDdEkCCcoeFCy
+.L_small_initial_partial_block_hnpDdEkCCcoeFCy:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_hnpDdEkCCcoeFCy:
+
+ orq %r8,%r8
+ je .L_after_reduction_hnpDdEkCCcoeFCy
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_hnpDdEkCCcoeFCy:
+ jmp .L_last_blocks_done_xyDAiCmaAhzpydl
+.L_last_num_blocks_is_12_xyDAiCmaAhzpydl:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_iDaEpwpdhbvwFws
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_iDaEpwpdhbvwFws
+
+.L_16_blocks_overflow_iDaEpwpdhbvwFws:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_iDaEpwpdhbvwFws:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_vFCCocfxfdGyktw
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_vFCCocfxfdGyktw
+.L_small_initial_partial_block_vFCCocfxfdGyktw:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_vFCCocfxfdGyktw:
+
+ orq %r8,%r8
+ je .L_after_reduction_vFCCocfxfdGyktw
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_vFCCocfxfdGyktw:
+ jmp .L_last_blocks_done_xyDAiCmaAhzpydl
+.L_last_num_blocks_is_13_xyDAiCmaAhzpydl:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_ossjtlatrhiigng
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_ossjtlatrhiigng
+
+.L_16_blocks_overflow_ossjtlatrhiigng:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_ossjtlatrhiigng:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %xmm29,%xmm21,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_CiuBkutmcuwgEdD
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_CiuBkutmcuwgEdD
+.L_small_initial_partial_block_CiuBkutmcuwgEdD:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_CiuBkutmcuwgEdD:
+
+ orq %r8,%r8
+ je .L_after_reduction_CiuBkutmcuwgEdD
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_CiuBkutmcuwgEdD:
+ jmp .L_last_blocks_done_xyDAiCmaAhzpydl
+.L_last_num_blocks_is_14_xyDAiCmaAhzpydl:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_vocABmmphunBotn
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_vocABmmphunBotn
+
+.L_16_blocks_overflow_vocABmmphunBotn:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_vocABmmphunBotn:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %ymm29,%ymm21,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_xoGwditlthtdCzd
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_xoGwditlthtdCzd
+.L_small_initial_partial_block_xoGwditlthtdCzd:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_xoGwditlthtdCzd:
+
+ orq %r8,%r8
+ je .L_after_reduction_xoGwditlthtdCzd
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_xoGwditlthtdCzd:
+ jmp .L_last_blocks_done_xyDAiCmaAhzpydl
+.L_last_num_blocks_is_15_xyDAiCmaAhzpydl:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_jbcAwazvdrBjhzu
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_jbcAwazvdrBjhzu
+
+.L_16_blocks_overflow_jbcAwazvdrBjhzu:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_jbcAwazvdrBjhzu:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_eohjglCqsfjlesq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_eohjglCqsfjlesq
+.L_small_initial_partial_block_eohjglCqsfjlesq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_eohjglCqsfjlesq:
+
+ orq %r8,%r8
+ je .L_after_reduction_eohjglCqsfjlesq
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_eohjglCqsfjlesq:
+ jmp .L_last_blocks_done_xyDAiCmaAhzpydl
+.L_last_num_blocks_is_16_xyDAiCmaAhzpydl:
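+	/* Tail with 16 blocks pending: the final 64-byte chunk may be short, so k1 is loaded from byte64_len_to_mask_table to cover the last (len - 192) bytes. */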
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
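+	/* If adding 16 would carry out of the low byte of the big-endian counter, take the overflow path below, which byte-swaps, adds and swaps back. */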
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_uatdhlpChpnBofk
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_uatdhlpChpnBofk
+
+.L_16_blocks_overflow_uatdhlpChpnBofk:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_uatdhlpChpnBofk:
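+	/* Encrypt the 16 counter blocks, interleaving the AES rounds with carry-less multiplies that fold previously buffered ciphertext (kept on the stack) into the GHASH accumulators. */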
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_uvEqevkuejAoeFv:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_uvEqevkuejAoeFv:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_uvEqevkuejAoeFv:
+ jmp .L_last_blocks_done_xyDAiCmaAhzpydl
+.L_last_num_blocks_is_0_xyDAiCmaAhzpydl:
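+	/* Nothing left to encrypt: just fold the last buffered ciphertext blocks into GHASH and do the final reduction. */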
+ vmovdqa64 1024(%rsp),%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1088(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1152(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1216(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_xyDAiCmaAhzpydl:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_brADimEeCnCcDmv
+.L_encrypt_32_blocks_brADimEeCnCcDmv:
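+	/* 32 blocks remain past the main loop: run two more 16-block CTR passes while hashing earlier buffered ciphertext, then fall into the per-count tail dispatch. */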
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_brlCzGBjhaqyEcd
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_brlCzGBjhaqyEcd
+.L_16_blocks_overflow_brlCzGBjhaqyEcd:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_brlCzGBjhaqyEcd:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm15,%zmm10,%zmm26
+ vpxorq %zmm12,%zmm6,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1)
+ vpshufb %zmm29,%zmm17,%zmm0
+ vpshufb %zmm29,%zmm19,%zmm3
+ vpshufb %zmm29,%zmm20,%zmm4
+ vpshufb %zmm29,%zmm21,%zmm5
+ vmovdqa64 %zmm0,1280(%rsp)
+ vmovdqa64 %zmm3,1344(%rsp)
+ vmovdqa64 %zmm4,1408(%rsp)
+ vmovdqa64 %zmm5,1472(%rsp)
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_pchieDggcEipdhz
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_pchieDggcEipdhz
+.L_16_blocks_overflow_pchieDggcEipdhz:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_pchieDggcEipdhz:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 256(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 320(%rsp),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 384(%rsp),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 448(%rsp),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 256(%rcx,%r11,1),%zmm17
+ vmovdqu8 320(%rcx,%r11,1),%zmm19
+ vmovdqu8 384(%rcx,%r11,1),%zmm20
+ vmovdqu8 448(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vpternlogq $0x96,%zmm12,%zmm6,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,256(%r10,%r11,1)
+ vmovdqu8 %zmm3,320(%r10,%r11,1)
+ vmovdqu8 %zmm4,384(%r10,%r11,1)
+ vmovdqu8 %zmm5,448(%r10,%r11,1)
+ vpshufb %zmm29,%zmm17,%zmm0
+ vpshufb %zmm29,%zmm19,%zmm3
+ vpshufb %zmm29,%zmm20,%zmm4
+ vpshufb %zmm29,%zmm21,%zmm5
+ vmovdqa64 %zmm0,768(%rsp)
+ vmovdqa64 %zmm3,832(%rsp)
+ vmovdqa64 %zmm4,896(%rsp)
+ vmovdqa64 %zmm5,960(%rsp)
+ vmovdqa64 1280(%rsp),%zmm13
+ vmovdqu64 512(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1344(%rsp),%zmm13
+ vmovdqu64 576(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1408(%rsp),%zmm13
+ vmovdqu64 640(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1472(%rsp),%zmm13
+ vmovdqu64 704(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
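+	/* Account for the 512 bytes just handled; r10d becomes the number of 16-byte blocks still pending (rounded up) and rbx re-bases the later stack accesses. */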
+ subq $512,%r8
+ addq $512,%r11
+ movl %r8d,%r10d
+ andl $~15,%r10d
+ movl $512,%ebx
+ subl %r10d,%ebx
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_digsBljoDvGeopi
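+	/* Dispatch on the remaining block count (1..16) via a short comparison tree. */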
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_digsBljoDvGeopi
+ jb .L_last_num_blocks_is_7_1_digsBljoDvGeopi
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_digsBljoDvGeopi
+ jb .L_last_num_blocks_is_11_9_digsBljoDvGeopi
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_digsBljoDvGeopi
+ ja .L_last_num_blocks_is_16_digsBljoDvGeopi
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_digsBljoDvGeopi
+ jmp .L_last_num_blocks_is_13_digsBljoDvGeopi
+
+.L_last_num_blocks_is_11_9_digsBljoDvGeopi:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_digsBljoDvGeopi
+ ja .L_last_num_blocks_is_11_digsBljoDvGeopi
+ jmp .L_last_num_blocks_is_9_digsBljoDvGeopi
+
+.L_last_num_blocks_is_7_1_digsBljoDvGeopi:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_digsBljoDvGeopi
+ jb .L_last_num_blocks_is_3_1_digsBljoDvGeopi
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_digsBljoDvGeopi
+ je .L_last_num_blocks_is_6_digsBljoDvGeopi
+ jmp .L_last_num_blocks_is_5_digsBljoDvGeopi
+
+.L_last_num_blocks_is_3_1_digsBljoDvGeopi:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_digsBljoDvGeopi
+ je .L_last_num_blocks_is_2_digsBljoDvGeopi
+.L_last_num_blocks_is_1_digsBljoDvGeopi:
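+	/* Single remaining block: the same encrypt-and-hash pattern narrowed to xmm width, with k1 masking the partial load and store. */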
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_eopubcfobBxhpzt
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_eopubcfobBxhpzt
+
+.L_16_blocks_overflow_eopubcfobBxhpzt:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_eopubcfobBxhpzt:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %xmm29,%xmm17,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_GethbnvGqcjphdB
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_GethbnvGqcjphdB
+.L_small_initial_partial_block_GethbnvGqcjphdB:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm0
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
+ vpslldq $8,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm3
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
+ vpsrldq $4,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm4,%xmm14
+
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_GethbnvGqcjphdB
+.L_small_initial_compute_done_GethbnvGqcjphdB:
+.L_after_reduction_GethbnvGqcjphdB:
+ jmp .L_last_blocks_done_digsBljoDvGeopi
+.L_last_num_blocks_is_2_digsBljoDvGeopi:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_tpsnzcptGBjneak
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_tpsnzcptGBjneak
+
+.L_16_blocks_overflow_tpsnzcptGBjneak:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_tpsnzcptGBjneak:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %ymm29,%ymm17,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_xzAlvFvGbtFmqjz
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_xzAlvFvGbtFmqjz
+.L_small_initial_partial_block_xzAlvFvGbtFmqjz:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_xzAlvFvGbtFmqjz:
+
+ orq %r8,%r8
+ je .L_after_reduction_xzAlvFvGbtFmqjz
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_xzAlvFvGbtFmqjz:
+ jmp .L_last_blocks_done_digsBljoDvGeopi
+.L_last_num_blocks_is_3_digsBljoDvGeopi:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_lirgnnkvzmitoxw
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_lirgnnkvzmitoxw
+
+.L_16_blocks_overflow_lirgnnkvzmitoxw:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_lirgnnkvzmitoxw:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ovClAwtFzFgwrxE
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ovClAwtFzFgwrxE
+.L_small_initial_partial_block_ovClAwtFzFgwrxE:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ovClAwtFzFgwrxE:
+
+ orq %r8,%r8
+ je .L_after_reduction_ovClAwtFzFgwrxE
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ovClAwtFzFgwrxE:
+ jmp .L_last_blocks_done_digsBljoDvGeopi
+.L_last_num_blocks_is_4_digsBljoDvGeopi:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_xgCtemAejdionch
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_xgCtemAejdionch
+
+.L_16_blocks_overflow_xgCtemAejdionch:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_xgCtemAejdionch:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_iEyBjAGEhdmCFpz
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_iEyBjAGEhdmCFpz
+.L_small_initial_partial_block_iEyBjAGEhdmCFpz:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_iEyBjAGEhdmCFpz:
+
+ orq %r8,%r8
+ je .L_after_reduction_iEyBjAGEhdmCFpz
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_iEyBjAGEhdmCFpz:
+ jmp .L_last_blocks_done_digsBljoDvGeopi
+.L_last_num_blocks_is_5_digsBljoDvGeopi:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_eojywxfxbxGnElA
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_eojywxfxbxGnElA
+
+.L_16_blocks_overflow_eojywxfxbxGnElA:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_eojywxfxbxGnElA:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %xmm29,%xmm19,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_xzyrfzavvdvxobt
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_xzyrfzavvdvxobt
+.L_small_initial_partial_block_xzyrfzavvdvxobt:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_xzyrfzavvdvxobt:
+
+ orq %r8,%r8
+ je .L_after_reduction_xzyrfzavvdvxobt
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_xzyrfzavvdvxobt:
+ jmp .L_last_blocks_done_digsBljoDvGeopi
+.L_last_num_blocks_is_6_digsBljoDvGeopi:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_fefwvFrCitcygrh
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_fefwvFrCitcygrh
+
+.L_16_blocks_overflow_fefwvFrCitcygrh:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_fefwvFrCitcygrh:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %ymm29,%ymm19,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_EGwsgDahgpEisFa
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_EGwsgDahgpEisFa
+.L_small_initial_partial_block_EGwsgDahgpEisFa:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_EGwsgDahgpEisFa:
+
+ orq %r8,%r8
+ je .L_after_reduction_EGwsgDahgpEisFa
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_EGwsgDahgpEisFa:
+ jmp .L_last_blocks_done_digsBljoDvGeopi
+.L_last_num_blocks_is_7_digsBljoDvGeopi:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_GiAftkxuDrwByoy
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_GiAftkxuDrwByoy
+
+.L_16_blocks_overflow_GiAftkxuDrwByoy:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_GiAftkxuDrwByoy:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_pvtnwvrCesGFzzt
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_pvtnwvrCesGFzzt
+.L_small_initial_partial_block_pvtnwvrCesGFzzt:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_pvtnwvrCesGFzzt:
+
+ orq %r8,%r8
+ je .L_after_reduction_pvtnwvrCesGFzzt
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_pvtnwvrCesGFzzt:
+ jmp .L_last_blocks_done_digsBljoDvGeopi
+.L_last_num_blocks_is_8_digsBljoDvGeopi:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_fdotfBFcguDtbBo
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_fdotfBFcguDtbBo
+
+.L_16_blocks_overflow_fdotfBFcguDtbBo:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_fdotfBFcguDtbBo:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_wvodhAGehoxjCmp
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_wvodhAGehoxjCmp
+.L_small_initial_partial_block_wvodhAGehoxjCmp:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_wvodhAGehoxjCmp:
+
+ orq %r8,%r8
+ je .L_after_reduction_wvodhAGehoxjCmp
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_wvodhAGehoxjCmp:
+ jmp .L_last_blocks_done_digsBljoDvGeopi
+.L_last_num_blocks_is_9_digsBljoDvGeopi:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_GcksGDvymbkGaeh
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_GcksGDvymbkGaeh
+
+.L_16_blocks_overflow_GcksGDvymbkGaeh:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_GcksGDvymbkGaeh:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %xmm29,%xmm20,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_uqlihfyhxyhihvk
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_uqlihfyhxyhihvk
+.L_small_initial_partial_block_uqlihfyhxyhihvk:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_uqlihfyhxyhihvk:
+
+ orq %r8,%r8
+ je .L_after_reduction_uqlihfyhxyhihvk
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_uqlihfyhxyhihvk:
+ jmp .L_last_blocks_done_digsBljoDvGeopi
+.L_last_num_blocks_is_10_digsBljoDvGeopi:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_bjDavzoezpzksBl
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_bjDavzoezpzksBl
+
+.L_16_blocks_overflow_bjDavzoezpzksBl:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_bjDavzoezpzksBl:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %ymm29,%ymm20,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_thhwkdBkbzuszkb
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_thhwkdBkbzuszkb
+.L_small_initial_partial_block_thhwkdBkbzuszkb:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_thhwkdBkbzuszkb:
+
+ orq %r8,%r8
+ je .L_after_reduction_thhwkdBkbzuszkb
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_thhwkdBkbzuszkb:
+ jmp .L_last_blocks_done_digsBljoDvGeopi
+.L_last_num_blocks_is_11_digsBljoDvGeopi:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_epoBmnewvcDxoga
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_epoBmnewvcDxoga
+
+.L_16_blocks_overflow_epoBmnewvcDxoga:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_epoBmnewvcDxoga:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_xCrDaEDvhzCAvdw
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_xCrDaEDvhzCAvdw
+.L_small_initial_partial_block_xCrDaEDvhzCAvdw:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_xCrDaEDvhzCAvdw:
+
+ orq %r8,%r8
+ je .L_after_reduction_xCrDaEDvhzCAvdw
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_xCrDaEDvhzCAvdw:
+ jmp .L_last_blocks_done_digsBljoDvGeopi
+.L_last_num_blocks_is_12_digsBljoDvGeopi:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_jDebikuAmaaarvn
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_jDebikuAmaaarvn
+
+.L_16_blocks_overflow_jDebikuAmaaarvn:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_jDebikuAmaaarvn:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ynohxakFGzjuDGi
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ynohxakFGzjuDGi
+.L_small_initial_partial_block_ynohxakFGzjuDGi:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ynohxakFGzjuDGi:
+
+ orq %r8,%r8
+ je .L_after_reduction_ynohxakFGzjuDGi
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ynohxakFGzjuDGi:
+ jmp .L_last_blocks_done_digsBljoDvGeopi
+.L_last_num_blocks_is_13_digsBljoDvGeopi:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_hshekyDxCginrlC
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_hshekyDxCginrlC
+
+.L_16_blocks_overflow_hshekyDxCginrlC:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_hshekyDxCginrlC:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %xmm29,%xmm21,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_httDwjAaGCslaiE
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_httDwjAaGCslaiE
+.L_small_initial_partial_block_httDwjAaGCslaiE:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_httDwjAaGCslaiE:
+
+ orq %r8,%r8
+ je .L_after_reduction_httDwjAaGCslaiE
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_httDwjAaGCslaiE:
+ jmp .L_last_blocks_done_digsBljoDvGeopi
+.L_last_num_blocks_is_14_digsBljoDvGeopi:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_DrtmyDmpgCneBsy
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_DrtmyDmpgCneBsy
+
+.L_16_blocks_overflow_DrtmyDmpgCneBsy:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_DrtmyDmpgCneBsy:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %ymm29,%ymm21,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_fAmeqrcqmahfygz
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_fAmeqrcqmahfygz
+.L_small_initial_partial_block_fAmeqrcqmahfygz:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_fAmeqrcqmahfygz:
+
+ orq %r8,%r8
+ je .L_after_reduction_fAmeqrcqmahfygz
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_fAmeqrcqmahfygz:
+ jmp .L_last_blocks_done_digsBljoDvGeopi
+.L_last_num_blocks_is_15_digsBljoDvGeopi:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_jakbeEuDkermeem
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_jakbeEuDkermeem
+
+.L_16_blocks_overflow_jakbeEuDkermeem:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_jakbeEuDkermeem:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_czuljoFmwduytgq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_czuljoFmwduytgq
+.L_small_initial_partial_block_czuljoFmwduytgq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_czuljoFmwduytgq:
+
+ orq %r8,%r8
+ je .L_after_reduction_czuljoFmwduytgq
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_czuljoFmwduytgq:
+ jmp .L_last_blocks_done_digsBljoDvGeopi
+.L_last_num_blocks_is_16_digsBljoDvGeopi:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_pFvBGotBaidmClB
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_pFvBGotBaidmClB
+
+.L_16_blocks_overflow_pFvBGotBaidmClB:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_pFvBGotBaidmClB:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_rlrrckDhqtmvgrG:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_rlrrckDhqtmvgrG:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_rlrrckDhqtmvgrG:
+ jmp .L_last_blocks_done_digsBljoDvGeopi
+.L_last_num_blocks_is_0_digsBljoDvGeopi:
+ vmovdqa64 768(%rsp),%zmm13
+ vpxorq %zmm14,%zmm13,%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 832(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpxorq %zmm10,%zmm4,%zmm26
+ vpxorq %zmm6,%zmm0,%zmm24
+ vpxorq %zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 896(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 960(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_digsBljoDvGeopi:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_brADimEeCnCcDmv
+.L_encrypt_16_blocks_brADimEeCnCcDmv:
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_mBiujfnyqjDacBo
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_mBiujfnyqjDacBo
+.L_16_blocks_overflow_mBiujfnyqjDacBo:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_mBiujfnyqjDacBo:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm15,%zmm10,%zmm26
+ vpxorq %zmm12,%zmm6,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1)
+ vpshufb %zmm29,%zmm17,%zmm0
+ vpshufb %zmm29,%zmm19,%zmm3
+ vpshufb %zmm29,%zmm20,%zmm4
+ vpshufb %zmm29,%zmm21,%zmm5
+ vmovdqa64 %zmm0,1280(%rsp)
+ vmovdqa64 %zmm3,1344(%rsp)
+ vmovdqa64 %zmm4,1408(%rsp)
+ vmovdqa64 %zmm5,1472(%rsp)
+ vmovdqa64 1024(%rsp),%zmm13
+ vmovdqu64 256(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1088(%rsp),%zmm13
+ vmovdqu64 320(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1152(%rsp),%zmm13
+ vmovdqu64 384(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1216(%rsp),%zmm13
+ vmovdqu64 448(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ subq $256,%r8
+ addq $256,%r11
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_hpinkedxAsgwrDG
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_hpinkedxAsgwrDG
+ jb .L_last_num_blocks_is_7_1_hpinkedxAsgwrDG
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_hpinkedxAsgwrDG
+ jb .L_last_num_blocks_is_11_9_hpinkedxAsgwrDG
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_hpinkedxAsgwrDG
+ ja .L_last_num_blocks_is_16_hpinkedxAsgwrDG
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_hpinkedxAsgwrDG
+ jmp .L_last_num_blocks_is_13_hpinkedxAsgwrDG
+
+.L_last_num_blocks_is_11_9_hpinkedxAsgwrDG:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_hpinkedxAsgwrDG
+ ja .L_last_num_blocks_is_11_hpinkedxAsgwrDG
+ jmp .L_last_num_blocks_is_9_hpinkedxAsgwrDG
+
+.L_last_num_blocks_is_7_1_hpinkedxAsgwrDG:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_hpinkedxAsgwrDG
+ jb .L_last_num_blocks_is_3_1_hpinkedxAsgwrDG
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_hpinkedxAsgwrDG
+ je .L_last_num_blocks_is_6_hpinkedxAsgwrDG
+ jmp .L_last_num_blocks_is_5_hpinkedxAsgwrDG
+
+.L_last_num_blocks_is_3_1_hpinkedxAsgwrDG:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_hpinkedxAsgwrDG
+ je .L_last_num_blocks_is_2_hpinkedxAsgwrDG
+.L_last_num_blocks_is_1_hpinkedxAsgwrDG:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_fBBmqqamxsbkcrt
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_fBBmqqamxsbkcrt
+
+.L_16_blocks_overflow_fBBmqqamxsbkcrt:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_fBBmqqamxsbkcrt:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %xmm31,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %xmm29,%xmm17,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_lrfgmFpfobGvwfj
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_lrfgmFpfobGvwfj
+.L_small_initial_partial_block_lrfgmFpfobGvwfj:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_lrfgmFpfobGvwfj
+.L_small_initial_compute_done_lrfgmFpfobGvwfj:
+.L_after_reduction_lrfgmFpfobGvwfj:
+ jmp .L_last_blocks_done_hpinkedxAsgwrDG
+.L_last_num_blocks_is_2_hpinkedxAsgwrDG:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_xDanrAoaAcACiFw
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_xDanrAoaAcACiFw
+
+.L_16_blocks_overflow_xDanrAoaAcACiFw:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_xDanrAoaAcACiFw:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %ymm31,%ymm0,%ymm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %ymm29,%ymm17,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_rgsstcnEqnxrxBs
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_rgsstcnEqnxrxBs
+.L_small_initial_partial_block_rgsstcnEqnxrxBs:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_rgsstcnEqnxrxBs:
+
+ orq %r8,%r8
+ je .L_after_reduction_rgsstcnEqnxrxBs
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_rgsstcnEqnxrxBs:
+ jmp .L_last_blocks_done_hpinkedxAsgwrDG
+.L_last_num_blocks_is_3_hpinkedxAsgwrDG:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_lrqqcheobutysur
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_lrqqcheobutysur
+
+.L_16_blocks_overflow_lrqqcheobutysur:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_lrqqcheobutysur:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_xejmrnqBpubjbjg
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_xejmrnqBpubjbjg
+.L_small_initial_partial_block_xejmrnqBpubjbjg:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_xejmrnqBpubjbjg:
+
+ orq %r8,%r8
+ je .L_after_reduction_xejmrnqBpubjbjg
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_xejmrnqBpubjbjg:
+ jmp .L_last_blocks_done_hpinkedxAsgwrDG
+.L_last_num_blocks_is_4_hpinkedxAsgwrDG:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_gjemvxDziwfmcyi
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_gjemvxDziwfmcyi
+
+.L_16_blocks_overflow_gjemvxDziwfmcyi:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_gjemvxDziwfmcyi:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_fCcphAbbvbdCpEo
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_fCcphAbbvbdCpEo
+.L_small_initial_partial_block_fCcphAbbvbdCpEo:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_fCcphAbbvbdCpEo:
+
+ orq %r8,%r8
+ je .L_after_reduction_fCcphAbbvbdCpEo
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_fCcphAbbvbdCpEo:
+ jmp .L_last_blocks_done_hpinkedxAsgwrDG
+.L_last_num_blocks_is_5_hpinkedxAsgwrDG:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_ftkjlfgrvFmBAqj
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_ftkjlfgrvFmBAqj
+
+.L_16_blocks_overflow_ftkjlfgrvFmBAqj:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_ftkjlfgrvFmBAqj:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %xmm29,%xmm19,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_GcmEpgzDnksqGvv
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_GcmEpgzDnksqGvv
+.L_small_initial_partial_block_GcmEpgzDnksqGvv:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_GcmEpgzDnksqGvv:
+
+ orq %r8,%r8
+ je .L_after_reduction_GcmEpgzDnksqGvv
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_GcmEpgzDnksqGvv:
+ jmp .L_last_blocks_done_hpinkedxAsgwrDG
+.L_last_num_blocks_is_6_hpinkedxAsgwrDG:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_wcFtAwbEGtnhhov
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_wcFtAwbEGtnhhov
+
+.L_16_blocks_overflow_wcFtAwbEGtnhhov:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_wcFtAwbEGtnhhov:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %ymm29,%ymm19,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ljhumqErtfjivdq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ljhumqErtfjivdq
+.L_small_initial_partial_block_ljhumqErtfjivdq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ljhumqErtfjivdq:
+
+ orq %r8,%r8
+ je .L_after_reduction_ljhumqErtfjivdq
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ljhumqErtfjivdq:
+ jmp .L_last_blocks_done_hpinkedxAsgwrDG
+.L_last_num_blocks_is_7_hpinkedxAsgwrDG:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_xipoAqDkcCyBFhx
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_xipoAqDkcCyBFhx
+
+.L_16_blocks_overflow_xipoAqDkcCyBFhx:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_xipoAqDkcCyBFhx:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
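+
+/*
+ * The 256-bit GHASH product is now split across %xmm7 (low half) and
+ * %xmm14 (high half); reduce it modulo the GCM polynomial using the POLY2
+ * constant, leaving the folded 128-bit result in %xmm14.
+ */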
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_jeohFFoGiiGxanC
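+
+/*
+ * %r8 now holds the length of the seventh block.  If it is a full 16 bytes,
+ * clear the deferred-byte count at (%rdx) and hash all seven blocks below;
+ * otherwise branch to the partial-block path, which defers the last block.
+ */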
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_jeohFFoGiiGxanC
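+
+/*
+ * Partial final block: record the leftover byte count at (%rdx) and save the
+ * last output block (%xmm11) at 16(%rsi) so the block can be completed
+ * later, then fold only the six full blocks into the GHASH state using the
+ * hash-key powers at 256(%rsi)/320(%rsi).
+ */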
+.L_small_initial_partial_block_jeohFFoGiiGxanC:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_jeohFFoGiiGxanC:
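+/*
+ * If trailing bytes remain in a partial block (%r8 != 0), XOR the
+ * byte-reflected partial block (%xmm7) into the GHASH accumulator now; its
+ * multiplication by H is deferred until the block is filled.
+ */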
+
+ orq %r8,%r8
+ je .L_after_reduction_jeohFFoGiiGxanC
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_jeohFFoGiiGxanC:
+ jmp .L_last_blocks_done_hpinkedxAsgwrDG
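+
+/*
+ * The remaining per-count handlers below (eight residual blocks and up)
+ * follow the same layout as the seven-block case above; they differ only in
+ * the mask-table bias applied to %r8, the number of counter/cipher registers
+ * in flight, the lane extracted for the last block, and the (%rsi) offsets
+ * of the hash-key powers used in the final GHASH.
+ */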
+.L_last_num_blocks_is_8_hpinkedxAsgwrDG:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_CxhquljwEiGywcd
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_CxhquljwEiGywcd
+
+.L_16_blocks_overflow_CxhquljwEiGywcd:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_CxhquljwEiGywcd:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_eqywyFyndjkBDnx
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_eqywyFyndjkBDnx
+.L_small_initial_partial_block_eqywyFyndjkBDnx:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_eqywyFyndjkBDnx:
+
+ orq %r8,%r8
+ je .L_after_reduction_eqywyFyndjkBDnx
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_eqywyFyndjkBDnx:
+ jmp .L_last_blocks_done_hpinkedxAsgwrDG
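+
+/*
+ * With nine or more residual blocks a third counter/cipher register
+ * (%xmm4/%ymm4/%zmm4 as the count grows) enters the pipeline and a third
+ * input chunk is loaded from offset 128; the mask index becomes the leftover
+ * length minus 128.
+ */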
+.L_last_num_blocks_is_9_hpinkedxAsgwrDG:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_tqfxslkwuCurEnc
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_tqfxslkwuCurEnc
+
+.L_16_blocks_overflow_tqfxslkwuCurEnc:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_tqfxslkwuCurEnc:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %xmm29,%xmm20,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_pxwcCmexoxpnkgA
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_pxwcCmexoxpnkgA
+.L_small_initial_partial_block_pxwcCmexoxpnkgA:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_pxwcCmexoxpnkgA:
+
+ orq %r8,%r8
+ je .L_after_reduction_pxwcCmexoxpnkgA
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_pxwcCmexoxpnkgA:
+ jmp .L_last_blocks_done_hpinkedxAsgwrDG
+.L_last_num_blocks_is_10_hpinkedxAsgwrDG:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_tiwCrijFxfsopuz
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_tiwCrijFxfsopuz
+
+.L_16_blocks_overflow_tiwCrijFxfsopuz:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_tiwCrijFxfsopuz:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %ymm29,%ymm20,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_rjgbwiCDGnxhaGp
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_rjgbwiCDGnxhaGp
+.L_small_initial_partial_block_rjgbwiCDGnxhaGp:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_rjgbwiCDGnxhaGp:
+
+ orq %r8,%r8
+ je .L_after_reduction_rjgbwiCDGnxhaGp
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_rjgbwiCDGnxhaGp:
+ jmp .L_last_blocks_done_hpinkedxAsgwrDG
+.L_last_num_blocks_is_11_hpinkedxAsgwrDG:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_wphxdqsnBGrxkBa
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_wphxdqsnBGrxkBa
+
+.L_16_blocks_overflow_wphxdqsnBGrxkBa:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_wphxdqsnBGrxkBa:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_DAeDyvlteBcjnnm
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_DAeDyvlteBcjnnm
+.L_small_initial_partial_block_DAeDyvlteBcjnnm:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_DAeDyvlteBcjnnm:
+
+ orq %r8,%r8
+ je .L_after_reduction_DAeDyvlteBcjnnm
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_DAeDyvlteBcjnnm:
+ jmp .L_last_blocks_done_hpinkedxAsgwrDG
+.L_last_num_blocks_is_12_hpinkedxAsgwrDG:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_btzqkvdAeDABvcj
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_btzqkvdAeDABvcj
+
+.L_16_blocks_overflow_btzqkvdAeDABvcj:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_btzqkvdAeDABvcj:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_BAFapfuAGyFkstm
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_BAFapfuAGyFkstm
+.L_small_initial_partial_block_BAFapfuAGyFkstm:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_BAFapfuAGyFkstm:
+
+ orq %r8,%r8
+ je .L_after_reduction_BAFapfuAGyFkstm
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_BAFapfuAGyFkstm:
+ jmp .L_last_blocks_done_hpinkedxAsgwrDG
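+
+/*
+ * The 13- and 14-block handlers add a fourth counter/cipher register
+ * (%xmm5/%ymm5) and a fourth input chunk at offset 192; the mask index
+ * becomes the leftover length minus 192.
+ */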
+.L_last_num_blocks_is_13_hpinkedxAsgwrDG:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_eqBacrjkweGnBBv
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_eqBacrjkweGnBBv
+
+.L_16_blocks_overflow_eqBacrjkweGnBBv:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_eqBacrjkweGnBBv:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %xmm29,%xmm21,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_zzCAagwwuuueoBh
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_zzCAagwwuuueoBh
+.L_small_initial_partial_block_zzCAagwwuuueoBh:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_zzCAagwwuuueoBh:
+
+ orq %r8,%r8
+ je .L_after_reduction_zzCAagwwuuueoBh
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_zzCAagwwuuueoBh:
+ jmp .L_last_blocks_done_hpinkedxAsgwrDG
+.L_last_num_blocks_is_14_hpinkedxAsgwrDG:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_hBvbhuzsjeqFuma
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_hBvbhuzsjeqFuma
+
+.L_16_blocks_overflow_hBvbhuzsjeqFuma:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_hBvbhuzsjeqFuma:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %ymm29,%ymm21,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_mwionbCzEjjlanp
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_mwionbCzEjjlanp
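+/* Partial final block: the path below records the leftover byte count at
+   (%rdx) and stashes the last output block at 16(%rsi), presumably so a later
+   call can complete the partial block, then ghash-es only the complete blocks
+   (the hash-key table offset is 16 bytes past the one used on the full-block
+   path above). */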
+.L_small_initial_partial_block_mwionbCzEjjlanp:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_mwionbCzEjjlanp:
+
+ orq %r8,%r8
+ je .L_after_reduction_mwionbCzEjjlanp
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_mwionbCzEjjlanp:
+ jmp .L_last_blocks_done_hpinkedxAsgwrDG
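+/* Tail handler for 15 remaining blocks: derive a byte mask for the final
+   64-byte chunk from byte64_len_to_mask_table, build 15 counter blocks
+   (taking the byte-swapped slow path if the low 8-bit counter byte would
+   wrap), run the AES rounds interleaved with ghash multiplies of the 16
+   previously saved blocks, XOR the keystream with the masked input, then fold
+   the new blocks into the hash and reduce modulo POLY2. */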
+.L_last_num_blocks_is_15_hpinkedxAsgwrDG:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_BDaqedvcvzqmjwo
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_BDaqedvcvzqmjwo
+
+.L_16_blocks_overflow_BDaqedvcvzqmjwo:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_BDaqedvcvzqmjwo:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_EFDnDGjBfhFbjps
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_EFDnDGjBfhFbjps
+.L_small_initial_partial_block_EFDnDGjBfhFbjps:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_EFDnDGjBfhFbjps:
+
+ orq %r8,%r8
+ je .L_after_reduction_EFDnDGjBfhFbjps
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_EFDnDGjBfhFbjps:
+ jmp .L_last_blocks_done_hpinkedxAsgwrDG
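+/* Tail handler for exactly 16 remaining blocks: same structure as the
+   15-block case above, except that after processing the 16 counter blocks it
+   falls straight into the partial-block bookkeeping (no cmpq $16 test). */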
+.L_last_num_blocks_is_16_hpinkedxAsgwrDG:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_etaGdjDbzcppuhm
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_etaGdjDbzcppuhm
+
+.L_16_blocks_overflow_etaGdjDbzcppuhm:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_etaGdjDbzcppuhm:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_zcehcCvffqhlrEC:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_zcehcCvffqhlrEC:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_zcehcCvffqhlrEC:
+ jmp .L_last_blocks_done_hpinkedxAsgwrDG
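+/* No whole blocks left: multiply the 16 previously saved blocks by their
+   hash-key powers, combine the accumulators and reduce modulo POLY2 into
+   %xmm14. */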
+.L_last_num_blocks_is_0_hpinkedxAsgwrDG:
+ vmovdqa64 1280(%rsp),%zmm13
+ vmovdqu64 512(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1344(%rsp),%zmm13
+ vmovdqu64 576(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1408(%rsp),%zmm13
+ vmovdqu64 640(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1472(%rsp),%zmm13
+ vmovdqu64 704(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_hpinkedxAsgwrDG:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_brADimEeCnCcDmv
+
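+/* Message shorter than 32 blocks: the first 256 bytes (16 blocks) were
+   already handled, so adjust the remaining length and offset, extend the
+   hash-key table if that has not been done yet, and dispatch on the number of
+   blocks that remain. */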
+.L_message_below_32_blocks_brADimEeCnCcDmv:
+
+
+ subq $256,%r8
+ addq $256,%r11
+ movl %r8d,%r10d
+ testq %r14,%r14
+ jnz .L_skip_hkeys_precomputation_hlnFoocmixcFBsB
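+/* Extend the on-stack table of hash-key powers: each group below is a
+   carry-less multiply (vpclmulqdq) of cached powers followed by a POLY2
+   reduction, with the results stored at 448/384/320/256(%rsp).  %r14 acts as
+   a done flag so this work is performed at most once. */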
+ vmovdqu64 640(%rsp),%zmm3
+
+
+ vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
+
+ vmovdqu64 576(%rsp),%zmm4
+ vmovdqu64 512(%rsp),%zmm5
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,448(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,384(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,320(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,256(%rsp)
+.L_skip_hkeys_precomputation_hlnFoocmixcFBsB:
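+/* Mark the key table as built (%r14 = 1), compute the offset into the stacked
+   key powers (%rbx) and the number of blocks that remain (ceil(%r8 / 16)),
+   then branch to the matching handler for 0..16 blocks. */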
+ movq $1,%r14
+ andl $~15,%r10d
+ movl $512,%ebx
+ subl %r10d,%ebx
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_ytkmwztBxmufdeg
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_ytkmwztBxmufdeg
+ jb .L_last_num_blocks_is_7_1_ytkmwztBxmufdeg
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_ytkmwztBxmufdeg
+ jb .L_last_num_blocks_is_11_9_ytkmwztBxmufdeg
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_ytkmwztBxmufdeg
+ ja .L_last_num_blocks_is_16_ytkmwztBxmufdeg
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_ytkmwztBxmufdeg
+ jmp .L_last_num_blocks_is_13_ytkmwztBxmufdeg
+
+.L_last_num_blocks_is_11_9_ytkmwztBxmufdeg:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_ytkmwztBxmufdeg
+ ja .L_last_num_blocks_is_11_ytkmwztBxmufdeg
+ jmp .L_last_num_blocks_is_9_ytkmwztBxmufdeg
+
+.L_last_num_blocks_is_7_1_ytkmwztBxmufdeg:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_ytkmwztBxmufdeg
+ jb .L_last_num_blocks_is_3_1_ytkmwztBxmufdeg
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_ytkmwztBxmufdeg
+ je .L_last_num_blocks_is_6_ytkmwztBxmufdeg
+ jmp .L_last_num_blocks_is_5_ytkmwztBxmufdeg
+
+.L_last_num_blocks_is_3_1_ytkmwztBxmufdeg:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_ytkmwztBxmufdeg
+ je .L_last_num_blocks_is_2_ytkmwztBxmufdeg
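+/* Handlers for the remaining block counts follow (1 block falls through
+   here).  Each builds the needed counter blocks at xmm/ymm/zmm width, runs
+   the AES rounds while ghash-ing the previously processed data, masks the
+   final partial load/store via byte64_len_to_mask_table, and folds the new
+   blocks into the hash state with a POLY2 reduction. */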
+.L_last_num_blocks_is_1_ytkmwztBxmufdeg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_bGwqvrBoAiaAwkr
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_bGwqvrBoAiaAwkr
+
+.L_16_blocks_overflow_bGwqvrBoAiaAwkr:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_bGwqvrBoAiaAwkr:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %xmm29,%xmm17,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_dqohylvpeBErAsj
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_dqohylvpeBErAsj
+.L_small_initial_partial_block_dqohylvpeBErAsj:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm0
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
+ vpslldq $8,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm3
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
+ vpsrldq $4,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm4,%xmm14
+
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_dqohylvpeBErAsj
+.L_small_initial_compute_done_dqohylvpeBErAsj:
+.L_after_reduction_dqohylvpeBErAsj:
+ jmp .L_last_blocks_done_ytkmwztBxmufdeg
+.L_last_num_blocks_is_2_ytkmwztBxmufdeg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_lsDChrkFfFrGvvk
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_lsDChrkFfFrGvvk
+
+.L_16_blocks_overflow_lsDChrkFfFrGvvk:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_lsDChrkFfFrGvvk:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %ymm29,%ymm17,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_Bgmdyvgptvfwdit
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_Bgmdyvgptvfwdit
+.L_small_initial_partial_block_Bgmdyvgptvfwdit:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_Bgmdyvgptvfwdit:
+
+ orq %r8,%r8
+ je .L_after_reduction_Bgmdyvgptvfwdit
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_Bgmdyvgptvfwdit:
+ jmp .L_last_blocks_done_ytkmwztBxmufdeg
+.L_last_num_blocks_is_3_ytkmwztBxmufdeg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_srEocbwAwxsxpma
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_srEocbwAwxsxpma
+
+.L_16_blocks_overflow_srEocbwAwxsxpma:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_srEocbwAwxsxpma:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ErkzfxFAbndCAAg
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ErkzfxFAbndCAAg
+.L_small_initial_partial_block_ErkzfxFAbndCAAg:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ErkzfxFAbndCAAg:
+
+ orq %r8,%r8
+ je .L_after_reduction_ErkzfxFAbndCAAg
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ErkzfxFAbndCAAg:
+ jmp .L_last_blocks_done_ytkmwztBxmufdeg
+.L_last_num_blocks_is_4_ytkmwztBxmufdeg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_wbyjFiCBFhEhwdm
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_wbyjFiCBFhEhwdm
+
+.L_16_blocks_overflow_wbyjFiCBFhEhwdm:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_wbyjFiCBFhEhwdm:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_sEeExElgbeebmrl
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_sEeExElgbeebmrl
+.L_small_initial_partial_block_sEeExElgbeebmrl:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_sEeExElgbeebmrl:
+
+ orq %r8,%r8
+ je .L_after_reduction_sEeExElgbeebmrl
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_sEeExElgbeebmrl:
+ jmp .L_last_blocks_done_ytkmwztBxmufdeg
+.L_last_num_blocks_is_5_ytkmwztBxmufdeg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_FhnyaskgxleEyeh
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_FhnyaskgxleEyeh
+
+.L_16_blocks_overflow_FhnyaskgxleEyeh:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_FhnyaskgxleEyeh:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %xmm29,%xmm19,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_wcgcyCwrColDBul
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_wcgcyCwrColDBul
+.L_small_initial_partial_block_wcgcyCwrColDBul:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_wcgcyCwrColDBul:
+
+ orq %r8,%r8
+ je .L_after_reduction_wcgcyCwrColDBul
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_wcgcyCwrColDBul:
+ jmp .L_last_blocks_done_ytkmwztBxmufdeg
+.L_last_num_blocks_is_6_ytkmwztBxmufdeg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_EfyidiDbmAaAaju
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_EfyidiDbmAaAaju
+
+.L_16_blocks_overflow_EfyidiDbmAaAaju:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_EfyidiDbmAaAaju:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %ymm29,%ymm19,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_jGjykEdEyDattqe
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_jGjykEdEyDattqe
+.L_small_initial_partial_block_jGjykEdEyDattqe:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_jGjykEdEyDattqe:
+
+ orq %r8,%r8
+ je .L_after_reduction_jGjykEdEyDattqe
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_jGjykEdEyDattqe:
+ jmp .L_last_blocks_done_ytkmwztBxmufdeg
+.L_last_num_blocks_is_7_ytkmwztBxmufdeg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_mzDdvEgkDwBlewp
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_mzDdvEgkDwBlewp
+
+.L_16_blocks_overflow_mzDdvEgkDwBlewp:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_mzDdvEgkDwBlewp:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_zwgGbbACgGfeFja
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_zwgGbbACgGfeFja
+.L_small_initial_partial_block_zwgGbbACgGfeFja:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_zwgGbbACgGfeFja:
+
+ orq %r8,%r8
+ je .L_after_reduction_zwgGbbACgGfeFja
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_zwgGbbACgGfeFja:
+ jmp .L_last_blocks_done_ytkmwztBxmufdeg
+.L_last_num_blocks_is_8_ytkmwztBxmufdeg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_jqmGdhzdkozCBlA
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_jqmGdhzdkozCBlA
+
+.L_16_blocks_overflow_jqmGdhzdkozCBlA:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_jqmGdhzdkozCBlA:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_Daizbjyimqaduru
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_Daizbjyimqaduru
+.L_small_initial_partial_block_Daizbjyimqaduru:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_Daizbjyimqaduru:
+
+ orq %r8,%r8
+ je .L_after_reduction_Daizbjyimqaduru
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_Daizbjyimqaduru:
+ jmp .L_last_blocks_done_ytkmwztBxmufdeg
+.L_last_num_blocks_is_9_ytkmwztBxmufdeg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_CDuwyvGbafyeBuk
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_CDuwyvGbafyeBuk
+
+.L_16_blocks_overflow_CDuwyvGbafyeBuk:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_CDuwyvGbafyeBuk:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %xmm29,%xmm20,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_kpAafwlxkcfbCCh
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_kpAafwlxkcfbCCh
+.L_small_initial_partial_block_kpAafwlxkcfbCCh:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_kpAafwlxkcfbCCh:
+
+ orq %r8,%r8
+ je .L_after_reduction_kpAafwlxkcfbCCh
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_kpAafwlxkcfbCCh:
+ jmp .L_last_blocks_done_ytkmwztBxmufdeg
+.L_last_num_blocks_is_10_ytkmwztBxmufdeg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_tDtiElGDCfanulC
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_tDtiElGDCfanulC
+
+.L_16_blocks_overflow_tDtiElGDCfanulC:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_tDtiElGDCfanulC:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %ymm29,%ymm20,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_zphfokajCjwqcAg
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_zphfokajCjwqcAg
+.L_small_initial_partial_block_zphfokajCjwqcAg:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_zphfokajCjwqcAg:
+
+ orq %r8,%r8
+ je .L_after_reduction_zphfokajCjwqcAg
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_zphfokajCjwqcAg:
+ jmp .L_last_blocks_done_ytkmwztBxmufdeg
+.L_last_num_blocks_is_11_ytkmwztBxmufdeg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_wqmiytsuGwmqxEk
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_wqmiytsuGwmqxEk
+
+.L_16_blocks_overflow_wqmiytsuGwmqxEk:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_wqmiytsuGwmqxEk:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_DlBrprmzzykyokm
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_DlBrprmzzykyokm
+.L_small_initial_partial_block_DlBrprmzzykyokm:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_DlBrprmzzykyokm:
+
+ orq %r8,%r8
+ je .L_after_reduction_DlBrprmzzykyokm
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_DlBrprmzzykyokm:
+ jmp .L_last_blocks_done_ytkmwztBxmufdeg
+.L_last_num_blocks_is_12_ytkmwztBxmufdeg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_annCtoGejoBwwxn
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_annCtoGejoBwwxn
+
+.L_16_blocks_overflow_annCtoGejoBwwxn:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_annCtoGejoBwwxn:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_viBlGurDavwztrf
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_viBlGurDavwztrf
+.L_small_initial_partial_block_viBlGurDavwztrf:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_viBlGurDavwztrf:
+
+ orq %r8,%r8
+ je .L_after_reduction_viBlGurDavwztrf
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_viBlGurDavwztrf:
+ jmp .L_last_blocks_done_ytkmwztBxmufdeg
+.L_last_num_blocks_is_13_ytkmwztBxmufdeg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_zmshcCvwkdwGlaB
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_zmshcCvwkdwGlaB
+
+.L_16_blocks_overflow_zmshcCvwkdwGlaB:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_zmshcCvwkdwGlaB:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %xmm29,%xmm21,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_kqdfAoFcBDkeGbm
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_kqdfAoFcBDkeGbm
+.L_small_initial_partial_block_kqdfAoFcBDkeGbm:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_kqdfAoFcBDkeGbm:
+
+ orq %r8,%r8
+ je .L_after_reduction_kqdfAoFcBDkeGbm
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_kqdfAoFcBDkeGbm:
+ jmp .L_last_blocks_done_ytkmwztBxmufdeg
+.L_last_num_blocks_is_14_ytkmwztBxmufdeg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_boziaaCCygjjfxw
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_boziaaCCygjjfxw
+
+.L_16_blocks_overflow_boziaaCCygjjfxw:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_boziaaCCygjjfxw:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %ymm29,%ymm21,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_znbGdxrosrCeabB
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_znbGdxrosrCeabB
+.L_small_initial_partial_block_znbGdxrosrCeabB:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_znbGdxrosrCeabB:
+
+ orq %r8,%r8
+ je .L_after_reduction_znbGdxrosrCeabB
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_znbGdxrosrCeabB:
+ jmp .L_last_blocks_done_ytkmwztBxmufdeg
+.L_last_num_blocks_is_15_ytkmwztBxmufdeg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_rliugxzwdyFGiBD
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_rliugxzwdyFGiBD
+
+.L_16_blocks_overflow_rliugxzwdyFGiBD:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_rliugxzwdyFGiBD:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_olnbAdcngmvvEdn
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_olnbAdcngmvvEdn
+.L_small_initial_partial_block_olnbAdcngmvvEdn:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_olnbAdcngmvvEdn:
+
+ orq %r8,%r8
+ je .L_after_reduction_olnbAdcngmvvEdn
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_olnbAdcngmvvEdn:
+ jmp .L_last_blocks_done_ytkmwztBxmufdeg
+.L_last_num_blocks_is_16_ytkmwztBxmufdeg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_gmEGrjFikmwGcAm
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_gmEGrjFikmwGcAm
+
+.L_16_blocks_overflow_gmEGrjFikmwGcAm:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_gmEGrjFikmwGcAm:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_dplntcAkoiBEkDo:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_dplntcAkoiBEkDo:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_dplntcAkoiBEkDo:
+ jmp .L_last_blocks_done_ytkmwztBxmufdeg
+.L_last_num_blocks_is_0_ytkmwztBxmufdeg:
+ vmovdqa64 768(%rsp),%zmm13
+ vpxorq %zmm14,%zmm13,%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 832(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpxorq %zmm10,%zmm4,%zmm26
+ vpxorq %zmm6,%zmm0,%zmm24
+ vpxorq %zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 896(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 960(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_ytkmwztBxmufdeg:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_brADimEeCnCcDmv
+
+.L_message_below_equal_16_blocks_brADimEeCnCcDmv:
+
+
+ movl %r8d,%r12d
+ addl $15,%r12d
+ shrl $4,%r12d
+ cmpq $8,%r12
+ je .L_small_initial_num_blocks_is_8_nmhEfDfgEBvcjnt
+ jl .L_small_initial_num_blocks_is_7_1_nmhEfDfgEBvcjnt
+
+
+ cmpq $12,%r12
+ je .L_small_initial_num_blocks_is_12_nmhEfDfgEBvcjnt
+ jl .L_small_initial_num_blocks_is_11_9_nmhEfDfgEBvcjnt
+
+
+ cmpq $16,%r12
+ je .L_small_initial_num_blocks_is_16_nmhEfDfgEBvcjnt
+ cmpq $15,%r12
+ je .L_small_initial_num_blocks_is_15_nmhEfDfgEBvcjnt
+ cmpq $14,%r12
+ je .L_small_initial_num_blocks_is_14_nmhEfDfgEBvcjnt
+ jmp .L_small_initial_num_blocks_is_13_nmhEfDfgEBvcjnt
+
+.L_small_initial_num_blocks_is_11_9_nmhEfDfgEBvcjnt:
+
+ cmpq $11,%r12
+ je .L_small_initial_num_blocks_is_11_nmhEfDfgEBvcjnt
+ cmpq $10,%r12
+ je .L_small_initial_num_blocks_is_10_nmhEfDfgEBvcjnt
+ jmp .L_small_initial_num_blocks_is_9_nmhEfDfgEBvcjnt
+
+.L_small_initial_num_blocks_is_7_1_nmhEfDfgEBvcjnt:
+ cmpq $4,%r12
+ je .L_small_initial_num_blocks_is_4_nmhEfDfgEBvcjnt
+ jl .L_small_initial_num_blocks_is_3_1_nmhEfDfgEBvcjnt
+
+ cmpq $7,%r12
+ je .L_small_initial_num_blocks_is_7_nmhEfDfgEBvcjnt
+ cmpq $6,%r12
+ je .L_small_initial_num_blocks_is_6_nmhEfDfgEBvcjnt
+ jmp .L_small_initial_num_blocks_is_5_nmhEfDfgEBvcjnt
+
+.L_small_initial_num_blocks_is_3_1_nmhEfDfgEBvcjnt:
+
+ cmpq $3,%r12
+ je .L_small_initial_num_blocks_is_3_nmhEfDfgEBvcjnt
+ cmpq $2,%r12
+ je .L_small_initial_num_blocks_is_2_nmhEfDfgEBvcjnt
+
+
+
+
+
+.L_small_initial_num_blocks_is_1_nmhEfDfgEBvcjnt:
+ vmovdqa64 SHUF_MASK(%rip),%xmm29
+ vpaddd ONE(%rip),%xmm2,%xmm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vpshufb %xmm29,%xmm0,%xmm0
+ vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %xmm15,%xmm0,%xmm0
+ vpxorq %xmm6,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %xmm29,%xmm6,%xmm6
+ vextracti32x4 $0,%zmm6,%xmm13
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_AyfivemhvfDjwew
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_AyfivemhvfDjwew
+.L_small_initial_partial_block_AyfivemhvfDjwew:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm13,%xmm14,%xmm14
+
+ jmp .L_after_reduction_AyfivemhvfDjwew
+.L_small_initial_compute_done_AyfivemhvfDjwew:
+.L_after_reduction_AyfivemhvfDjwew:
+ jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt
+.L_small_initial_num_blocks_is_2_nmhEfDfgEBvcjnt:
+ vmovdqa64 SHUF_MASK(%rip),%ymm29
+ vshufi64x2 $0,%ymm2,%ymm2,%ymm0
+ vpaddd ddq_add_1234(%rip),%ymm0,%ymm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vpshufb %ymm29,%ymm0,%ymm0
+ vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %ymm15,%ymm0,%ymm0
+ vpxorq %ymm6,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %ymm29,%ymm6,%ymm6
+ vextracti32x4 $1,%zmm6,%xmm13
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_mFdfDiDtuhyrCwk
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_mFdfDiDtuhyrCwk
+.L_small_initial_partial_block_mFdfDiDtuhyrCwk:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_mFdfDiDtuhyrCwk:
+
+ orq %r8,%r8
+ je .L_after_reduction_mFdfDiDtuhyrCwk
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_mFdfDiDtuhyrCwk:
+ jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt
+.L_small_initial_num_blocks_is_3_nmhEfDfgEBvcjnt:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vpxorq %zmm6,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vextracti32x4 $2,%zmm6,%xmm13
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_AvGtGumzxshjiFB
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_AvGtGumzxshjiFB
+.L_small_initial_partial_block_AvGtGumzxshjiFB:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_AvGtGumzxshjiFB:
+
+ orq %r8,%r8
+ je .L_after_reduction_AvGtGumzxshjiFB
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_AvGtGumzxshjiFB:
+ jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt
+.L_small_initial_num_blocks_is_4_nmhEfDfgEBvcjnt:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vpxorq %zmm6,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vextracti32x4 $3,%zmm6,%xmm13
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_DbentnbaeCzAufz
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_DbentnbaeCzAufz
+.L_small_initial_partial_block_DbentnbaeCzAufz:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_DbentnbaeCzAufz:
+
+ orq %r8,%r8
+ je .L_after_reduction_DbentnbaeCzAufz
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_DbentnbaeCzAufz:
+ jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt
+.L_small_initial_num_blocks_is_5_nmhEfDfgEBvcjnt:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %xmm15,%xmm3,%xmm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %xmm7,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %xmm29,%xmm7,%xmm7
+ vextracti32x4 $0,%zmm7,%xmm13
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_dnEAtijzGEDlswn
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_dnEAtijzGEDlswn
+.L_small_initial_partial_block_dnEAtijzGEDlswn:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_dnEAtijzGEDlswn:
+
+ orq %r8,%r8
+ je .L_after_reduction_dnEAtijzGEDlswn
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_dnEAtijzGEDlswn:
+ jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt
+.L_small_initial_num_blocks_is_6_nmhEfDfgEBvcjnt:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %ymm15,%ymm3,%ymm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %ymm7,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %ymm29,%ymm7,%ymm7
+ vextracti32x4 $1,%zmm7,%xmm13
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_umqipkezFkCyFdu
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_umqipkezFkCyFdu
+.L_small_initial_partial_block_umqipkezFkCyFdu:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_umqipkezFkCyFdu:
+
+ orq %r8,%r8
+ je .L_after_reduction_umqipkezFkCyFdu
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_umqipkezFkCyFdu:
+ jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt
+.L_small_initial_num_blocks_is_7_nmhEfDfgEBvcjnt:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vextracti32x4 $2,%zmm7,%xmm13
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_lEGtnzekhyuwBFz
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_lEGtnzekhyuwBFz
+.L_small_initial_partial_block_lEGtnzekhyuwBFz:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_lEGtnzekhyuwBFz:
+
+ orq %r8,%r8
+ je .L_after_reduction_lEGtnzekhyuwBFz
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_lEGtnzekhyuwBFz:
+ jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt
+.L_small_initial_num_blocks_is_8_nmhEfDfgEBvcjnt:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vextracti32x4 $3,%zmm7,%xmm13
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_EasGBEsimbhszDy
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_EasGBEsimbhszDy
+.L_small_initial_partial_block_EasGBEsimbhszDy:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_EasGBEsimbhszDy:
+
+ orq %r8,%r8
+ je .L_after_reduction_EasGBEsimbhszDy
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_EasGBEsimbhszDy:
+ jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt
+.L_small_initial_num_blocks_is_9_nmhEfDfgEBvcjnt:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %xmm15,%xmm4,%xmm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %xmm10,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %xmm29,%xmm10,%xmm10
+ vextracti32x4 $0,%zmm10,%xmm13
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_DlhndmhlkxypvAb
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_DlhndmhlkxypvAb
+.L_small_initial_partial_block_DlhndmhlkxypvAb:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_DlhndmhlkxypvAb:
+
+ orq %r8,%r8
+ je .L_after_reduction_DlhndmhlkxypvAb
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_DlhndmhlkxypvAb:
+ jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt
+.L_small_initial_num_blocks_is_10_nmhEfDfgEBvcjnt:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %ymm15,%ymm4,%ymm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %ymm10,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %ymm29,%ymm10,%ymm10
+ vextracti32x4 $1,%zmm10,%xmm13
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_cwsdomEqheptkED
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_cwsdomEqheptkED
+.L_small_initial_partial_block_cwsdomEqheptkED:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_cwsdomEqheptkED:
+
+ orq %r8,%r8
+ je .L_after_reduction_cwsdomEqheptkED
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_cwsdomEqheptkED:
+ jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt
+.L_small_initial_num_blocks_is_11_nmhEfDfgEBvcjnt:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vextracti32x4 $2,%zmm10,%xmm13
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_qxeFvgzdwFFywqx
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_qxeFvgzdwFFywqx
+.L_small_initial_partial_block_qxeFvgzdwFFywqx:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_qxeFvgzdwFFywqx:
+
+ orq %r8,%r8
+ je .L_after_reduction_qxeFvgzdwFFywqx
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_qxeFvgzdwFFywqx:
+ jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt
+.L_small_initial_num_blocks_is_12_nmhEfDfgEBvcjnt:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vextracti32x4 $3,%zmm10,%xmm13
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_oqzAvlGuDiExAmm
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 160(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_oqzAvlGuDiExAmm
+.L_small_initial_partial_block_oqzAvlGuDiExAmm:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_oqzAvlGuDiExAmm:
+
+ orq %r8,%r8
+ je .L_after_reduction_oqzAvlGuDiExAmm
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_oqzAvlGuDiExAmm:
+ jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt
+.L_small_initial_num_blocks_is_13_nmhEfDfgEBvcjnt:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %xmm15,%xmm5,%xmm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %xmm11,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %xmm29,%xmm11,%xmm11
+ vextracti32x4 $0,%zmm11,%xmm13
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_yqGygqlhwnnpjbq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 144(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_yqGygqlhwnnpjbq
+.L_small_initial_partial_block_yqGygqlhwnnpjbq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 160(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_yqGygqlhwnnpjbq:
+
+ orq %r8,%r8
+ je .L_after_reduction_yqGygqlhwnnpjbq
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_yqGygqlhwnnpjbq:
+ jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt
+.L_small_initial_num_blocks_is_14_nmhEfDfgEBvcjnt:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %ymm15,%ymm5,%ymm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %ymm11,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %ymm29,%ymm11,%ymm11
+ vextracti32x4 $1,%zmm11,%xmm13
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_wByexunpeunlcgC
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 128(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_wByexunpeunlcgC
+.L_small_initial_partial_block_wByexunpeunlcgC:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 144(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_wByexunpeunlcgC:
+
+ orq %r8,%r8
+ je .L_after_reduction_wByexunpeunlcgC
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_wByexunpeunlcgC:
+ jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt
+.L_small_initial_num_blocks_is_15_nmhEfDfgEBvcjnt:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %zmm15,%zmm5,%zmm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %zmm11,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vextracti32x4 $2,%zmm11,%xmm13
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_sAhCDvCwGcBErvs
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 112(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_sAhCDvCwGcBErvs
+.L_small_initial_partial_block_sAhCDvCwGcBErvs:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 128(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_sAhCDvCwGcBErvs:
+
+ orq %r8,%r8
+ je .L_after_reduction_sAhCDvCwGcBErvs
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_sAhCDvCwGcBErvs:
+ jmp .L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt
+.L_small_initial_num_blocks_is_16_nmhEfDfgEBvcjnt:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %zmm15,%zmm5,%zmm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %zmm11,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vextracti32x4 $3,%zmm11,%xmm13
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_BGcpniuuBjzyonj:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 112(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_BGcpniuuBjzyonj:
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_BGcpniuuBjzyonj:
+.L_small_initial_blocks_encrypted_nmhEfDfgEBvcjnt:
+.L_ghash_done_brADimEeCnCcDmv:
+ vmovdqu64 %xmm2,0(%rsi)
+ vmovdqu64 %xmm14,64(%rsi)
+.L_enc_dec_done_brADimEeCnCcDmv:
+ jmp .Lexit_gcm_decrypt
+.align 32
+.Laes_gcm_decrypt_192_avx512:
+ orq %r8,%r8
+ je .L_enc_dec_done_yiifChpfBbxhAhe
+ xorq %r14,%r14
+ vmovdqu64 64(%rsi),%xmm14
+
+ movq (%rdx),%r11
+ orq %r11,%r11
+ je .L_partial_block_done_EexishzBqqwurDt
+ movl $16,%r10d
+ leaq byte_len_to_mask_table(%rip),%r12
+ cmpq %r10,%r8
+ cmovcq %r8,%r10
+ kmovw (%r12,%r10,2),%k1
+ vmovdqu8 (%rcx),%xmm0{%k1}{z}
+
+ vmovdqu64 16(%rsi),%xmm3
+ vmovdqu64 336(%rsi),%xmm4
+
+
+
+ leaq SHIFT_MASK(%rip),%r12
+ addq %r11,%r12
+ vmovdqu64 (%r12),%xmm5
+ vpshufb %xmm5,%xmm3,%xmm3
+
+ vmovdqa64 %xmm0,%xmm6
+ vpxorq %xmm0,%xmm3,%xmm3
+
+
+ leaq (%r8,%r11,1),%r13
+ subq $16,%r13
+ jge .L_no_extra_mask_EexishzBqqwurDt
+ subq %r13,%r12
+.L_no_extra_mask_EexishzBqqwurDt:
+
+
+
+ vmovdqu64 16(%r12),%xmm0
+ vpand %xmm0,%xmm3,%xmm3
+ vpand %xmm0,%xmm6,%xmm6
+ vpshufb SHUF_MASK(%rip),%xmm6,%xmm6
+ vpshufb %xmm5,%xmm6,%xmm6
+ vpxorq %xmm6,%xmm14,%xmm14
+ cmpq $0,%r13
+ jl .L_partial_incomplete_EexishzBqqwurDt
+
+ vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7
+ vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10
+ vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11
+ vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14
+ vpxorq %xmm11,%xmm14,%xmm14
+
+ vpsrldq $8,%xmm14,%xmm11
+ vpslldq $8,%xmm14,%xmm14
+ vpxorq %xmm11,%xmm7,%xmm7
+ vpxorq %xmm10,%xmm14,%xmm14
+
+
+
+ vmovdqu64 POLY2(%rip),%xmm11
+
+ vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10
+ vpslldq $8,%xmm10,%xmm10
+ vpxorq %xmm10,%xmm14,%xmm14
+
+
+
+ vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10
+ vpsrldq $4,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+
+ vpternlogq $0x96,%xmm10,%xmm7,%xmm14
+
+ movq $0,(%rdx)
+
+ movq %r11,%r12
+ movq $16,%r11
+ subq %r12,%r11
+ jmp .L_enc_dec_done_EexishzBqqwurDt
+
+.L_partial_incomplete_EexishzBqqwurDt:
+ addq %r8,(%rdx)
+ movq %r8,%r11
+
+.L_enc_dec_done_EexishzBqqwurDt:
+
+
+ leaq byte_len_to_mask_table(%rip),%r12
+ kmovw (%r12,%r11,2),%k1
+ vmovdqu64 %xmm14,64(%rsi)
+ movq %r9,%r12
+ vmovdqu8 %xmm3,(%r12){%k1}
+.L_partial_block_done_EexishzBqqwurDt:
+ vmovdqu64 0(%rsi),%xmm2
+ subq %r11,%r8
+ je .L_enc_dec_done_yiifChpfBbxhAhe
+ cmpq $256,%r8
+ jbe .L_message_below_equal_16_blocks_yiifChpfBbxhAhe
+
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vmovdqa64 ddq_addbe_4444(%rip),%zmm27
+ vmovdqa64 ddq_addbe_1234(%rip),%zmm28
+
+
+
+
+
+
+ vmovd %xmm2,%r15d
+ andl $255,%r15d
+
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpshufb %zmm29,%zmm2,%zmm2
+
+
+
+ cmpb $240,%r15b
+ jae .L_next_16_overflow_tfgagBztCGiipfj
+ vpaddd %zmm28,%zmm2,%zmm7
+ vpaddd %zmm27,%zmm7,%zmm10
+ vpaddd %zmm27,%zmm10,%zmm11
+ vpaddd %zmm27,%zmm11,%zmm12
+ jmp .L_next_16_ok_tfgagBztCGiipfj
+.L_next_16_overflow_tfgagBztCGiipfj:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm12
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
+ vpaddd %zmm12,%zmm7,%zmm10
+ vpaddd %zmm12,%zmm10,%zmm11
+ vpaddd %zmm12,%zmm11,%zmm12
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vpshufb %zmm29,%zmm12,%zmm12
+.L_next_16_ok_tfgagBztCGiipfj:
+ vshufi64x2 $255,%zmm12,%zmm12,%zmm2
+ addb $16,%r15b
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm0
+ vmovdqu8 64(%rcx,%r11,1),%zmm3
+ vmovdqu8 128(%rcx,%r11,1),%zmm4
+ vmovdqu8 192(%rcx,%r11,1),%zmm5
+
+
+ vbroadcastf64x2 0(%rdi),%zmm6
+ vpxorq %zmm6,%zmm7,%zmm7
+ vpxorq %zmm6,%zmm10,%zmm10
+ vpxorq %zmm6,%zmm11,%zmm11
+ vpxorq %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 16(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 32(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 48(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 64(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 80(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 96(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 112(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 128(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 144(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 160(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 176(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 192(%rdi),%zmm6
+ vaesenclast %zmm6,%zmm7,%zmm7
+ vaesenclast %zmm6,%zmm10,%zmm10
+ vaesenclast %zmm6,%zmm11,%zmm11
+ vaesenclast %zmm6,%zmm12,%zmm12
+
+
+ vpxorq %zmm0,%zmm7,%zmm7
+ vpxorq %zmm3,%zmm10,%zmm10
+ vpxorq %zmm4,%zmm11,%zmm11
+ vpxorq %zmm5,%zmm12,%zmm12
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm7,0(%r10,%r11,1)
+ vmovdqu8 %zmm10,64(%r10,%r11,1)
+ vmovdqu8 %zmm11,128(%r10,%r11,1)
+ vmovdqu8 %zmm12,192(%r10,%r11,1)
+
+ vpshufb %zmm29,%zmm0,%zmm7
+ vpshufb %zmm29,%zmm3,%zmm10
+ vpshufb %zmm29,%zmm4,%zmm11
+ vpshufb %zmm29,%zmm5,%zmm12
+ vmovdqa64 %zmm7,768(%rsp)
+ vmovdqa64 %zmm10,832(%rsp)
+ vmovdqa64 %zmm11,896(%rsp)
+ vmovdqa64 %zmm12,960(%rsp)
+ testq %r14,%r14
+ jnz .L_skip_hkeys_precomputation_wuytBaevFghAmde
+
+ vmovdqu64 288(%rsi),%zmm0
+ vmovdqu64 %zmm0,704(%rsp)
+
+ vmovdqu64 224(%rsi),%zmm3
+ vmovdqu64 %zmm3,640(%rsp)
+
+
+ vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
+
+ vmovdqu64 160(%rsi),%zmm4
+ vmovdqu64 %zmm4,576(%rsp)
+
+ vmovdqu64 96(%rsi),%zmm5
+ vmovdqu64 %zmm5,512(%rsp)
+.L_skip_hkeys_precomputation_wuytBaevFghAmde:
+ cmpq $512,%r8
+ jb .L_message_below_32_blocks_yiifChpfBbxhAhe
+
+
+
+ cmpb $240,%r15b
+ jae .L_next_16_overflow_nzEGCllDaFxsseu
+ vpaddd %zmm28,%zmm2,%zmm7
+ vpaddd %zmm27,%zmm7,%zmm10
+ vpaddd %zmm27,%zmm10,%zmm11
+ vpaddd %zmm27,%zmm11,%zmm12
+ jmp .L_next_16_ok_nzEGCllDaFxsseu
+.L_next_16_overflow_nzEGCllDaFxsseu:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm12
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
+ vpaddd %zmm12,%zmm7,%zmm10
+ vpaddd %zmm12,%zmm10,%zmm11
+ vpaddd %zmm12,%zmm11,%zmm12
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vpshufb %zmm29,%zmm12,%zmm12
+.L_next_16_ok_nzEGCllDaFxsseu:
+ vshufi64x2 $255,%zmm12,%zmm12,%zmm2
+ addb $16,%r15b
+
+ vmovdqu8 256(%rcx,%r11,1),%zmm0
+ vmovdqu8 320(%rcx,%r11,1),%zmm3
+ vmovdqu8 384(%rcx,%r11,1),%zmm4
+ vmovdqu8 448(%rcx,%r11,1),%zmm5
+
+
+ vbroadcastf64x2 0(%rdi),%zmm6
+ vpxorq %zmm6,%zmm7,%zmm7
+ vpxorq %zmm6,%zmm10,%zmm10
+ vpxorq %zmm6,%zmm11,%zmm11
+ vpxorq %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 16(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 32(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 48(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 64(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 80(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 96(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 112(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 128(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 144(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 160(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 176(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 192(%rdi),%zmm6
+ vaesenclast %zmm6,%zmm7,%zmm7
+ vaesenclast %zmm6,%zmm10,%zmm10
+ vaesenclast %zmm6,%zmm11,%zmm11
+ vaesenclast %zmm6,%zmm12,%zmm12
+
+
+ vpxorq %zmm0,%zmm7,%zmm7
+ vpxorq %zmm3,%zmm10,%zmm10
+ vpxorq %zmm4,%zmm11,%zmm11
+ vpxorq %zmm5,%zmm12,%zmm12
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm7,256(%r10,%r11,1)
+ vmovdqu8 %zmm10,320(%r10,%r11,1)
+ vmovdqu8 %zmm11,384(%r10,%r11,1)
+ vmovdqu8 %zmm12,448(%r10,%r11,1)
+
+ vpshufb %zmm29,%zmm0,%zmm7
+ vpshufb %zmm29,%zmm3,%zmm10
+ vpshufb %zmm29,%zmm4,%zmm11
+ vpshufb %zmm29,%zmm5,%zmm12
+ vmovdqa64 %zmm7,1024(%rsp)
+ vmovdqa64 %zmm10,1088(%rsp)
+ vmovdqa64 %zmm11,1152(%rsp)
+ vmovdqa64 %zmm12,1216(%rsp)
+ testq %r14,%r14
+ jnz .L_skip_hkeys_precomputation_CDApkmzFaysFbmb
+ vmovdqu64 640(%rsp),%zmm3
+
+
+ vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
+
+ vmovdqu64 576(%rsp),%zmm4
+ vmovdqu64 512(%rsp),%zmm5
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,448(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,384(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,320(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,256(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,192(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,128(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,64(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,0(%rsp)
+.L_skip_hkeys_precomputation_CDApkmzFaysFbmb:
+ movq $1,%r14
+ addq $512,%r11
+ subq $512,%r8
+
+ cmpq $768,%r8
+ jb .L_no_more_big_nblocks_yiifChpfBbxhAhe
+.L_encrypt_big_nblocks_yiifChpfBbxhAhe:
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_EkchfDegrAlelEj
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_EkchfDegrAlelEj
+.L_16_blocks_overflow_EkchfDegrAlelEj:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_EkchfDegrAlelEj:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm15,%zmm10,%zmm26
+ vpxorq %zmm12,%zmm6,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1)
+ vpshufb %zmm29,%zmm17,%zmm0
+ vpshufb %zmm29,%zmm19,%zmm3
+ vpshufb %zmm29,%zmm20,%zmm4
+ vpshufb %zmm29,%zmm21,%zmm5
+ vmovdqa64 %zmm0,1280(%rsp)
+ vmovdqa64 %zmm3,1344(%rsp)
+ vmovdqa64 %zmm4,1408(%rsp)
+ vmovdqa64 %zmm5,1472(%rsp)
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_ymdbteyxuoqtqnl
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_ymdbteyxuoqtqnl
+.L_16_blocks_overflow_ymdbteyxuoqtqnl:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_ymdbteyxuoqtqnl:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 256(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 320(%rsp),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 384(%rsp),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 448(%rsp),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 256(%rcx,%r11,1),%zmm17
+ vmovdqu8 320(%rcx,%r11,1),%zmm19
+ vmovdqu8 384(%rcx,%r11,1),%zmm20
+ vmovdqu8 448(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vpternlogq $0x96,%zmm12,%zmm6,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,256(%r10,%r11,1)
+ vmovdqu8 %zmm3,320(%r10,%r11,1)
+ vmovdqu8 %zmm4,384(%r10,%r11,1)
+ vmovdqu8 %zmm5,448(%r10,%r11,1)
+ vpshufb %zmm29,%zmm17,%zmm0
+ vpshufb %zmm29,%zmm19,%zmm3
+ vpshufb %zmm29,%zmm20,%zmm4
+ vpshufb %zmm29,%zmm21,%zmm5
+ vmovdqa64 %zmm0,768(%rsp)
+ vmovdqa64 %zmm3,832(%rsp)
+ vmovdqa64 %zmm4,896(%rsp)
+ vmovdqa64 %zmm5,960(%rsp)
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_tyfBFhaGurfjEFr
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_tyfBFhaGurfjEFr
+.L_16_blocks_overflow_tyfBFhaGurfjEFr:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_tyfBFhaGurfjEFr:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 512(%rcx,%r11,1),%zmm17
+ vmovdqu8 576(%rcx,%r11,1),%zmm19
+ vmovdqu8 640(%rcx,%r11,1),%zmm20
+ vmovdqu8 704(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+
+
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpternlogq $0x96,%zmm15,%zmm12,%zmm6
+ vpxorq %zmm24,%zmm6,%zmm6
+ vpternlogq $0x96,%zmm10,%zmm13,%zmm7
+ vpxorq %zmm25,%zmm7,%zmm7
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vextracti64x4 $1,%zmm6,%ymm12
+ vpxorq %ymm12,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm12
+ vpxorq %xmm12,%xmm6,%xmm6
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm6
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,512(%r10,%r11,1)
+ vmovdqu8 %zmm3,576(%r10,%r11,1)
+ vmovdqu8 %zmm4,640(%r10,%r11,1)
+ vmovdqu8 %zmm5,704(%r10,%r11,1)
+ vpshufb %zmm29,%zmm17,%zmm0
+ vpshufb %zmm29,%zmm19,%zmm3
+ vpshufb %zmm29,%zmm20,%zmm4
+ vpshufb %zmm29,%zmm21,%zmm5
+ vmovdqa64 %zmm0,1024(%rsp)
+ vmovdqa64 %zmm3,1088(%rsp)
+ vmovdqa64 %zmm4,1152(%rsp)
+ vmovdqa64 %zmm5,1216(%rsp)
+ vmovdqa64 %zmm6,%zmm14
+
+ addq $768,%r11
+ subq $768,%r8
+ cmpq $768,%r8
+ jae .L_encrypt_big_nblocks_yiifChpfBbxhAhe
+
+.L_no_more_big_nblocks_yiifChpfBbxhAhe:
+
+ cmpq $512,%r8
+ jae .L_encrypt_32_blocks_yiifChpfBbxhAhe
+
+ cmpq $256,%r8
+ jae .L_encrypt_16_blocks_yiifChpfBbxhAhe
+.L_encrypt_0_blocks_ghash_32_yiifChpfBbxhAhe:
+ movl %r8d,%r10d
+ andl $~15,%r10d
+ movl $256,%ebx
+ subl %r10d,%ebx
+ vmovdqa64 768(%rsp),%zmm13
+ vpxorq %zmm14,%zmm13,%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 832(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpxorq %zmm10,%zmm4,%zmm26
+ vpxorq %zmm6,%zmm0,%zmm24
+ vpxorq %zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 896(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 960(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ addl $256,%ebx
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_EnDAnndDABDpwrg
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_EnDAnndDABDpwrg
+ jb .L_last_num_blocks_is_7_1_EnDAnndDABDpwrg
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_EnDAnndDABDpwrg
+ jb .L_last_num_blocks_is_11_9_EnDAnndDABDpwrg
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_EnDAnndDABDpwrg
+ ja .L_last_num_blocks_is_16_EnDAnndDABDpwrg
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_EnDAnndDABDpwrg
+ jmp .L_last_num_blocks_is_13_EnDAnndDABDpwrg
+
+.L_last_num_blocks_is_11_9_EnDAnndDABDpwrg:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_EnDAnndDABDpwrg
+ ja .L_last_num_blocks_is_11_EnDAnndDABDpwrg
+ jmp .L_last_num_blocks_is_9_EnDAnndDABDpwrg
+
+.L_last_num_blocks_is_7_1_EnDAnndDABDpwrg:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_EnDAnndDABDpwrg
+ jb .L_last_num_blocks_is_3_1_EnDAnndDABDpwrg
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_EnDAnndDABDpwrg
+ je .L_last_num_blocks_is_6_EnDAnndDABDpwrg
+ jmp .L_last_num_blocks_is_5_EnDAnndDABDpwrg
+
+.L_last_num_blocks_is_3_1_EnDAnndDABDpwrg:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_EnDAnndDABDpwrg
+ je .L_last_num_blocks_is_2_EnDAnndDABDpwrg
+.L_last_num_blocks_is_1_EnDAnndDABDpwrg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_GgCAgFtCzDDmtga
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_GgCAgFtCzDDmtga
+
+.L_16_blocks_overflow_GgCAgFtCzDDmtga:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_GgCAgFtCzDDmtga:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %xmm29,%xmm17,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_muErgpqjgcDnuvy
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_muErgpqjgcDnuvy
+.L_small_initial_partial_block_muErgpqjgcDnuvy:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm0
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
+ vpslldq $8,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm3
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
+ vpsrldq $4,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm4,%xmm14
+
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_muErgpqjgcDnuvy
+.L_small_initial_compute_done_muErgpqjgcDnuvy:
+.L_after_reduction_muErgpqjgcDnuvy:
+ jmp .L_last_blocks_done_EnDAnndDABDpwrg
+.L_last_num_blocks_is_2_EnDAnndDABDpwrg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_sGdlxeauwrjkrtA
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_sGdlxeauwrjkrtA
+
+.L_16_blocks_overflow_sGdlxeauwrjkrtA:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_sGdlxeauwrjkrtA:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %ymm29,%ymm17,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_mixrqrhnvplnBsa
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_mixrqrhnvplnBsa
+.L_small_initial_partial_block_mixrqrhnvplnBsa:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_mixrqrhnvplnBsa:
+
+ orq %r8,%r8
+ je .L_after_reduction_mixrqrhnvplnBsa
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_mixrqrhnvplnBsa:
+ jmp .L_last_blocks_done_EnDAnndDABDpwrg
+.L_last_num_blocks_is_3_EnDAnndDABDpwrg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_whibjFbDFpmwsdg
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_whibjFbDFpmwsdg
+
+.L_16_blocks_overflow_whibjFbDFpmwsdg:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_whibjFbDFpmwsdg:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_lAnoBCFfkdkhBpw
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_lAnoBCFfkdkhBpw
+.L_small_initial_partial_block_lAnoBCFfkdkhBpw:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_lAnoBCFfkdkhBpw:
+
+ orq %r8,%r8
+ je .L_after_reduction_lAnoBCFfkdkhBpw
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_lAnoBCFfkdkhBpw:
+ jmp .L_last_blocks_done_EnDAnndDABDpwrg
+.L_last_num_blocks_is_4_EnDAnndDABDpwrg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_CACaGmtylGFBBes
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_CACaGmtylGFBBes
+
+.L_16_blocks_overflow_CACaGmtylGFBBes:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_CACaGmtylGFBBes:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_bDpjzbsFvemyBzb
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_bDpjzbsFvemyBzb
+.L_small_initial_partial_block_bDpjzbsFvemyBzb:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_bDpjzbsFvemyBzb:
+
+ orq %r8,%r8
+ je .L_after_reduction_bDpjzbsFvemyBzb
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_bDpjzbsFvemyBzb:
+ jmp .L_last_blocks_done_EnDAnndDABDpwrg
+.L_last_num_blocks_is_5_EnDAnndDABDpwrg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_imFzBFrgiBtDFwx
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_imFzBFrgiBtDFwx
+
+.L_16_blocks_overflow_imFzBFrgiBtDFwx:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_imFzBFrgiBtDFwx:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %xmm29,%xmm19,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_vnnCjDqmzbcdpik
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_vnnCjDqmzbcdpik
+.L_small_initial_partial_block_vnnCjDqmzbcdpik:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_vnnCjDqmzbcdpik:
+
+ orq %r8,%r8
+ je .L_after_reduction_vnnCjDqmzbcdpik
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_vnnCjDqmzbcdpik:
+ jmp .L_last_blocks_done_EnDAnndDABDpwrg
+.L_last_num_blocks_is_6_EnDAnndDABDpwrg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_pAdtiatocvAeptw
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_pAdtiatocvAeptw
+
+.L_16_blocks_overflow_pAdtiatocvAeptw:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_pAdtiatocvAeptw:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %ymm29,%ymm19,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_gvfhgipCiigqdGj
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_gvfhgipCiigqdGj
+.L_small_initial_partial_block_gvfhgipCiigqdGj:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_gvfhgipCiigqdGj:
+
+ orq %r8,%r8
+ je .L_after_reduction_gvfhgipCiigqdGj
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_gvfhgipCiigqdGj:
+ jmp .L_last_blocks_done_EnDAnndDABDpwrg
+.L_last_num_blocks_is_7_EnDAnndDABDpwrg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_xxGFqeesBsuBajd
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_xxGFqeesBsuBajd
+
+.L_16_blocks_overflow_xxGFqeesBsuBajd:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_xxGFqeesBsuBajd:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_nFyvcbadpdjqnGl
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_nFyvcbadpdjqnGl
+.L_small_initial_partial_block_nFyvcbadpdjqnGl:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_nFyvcbadpdjqnGl:
+
+ orq %r8,%r8
+ je .L_after_reduction_nFyvcbadpdjqnGl
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_nFyvcbadpdjqnGl:
+ jmp .L_last_blocks_done_EnDAnndDABDpwrg
+.L_last_num_blocks_is_8_EnDAnndDABDpwrg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_qtzDbmlGiqglyFC
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_qtzDbmlGiqglyFC
+
+.L_16_blocks_overflow_qtzDbmlGiqglyFC:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_qtzDbmlGiqglyFC:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
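+/*
+ * Last AES round, then XOR the keystream with the loaded data to form
+ * the output; the second chunk is written back under mask k1.  The
+ * loaded blocks are byte-reflected for GHASH and r8 is reduced to the
+ * size of the final block.
+ */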
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_jhfdGzoqFGvFnBz
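+/*
+ * Final block is a full 16 bytes: clear the partial-block count at
+ * (%rdx) and multiply all eight reflected blocks by what appear to be
+ * hash key powers stored in the context at 224(%rsi) and 288(%rsi),
+ * folding the products into the accumulated GHASH state before the
+ * reduction.
+ */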
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_jhfdGzoqFGvFnBz
+.L_small_initial_partial_block_jhfdGzoqFGvFnBz:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_jhfdGzoqFGvFnBz:
+
+ orq %r8,%r8
+ je .L_after_reduction_jhfdGzoqFGvFnBz
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_jhfdGzoqFGvFnBz:
+ jmp .L_last_blocks_done_EnDAnndDABDpwrg
+.L_last_num_blocks_is_9_EnDAnndDABDpwrg:
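+/*
+ * The 9..12 block variants below follow the same pattern but add a
+ * third block of counters in xmm4/ymm4/zmm4 and load/store the final
+ * data chunk through k1 at xmm, ymm or zmm width depending on the
+ * count; the mask index is now r8 - 128.
+ */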
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_wmBlfbGwbkoxgju
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_wmBlfbGwbkoxgju
+
+.L_16_blocks_overflow_wmBlfbGwbkoxgju:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_wmBlfbGwbkoxgju:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %xmm29,%xmm20,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_baszqDAmduvhiiE
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_baszqDAmduvhiiE
+.L_small_initial_partial_block_baszqDAmduvhiiE:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_baszqDAmduvhiiE:
+
+ orq %r8,%r8
+ je .L_after_reduction_baszqDAmduvhiiE
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_baszqDAmduvhiiE:
+ jmp .L_last_blocks_done_EnDAnndDABDpwrg
+.L_last_num_blocks_is_10_EnDAnndDABDpwrg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_stwxpAgbfshrvAC
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_stwxpAgbfshrvAC
+
+.L_16_blocks_overflow_stwxpAgbfshrvAC:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_stwxpAgbfshrvAC:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %ymm29,%ymm20,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_exAeuCGujFxiqAh
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_exAeuCGujFxiqAh
+.L_small_initial_partial_block_exAeuCGujFxiqAh:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_exAeuCGujFxiqAh:
+
+ orq %r8,%r8
+ je .L_after_reduction_exAeuCGujFxiqAh
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_exAeuCGujFxiqAh:
+ jmp .L_last_blocks_done_EnDAnndDABDpwrg
+.L_last_num_blocks_is_11_EnDAnndDABDpwrg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_AxBbgslpvfAEaln
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_AxBbgslpvfAEaln
+
+.L_16_blocks_overflow_AxBbgslpvfAEaln:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_AxBbgslpvfAEaln:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_DbcpAfrkzFcgwwp
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_DbcpAfrkzFcgwwp
+.L_small_initial_partial_block_DbcpAfrkzFcgwwp:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_DbcpAfrkzFcgwwp:
+
+ orq %r8,%r8
+ je .L_after_reduction_DbcpAfrkzFcgwwp
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_DbcpAfrkzFcgwwp:
+ jmp .L_last_blocks_done_EnDAnndDABDpwrg
+.L_last_num_blocks_is_12_EnDAnndDABDpwrg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_smrhssarGEoyasa
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_smrhssarGEoyasa
+
+.L_16_blocks_overflow_smrhssarGEoyasa:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_smrhssarGEoyasa:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_rouvbBEfwtDrsEg
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_rouvbBEfwtDrsEg
+.L_small_initial_partial_block_rouvbBEfwtDrsEg:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_rouvbBEfwtDrsEg:
+
+ orq %r8,%r8
+ je .L_after_reduction_rouvbBEfwtDrsEg
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_rouvbBEfwtDrsEg:
+ jmp .L_last_blocks_done_EnDAnndDABDpwrg
+.L_last_num_blocks_is_13_EnDAnndDABDpwrg:
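+/*
+ * The 13..15 block variants add a fourth lane in xmm5/ymm5/zmm5 and
+ * take the k1 mask index as r8 - 192; otherwise the structure matches
+ * the cases above.
+ */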
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_DrfxGvBzxdbnqak
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_DrfxGvBzxdbnqak
+
+.L_16_blocks_overflow_DrfxGvBzxdbnqak:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_DrfxGvBzxdbnqak:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %xmm29,%xmm21,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_wcayAkkuiehcgnC
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_wcayAkkuiehcgnC
+.L_small_initial_partial_block_wcayAkkuiehcgnC:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_wcayAkkuiehcgnC:
+
+ orq %r8,%r8
+ je .L_after_reduction_wcayAkkuiehcgnC
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_wcayAkkuiehcgnC:
+ jmp .L_last_blocks_done_EnDAnndDABDpwrg
+.L_last_num_blocks_is_14_EnDAnndDABDpwrg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_kAcyvjjAkbnGGoE
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_kAcyvjjAkbnGGoE
+
+.L_16_blocks_overflow_kAcyvjjAkbnGGoE:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_kAcyvjjAkbnGGoE:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %ymm29,%ymm21,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_lECstFkGozakhDE
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_lECstFkGozakhDE
+.L_small_initial_partial_block_lECstFkGozakhDE:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_lECstFkGozakhDE:
+
+ orq %r8,%r8
+ je .L_after_reduction_lECstFkGozakhDE
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_lECstFkGozakhDE:
+ jmp .L_last_blocks_done_EnDAnndDABDpwrg
+.L_last_num_blocks_is_15_EnDAnndDABDpwrg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_uvsntmjBtmwoAgA
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_uvsntmjBtmwoAgA
+
+.L_16_blocks_overflow_uvsntmjBtmwoAgA:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_uvsntmjBtmwoAgA:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_gFfyGkDCahpvfAe
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_gFfyGkDCahpvfAe
+.L_small_initial_partial_block_gFfyGkDCahpvfAe:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_gFfyGkDCahpvfAe:
+
+ orq %r8,%r8
+ je .L_after_reduction_gFfyGkDCahpvfAe
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_gFfyGkDCahpvfAe:
+ jmp .L_last_blocks_done_EnDAnndDABDpwrg
+.L_last_num_blocks_is_16_EnDAnndDABDpwrg:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_jwffjzkjrdbGmqd
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_jwffjzkjrdbGmqd
+
+.L_16_blocks_overflow_jwffjzkjrdbGmqd:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_jwffjzkjrdbGmqd:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_ccvdpppmDomgiCD:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ccvdpppmDomgiCD:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ccvdpppmDomgiCD:
+ jmp .L_last_blocks_done_EnDAnndDABDpwrg
+.L_last_num_blocks_is_0_EnDAnndDABDpwrg:
+ vmovdqa64 1024(%rsp),%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1088(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1152(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1216(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_EnDAnndDABDpwrg:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_yiifChpfBbxhAhe
+.L_encrypt_32_blocks_yiifChpfBbxhAhe:
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_igclhxhftlBGfml
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_igclhxhftlBGfml
+.L_16_blocks_overflow_igclhxhftlBGfml:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_igclhxhftlBGfml:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm15,%zmm10,%zmm26
+ vpxorq %zmm12,%zmm6,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1)
+ vpshufb %zmm29,%zmm17,%zmm0
+ vpshufb %zmm29,%zmm19,%zmm3
+ vpshufb %zmm29,%zmm20,%zmm4
+ vpshufb %zmm29,%zmm21,%zmm5
+ vmovdqa64 %zmm0,1280(%rsp)
+ vmovdqa64 %zmm3,1344(%rsp)
+ vmovdqa64 %zmm4,1408(%rsp)
+ vmovdqa64 %zmm5,1472(%rsp)
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_hgchDvhDwhDhkhj
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_hgchDvhDwhDhkhj
+.L_16_blocks_overflow_hgchDvhDwhDhkhj:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_hgchDvhDwhDhkhj:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 256(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 320(%rsp),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 384(%rsp),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 448(%rsp),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 256(%rcx,%r11,1),%zmm17
+ vmovdqu8 320(%rcx,%r11,1),%zmm19
+ vmovdqu8 384(%rcx,%r11,1),%zmm20
+ vmovdqu8 448(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vpternlogq $0x96,%zmm12,%zmm6,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,256(%r10,%r11,1)
+ vmovdqu8 %zmm3,320(%r10,%r11,1)
+ vmovdqu8 %zmm4,384(%r10,%r11,1)
+ vmovdqu8 %zmm5,448(%r10,%r11,1)
+ vpshufb %zmm29,%zmm17,%zmm0
+ vpshufb %zmm29,%zmm19,%zmm3
+ vpshufb %zmm29,%zmm20,%zmm4
+ vpshufb %zmm29,%zmm21,%zmm5
+ vmovdqa64 %zmm0,768(%rsp)
+ vmovdqa64 %zmm3,832(%rsp)
+ vmovdqa64 %zmm4,896(%rsp)
+ vmovdqa64 %zmm5,960(%rsp)
+ vmovdqa64 1280(%rsp),%zmm13
+ vmovdqu64 512(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1344(%rsp),%zmm13
+ vmovdqu64 576(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1408(%rsp),%zmm13
+ vmovdqu64 640(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1472(%rsp),%zmm13
+ vmovdqu64 704(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+ subq $512,%r8
+ addq $512,%r11
+ movl %r8d,%r10d
+ andl $~15,%r10d
+ movl $512,%ebx
+ subl %r10d,%ebx
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_mzebEnFmrFgqunA
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_mzebEnFmrFgqunA
+ jb .L_last_num_blocks_is_7_1_mzebEnFmrFgqunA
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_mzebEnFmrFgqunA
+ jb .L_last_num_blocks_is_11_9_mzebEnFmrFgqunA
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_mzebEnFmrFgqunA
+ ja .L_last_num_blocks_is_16_mzebEnFmrFgqunA
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_mzebEnFmrFgqunA
+ jmp .L_last_num_blocks_is_13_mzebEnFmrFgqunA
+
+.L_last_num_blocks_is_11_9_mzebEnFmrFgqunA:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_mzebEnFmrFgqunA
+ ja .L_last_num_blocks_is_11_mzebEnFmrFgqunA
+ jmp .L_last_num_blocks_is_9_mzebEnFmrFgqunA
+
+.L_last_num_blocks_is_7_1_mzebEnFmrFgqunA:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_mzebEnFmrFgqunA
+ jb .L_last_num_blocks_is_3_1_mzebEnFmrFgqunA
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_mzebEnFmrFgqunA
+ je .L_last_num_blocks_is_6_mzebEnFmrFgqunA
+ jmp .L_last_num_blocks_is_5_mzebEnFmrFgqunA
+
+.L_last_num_blocks_is_3_1_mzebEnFmrFgqunA:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_mzebEnFmrFgqunA
+ je .L_last_num_blocks_is_2_mzebEnFmrFgqunA
+.L_last_num_blocks_is_1_mzebEnFmrFgqunA:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_nGCoqEFBGnmxbxd
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_nGCoqEFBGnmxbxd
+
+.L_16_blocks_overflow_nGCoqEFBGnmxbxd:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_nGCoqEFBGnmxbxd:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %xmm29,%xmm17,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_pteDFgEDjspDekt
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_pteDFgEDjspDekt
+.L_small_initial_partial_block_pteDFgEDjspDekt:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm0
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
+ vpslldq $8,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm3
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
+ vpsrldq $4,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm4,%xmm14
+
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_pteDFgEDjspDekt
+.L_small_initial_compute_done_pteDFgEDjspDekt:
+.L_after_reduction_pteDFgEDjspDekt:
+ jmp .L_last_blocks_done_mzebEnFmrFgqunA
+.L_last_num_blocks_is_2_mzebEnFmrFgqunA:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_BnoeeeAuxpuGrCd
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_BnoeeeAuxpuGrCd
+
+.L_16_blocks_overflow_BnoeeeAuxpuGrCd:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_BnoeeeAuxpuGrCd:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %ymm29,%ymm17,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_pGCaGvdapDriFwq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_pGCaGvdapDriFwq
+.L_small_initial_partial_block_pGCaGvdapDriFwq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_pGCaGvdapDriFwq:
+
+ orq %r8,%r8
+ je .L_after_reduction_pGCaGvdapDriFwq
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_pGCaGvdapDriFwq:
+ jmp .L_last_blocks_done_mzebEnFmrFgqunA
+.L_last_num_blocks_is_3_mzebEnFmrFgqunA:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_rpvBmmdleounkfg
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_rpvBmmdleounkfg
+
+.L_16_blocks_overflow_rpvBmmdleounkfg:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_rpvBmmdleounkfg:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_EDfFbxCoAeBbBmG
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_EDfFbxCoAeBbBmG
+.L_small_initial_partial_block_EDfFbxCoAeBbBmG:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_EDfFbxCoAeBbBmG:
+
+ orq %r8,%r8
+ je .L_after_reduction_EDfFbxCoAeBbBmG
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_EDfFbxCoAeBbBmG:
+ jmp .L_last_blocks_done_mzebEnFmrFgqunA
+.L_last_num_blocks_is_4_mzebEnFmrFgqunA:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_eejufxFfpkhainn
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_eejufxFfpkhainn
+
+.L_16_blocks_overflow_eejufxFfpkhainn:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_eejufxFfpkhainn:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_rtqFkraGudeyaFm
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_rtqFkraGudeyaFm
+.L_small_initial_partial_block_rtqFkraGudeyaFm:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_rtqFkraGudeyaFm:
+
+ orq %r8,%r8
+ je .L_after_reduction_rtqFkraGudeyaFm
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_rtqFkraGudeyaFm:
+ jmp .L_last_blocks_done_mzebEnFmrFgqunA
+.L_last_num_blocks_is_5_mzebEnFmrFgqunA:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_bgofyFpgEnsntBw
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_bgofyFpgEnsntBw
+
+.L_16_blocks_overflow_bgofyFpgEnsntBw:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_bgofyFpgEnsntBw:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %xmm29,%xmm19,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_uCfkbGGrphGcGba
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_uCfkbGGrphGcGba
+.L_small_initial_partial_block_uCfkbGGrphGcGba:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_uCfkbGGrphGcGba:
+
+ orq %r8,%r8
+ je .L_after_reduction_uCfkbGGrphGcGba
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_uCfkbGGrphGcGba:
+ jmp .L_last_blocks_done_mzebEnFmrFgqunA
+.L_last_num_blocks_is_6_mzebEnFmrFgqunA:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_GvptlszrGgmFuve
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_GvptlszrGgmFuve
+
+.L_16_blocks_overflow_GvptlszrGgmFuve:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_GvptlszrGgmFuve:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %ymm29,%ymm19,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_oFAlvAhpbuuoctp
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_oFAlvAhpbuuoctp
+.L_small_initial_partial_block_oFAlvAhpbuuoctp:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_oFAlvAhpbuuoctp:
+
+ orq %r8,%r8
+ je .L_after_reduction_oFAlvAhpbuuoctp
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_oFAlvAhpbuuoctp:
+ jmp .L_last_blocks_done_mzebEnFmrFgqunA
+.L_last_num_blocks_is_7_mzebEnFmrFgqunA:
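+/*
+ * Tail handler for 7 remaining blocks; the _is_8 .. _is_15 branches below
+ * follow the same pattern.  A byte mask for the partial 64-byte group is
+ * loaded from byte64_len_to_mask_table, the counter blocks are derived
+ * (taking the byte-swapped add path when the low counter byte would wrap),
+ * and the AES rounds are interleaved with the GHASH of the pending 16-block
+ * batch kept on the stack.  The masked input is then XORed in and stored,
+ * and the remaining blocks are folded into the GHASH state and reduced
+ * using the POLY2 constant, with a trailing partial block handled separately.
+ */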
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_DxbjcygrgxudEjb
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_DxbjcygrgxudEjb
+
+.L_16_blocks_overflow_DxbjcygrgxudEjb:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_DxbjcygrgxudEjb:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_xFeGbEcEyBujjsd
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_xFeGbEcEyBujjsd
+.L_small_initial_partial_block_xFeGbEcEyBujjsd:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_xFeGbEcEyBujjsd:
+
+ orq %r8,%r8
+ je .L_after_reduction_xFeGbEcEyBujjsd
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_xFeGbEcEyBujjsd:
+ jmp .L_last_blocks_done_mzebEnFmrFgqunA
+.L_last_num_blocks_is_8_mzebEnFmrFgqunA:
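+/* 8 remaining blocks: same structure as the 7-block case above. */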
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_njjFmdkzFAzEDDa
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_njjFmdkzFAzEDDa
+
+.L_16_blocks_overflow_njjFmdkzFAzEDDa:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_njjFmdkzFAzEDDa:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ozrwtEFqpzbbFif
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ozrwtEFqpzbbFif
+.L_small_initial_partial_block_ozrwtEFqpzbbFif:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ozrwtEFqpzbbFif:
+
+ orq %r8,%r8
+ je .L_after_reduction_ozrwtEFqpzbbFif
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ozrwtEFqpzbbFif:
+ jmp .L_last_blocks_done_mzebEnFmrFgqunA
+.L_last_num_blocks_is_9_mzebEnFmrFgqunA:
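+/* 9 remaining blocks: a third counter vector (%xmm4) carries the block past the first eight. */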
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_tzqaclAtnqeEABy
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_tzqaclAtnqeEABy
+
+.L_16_blocks_overflow_tzqaclAtnqeEABy:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_tzqaclAtnqeEABy:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %xmm29,%xmm20,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_akxrmDCvAwmtoBq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_akxrmDCvAwmtoBq
+.L_small_initial_partial_block_akxrmDCvAwmtoBq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_akxrmDCvAwmtoBq:
+
+ orq %r8,%r8
+ je .L_after_reduction_akxrmDCvAwmtoBq
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_akxrmDCvAwmtoBq:
+ jmp .L_last_blocks_done_mzebEnFmrFgqunA
+.L_last_num_blocks_is_10_mzebEnFmrFgqunA:
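+/* 10 remaining blocks: the third counter vector widens to %ymm4 (two blocks). */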
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_mdrttBDhusakuks
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_mdrttBDhusakuks
+
+.L_16_blocks_overflow_mdrttBDhusakuks:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_mdrttBDhusakuks:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %ymm29,%ymm20,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_iAgGclofsEyxAFd
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_iAgGclofsEyxAFd
+.L_small_initial_partial_block_iAgGclofsEyxAFd:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_iAgGclofsEyxAFd:
+
+ orq %r8,%r8
+ je .L_after_reduction_iAgGclofsEyxAFd
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_iAgGclofsEyxAFd:
+ jmp .L_last_blocks_done_mzebEnFmrFgqunA
+.L_last_num_blocks_is_11_mzebEnFmrFgqunA:
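+/* 11 remaining blocks: the third group uses a full %zmm4, masked to the bytes that remain. */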
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_dngFDcgnxjanBrr
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_dngFDcgnxjanBrr
+
+.L_16_blocks_overflow_dngFDcgnxjanBrr:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_dngFDcgnxjanBrr:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_okvBnGbFccGxioi
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_okvBnGbFccGxioi
+.L_small_initial_partial_block_okvBnGbFccGxioi:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_okvBnGbFccGxioi:
+
+ orq %r8,%r8
+ je .L_after_reduction_okvBnGbFccGxioi
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_okvBnGbFccGxioi:
+ jmp .L_last_blocks_done_mzebEnFmrFgqunA
+.L_last_num_blocks_is_12_mzebEnFmrFgqunA:
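+/* 12 remaining blocks: three zmm counter vectors, with the masked load/store covering the third 64-byte group. */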
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_aubdtmlCEjgrkqC
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_aubdtmlCEjgrkqC
+
+.L_16_blocks_overflow_aubdtmlCEjgrkqC:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_aubdtmlCEjgrkqC:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_fAvjEssplkpFDzu
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_fAvjEssplkpFDzu
+.L_small_initial_partial_block_fAvjEssplkpFDzu:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_fAvjEssplkpFDzu:
+
+ orq %r8,%r8
+ je .L_after_reduction_fAvjEssplkpFDzu
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_fAvjEssplkpFDzu:
+ jmp .L_last_blocks_done_mzebEnFmrFgqunA
+.L_last_num_blocks_is_13_mzebEnFmrFgqunA:
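+/* 13 remaining blocks: a fourth counter vector (%xmm5) handles the block beyond the first twelve. */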
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_tgGfmxsfvvfjlut
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_tgGfmxsfvvfjlut
+
+.L_16_blocks_overflow_tgGfmxsfvvfjlut:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_tgGfmxsfvvfjlut:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %xmm29,%xmm21,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_dGgFeCerpjagCtb
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_dGgFeCerpjagCtb
+.L_small_initial_partial_block_dGgFeCerpjagCtb:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_dGgFeCerpjagCtb:
+
+ orq %r8,%r8
+ je .L_after_reduction_dGgFeCerpjagCtb
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_dGgFeCerpjagCtb:
+ jmp .L_last_blocks_done_mzebEnFmrFgqunA
+.L_last_num_blocks_is_14_mzebEnFmrFgqunA:
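+/* 14 remaining blocks: the fourth counter vector widens to %ymm5 (two blocks). */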
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_GjeuEqvcyhCdAlB
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_GjeuEqvcyhCdAlB
+
+.L_16_blocks_overflow_GjeuEqvcyhCdAlB:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_GjeuEqvcyhCdAlB:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %ymm29,%ymm21,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_CbnaspueplphnCn
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_CbnaspueplphnCn
+.L_small_initial_partial_block_CbnaspueplphnCn:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_CbnaspueplphnCn:
+
+ orq %r8,%r8
+ je .L_after_reduction_CbnaspueplphnCn
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_CbnaspueplphnCn:
+ jmp .L_last_blocks_done_mzebEnFmrFgqunA
+.L_last_num_blocks_is_15_mzebEnFmrFgqunA:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_vduCxcjofxGqAou
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_vduCxcjofxGqAou
+
+.L_16_blocks_overflow_vduCxcjofxGqAou:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_vduCxcjofxGqAou:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_xdoEhGjsfscahrp
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_xdoEhGjsfscahrp
+.L_small_initial_partial_block_xdoEhGjsfscahrp:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_xdoEhGjsfscahrp:
+
+ orq %r8,%r8
+ je .L_after_reduction_xdoEhGjsfscahrp
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_xdoEhGjsfscahrp:
+ jmp .L_last_blocks_done_mzebEnFmrFgqunA
+.L_last_num_blocks_is_16_mzebEnFmrFgqunA:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_skEyjqiskGfxdvC
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_skEyjqiskGfxdvC
+
+.L_16_blocks_overflow_skEyjqiskGfxdvC:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_skEyjqiskGfxdvC:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_dxixdfuDqivveAt:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_dxixdfuDqivveAt:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_dxixdfuDqivveAt:
+ jmp .L_last_blocks_done_mzebEnFmrFgqunA
+.L_last_num_blocks_is_0_mzebEnFmrFgqunA:
+ vmovdqa64 768(%rsp),%zmm13
+ vpxorq %zmm14,%zmm13,%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 832(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpxorq %zmm10,%zmm4,%zmm26
+ vpxorq %zmm6,%zmm0,%zmm24
+ vpxorq %zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 896(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 960(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_mzebEnFmrFgqunA:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_yiifChpfBbxhAhe
+.L_encrypt_16_blocks_yiifChpfBbxhAhe:
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_lGoEsFGcBhBnEgo
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_lGoEsFGcBhBnEgo
+.L_16_blocks_overflow_lGoEsFGcBhBnEgo:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_lGoEsFGcBhBnEgo:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm15,%zmm10,%zmm26
+ vpxorq %zmm12,%zmm6,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1)
+ vpshufb %zmm29,%zmm17,%zmm0
+ vpshufb %zmm29,%zmm19,%zmm3
+ vpshufb %zmm29,%zmm20,%zmm4
+ vpshufb %zmm29,%zmm21,%zmm5
+ vmovdqa64 %zmm0,1280(%rsp)
+ vmovdqa64 %zmm3,1344(%rsp)
+ vmovdqa64 %zmm4,1408(%rsp)
+ vmovdqa64 %zmm5,1472(%rsp)
+ vmovdqa64 1024(%rsp),%zmm13
+ vmovdqu64 256(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1088(%rsp),%zmm13
+ vmovdqu64 320(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1152(%rsp),%zmm13
+ vmovdqu64 384(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1216(%rsp),%zmm13
+ vmovdqu64 448(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ subq $256,%r8
+ addq $256,%r11
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_GGlifssooGvFomC
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_GGlifssooGvFomC
+ jb .L_last_num_blocks_is_7_1_GGlifssooGvFomC
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_GGlifssooGvFomC
+ jb .L_last_num_blocks_is_11_9_GGlifssooGvFomC
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_GGlifssooGvFomC
+ ja .L_last_num_blocks_is_16_GGlifssooGvFomC
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_GGlifssooGvFomC
+ jmp .L_last_num_blocks_is_13_GGlifssooGvFomC
+
+.L_last_num_blocks_is_11_9_GGlifssooGvFomC:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_GGlifssooGvFomC
+ ja .L_last_num_blocks_is_11_GGlifssooGvFomC
+ jmp .L_last_num_blocks_is_9_GGlifssooGvFomC
+
+.L_last_num_blocks_is_7_1_GGlifssooGvFomC:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_GGlifssooGvFomC
+ jb .L_last_num_blocks_is_3_1_GGlifssooGvFomC
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_GGlifssooGvFomC
+ je .L_last_num_blocks_is_6_GGlifssooGvFomC
+ jmp .L_last_num_blocks_is_5_GGlifssooGvFomC
+
+.L_last_num_blocks_is_3_1_GGlifssooGvFomC:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_GGlifssooGvFomC
+ je .L_last_num_blocks_is_2_GGlifssooGvFomC
+.L_last_num_blocks_is_1_GGlifssooGvFomC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_DFdkfCEpyEuzGts
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_DFdkfCEpyEuzGts
+
+.L_16_blocks_overflow_DFdkfCEpyEuzGts:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_DFdkfCEpyEuzGts:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %xmm31,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %xmm29,%xmm17,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_znzDmxCrzeqhmtt
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_znzDmxCrzeqhmtt
+.L_small_initial_partial_block_znzDmxCrzeqhmtt:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_znzDmxCrzeqhmtt
+.L_small_initial_compute_done_znzDmxCrzeqhmtt:
+.L_after_reduction_znzDmxCrzeqhmtt:
+ jmp .L_last_blocks_done_GGlifssooGvFomC
+.L_last_num_blocks_is_2_GGlifssooGvFomC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_fxAkfvCdnqqGArm
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_fxAkfvCdnqqGArm
+
+.L_16_blocks_overflow_fxAkfvCdnqqGArm:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_fxAkfvCdnqqGArm:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %ymm31,%ymm0,%ymm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %ymm29,%ymm17,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_kgAaABygmxmrDhD
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_kgAaABygmxmrDhD
+.L_small_initial_partial_block_kgAaABygmxmrDhD:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_kgAaABygmxmrDhD:
+
+ orq %r8,%r8
+ je .L_after_reduction_kgAaABygmxmrDhD
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_kgAaABygmxmrDhD:
+ jmp .L_last_blocks_done_GGlifssooGvFomC
+.L_last_num_blocks_is_3_GGlifssooGvFomC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_DnqopufcDlfooBF
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_DnqopufcDlfooBF
+
+.L_16_blocks_overflow_DnqopufcDlfooBF:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_DnqopufcDlfooBF:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_qgbxmvAdpcwjFGD
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_qgbxmvAdpcwjFGD
+.L_small_initial_partial_block_qgbxmvAdpcwjFGD:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_qgbxmvAdpcwjFGD:
+
+ orq %r8,%r8
+ je .L_after_reduction_qgbxmvAdpcwjFGD
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_qgbxmvAdpcwjFGD:
+ jmp .L_last_blocks_done_GGlifssooGvFomC
+.L_last_num_blocks_is_4_GGlifssooGvFomC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_zzorvqhpvdBckcq
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_zzorvqhpvdBckcq
+
+.L_16_blocks_overflow_zzorvqhpvdBckcq:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_zzorvqhpvdBckcq:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_giCxqwgmxrChxdc
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_giCxqwgmxrChxdc
+.L_small_initial_partial_block_giCxqwgmxrChxdc:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_giCxqwgmxrChxdc:
+
+ orq %r8,%r8
+ je .L_after_reduction_giCxqwgmxrChxdc
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_giCxqwgmxrChxdc:
+ jmp .L_last_blocks_done_GGlifssooGvFomC
+.L_last_num_blocks_is_5_GGlifssooGvFomC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_qzjnvgqjjxsfmEr
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_qzjnvgqjjxsfmEr
+
+.L_16_blocks_overflow_qzjnvgqjjxsfmEr:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_qzjnvgqjjxsfmEr:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %xmm29,%xmm19,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_xoEftvygjvpovck
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_xoEftvygjvpovck
+.L_small_initial_partial_block_xoEftvygjvpovck:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_xoEftvygjvpovck:
+
+ orq %r8,%r8
+ je .L_after_reduction_xoEftvygjvpovck
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_xoEftvygjvpovck:
+ jmp .L_last_blocks_done_GGlifssooGvFomC
+.L_last_num_blocks_is_6_GGlifssooGvFomC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_mvFwizCezuedAbr
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_mvFwizCezuedAbr
+
+.L_16_blocks_overflow_mvFwizCezuedAbr:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_mvFwizCezuedAbr:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %ymm29,%ymm19,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_FDuhyDmhetmzsvq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_FDuhyDmhetmzsvq
+.L_small_initial_partial_block_FDuhyDmhetmzsvq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_FDuhyDmhetmzsvq:
+
+ orq %r8,%r8
+ je .L_after_reduction_FDuhyDmhetmzsvq
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_FDuhyDmhetmzsvq:
+ jmp .L_last_blocks_done_GGlifssooGvFomC
+.L_last_num_blocks_is_7_GGlifssooGvFomC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_owtBaGpzgzgcxrC
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_owtBaGpzgzgcxrC
+
+.L_16_blocks_overflow_owtBaGpzgzgcxrC:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_owtBaGpzgzgcxrC:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_DncaxytjCyxiknt
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_DncaxytjCyxiknt
+.L_small_initial_partial_block_DncaxytjCyxiknt:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_DncaxytjCyxiknt:
+
+ orq %r8,%r8
+ je .L_after_reduction_DncaxytjCyxiknt
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_DncaxytjCyxiknt:
+ jmp .L_last_blocks_done_GGlifssooGvFomC
+.L_last_num_blocks_is_8_GGlifssooGvFomC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_dAhdphrDhhiFfvd
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_dAhdphrDhhiFfvd
+
+.L_16_blocks_overflow_dAhdphrDhhiFfvd:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_dAhdphrDhhiFfvd:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_CnEvizjBlzFFnif
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_CnEvizjBlzFFnif
+.L_small_initial_partial_block_CnEvizjBlzFFnif:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_CnEvizjBlzFFnif:
+
+ orq %r8,%r8
+ je .L_after_reduction_CnEvizjBlzFFnif
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_CnEvizjBlzFFnif:
+ jmp .L_last_blocks_done_GGlifssooGvFomC
+.L_last_num_blocks_is_9_GGlifssooGvFomC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_eaicByEvunpebxo
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_eaicByEvunpebxo
+
+.L_16_blocks_overflow_eaicByEvunpebxo:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_eaicByEvunpebxo:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %xmm29,%xmm20,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_gfgCplcDGBrovbz
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_gfgCplcDGBrovbz
+.L_small_initial_partial_block_gfgCplcDGBrovbz:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_gfgCplcDGBrovbz:
+
+ orq %r8,%r8
+ je .L_after_reduction_gfgCplcDGBrovbz
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_gfgCplcDGBrovbz:
+ jmp .L_last_blocks_done_GGlifssooGvFomC
+.L_last_num_blocks_is_10_GGlifssooGvFomC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_bfFejorcehrytqq
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_bfFejorcehrytqq
+
+.L_16_blocks_overflow_bfFejorcehrytqq:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_bfFejorcehrytqq:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %ymm29,%ymm20,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ebiAndfrelejgeD
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ebiAndfrelejgeD
+.L_small_initial_partial_block_ebiAndfrelejgeD:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ebiAndfrelejgeD:
+
+ orq %r8,%r8
+ je .L_after_reduction_ebiAndfrelejgeD
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ebiAndfrelejgeD:
+ jmp .L_last_blocks_done_GGlifssooGvFomC
+.L_last_num_blocks_is_11_GGlifssooGvFomC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_nsakvpcBnizduGq
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_nsakvpcBnizduGq
+
+.L_16_blocks_overflow_nsakvpcBnizduGq:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_nsakvpcBnizduGq:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_FeAoudrbheqBGiy
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_FeAoudrbheqBGiy
+.L_small_initial_partial_block_FeAoudrbheqBGiy:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_FeAoudrbheqBGiy:
+
+ orq %r8,%r8
+ je .L_after_reduction_FeAoudrbheqBGiy
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_FeAoudrbheqBGiy:
+ jmp .L_last_blocks_done_GGlifssooGvFomC
+.L_last_num_blocks_is_12_GGlifssooGvFomC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_bwFzciofFgjcilw
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_bwFzciofFgjcilw
+
+.L_16_blocks_overflow_bwFzciofFgjcilw:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_bwFzciofFgjcilw:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_cfkroClFdpzvhum
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_cfkroClFdpzvhum
+.L_small_initial_partial_block_cfkroClFdpzvhum:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_cfkroClFdpzvhum:
+
+ orq %r8,%r8
+ je .L_after_reduction_cfkroClFdpzvhum
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_cfkroClFdpzvhum:
+ jmp .L_last_blocks_done_GGlifssooGvFomC
+.L_last_num_blocks_is_13_GGlifssooGvFomC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_wabAfqhkitemmDb
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_wabAfqhkitemmDb
+
+.L_16_blocks_overflow_wabAfqhkitemmDb:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_wabAfqhkitemmDb:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %xmm29,%xmm21,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_sdmohCiFjxvtkha
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_sdmohCiFjxvtkha
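+	/* The tail left a partial trailing block: record the leftover byte count at (%rdx), save the last output block into the context at 16(%rsi), and GHASH only the completed blocks so the partial block can be folded in later. */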
+.L_small_initial_partial_block_sdmohCiFjxvtkha:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_sdmohCiFjxvtkha:
+
+ orq %r8,%r8
+ je .L_after_reduction_sdmohCiFjxvtkha
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_sdmohCiFjxvtkha:
+ jmp .L_last_blocks_done_GGlifssooGvFomC
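+	/* Tail of exactly 14 blocks: three full ZMM counter registers plus a masked YMM pair, with the AES rounds interleaved against the GHASH multiplies of the previous 16 blocks. */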
+.L_last_num_blocks_is_14_GGlifssooGvFomC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_xpqoqezlFcomfjA
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_xpqoqezlFcomfjA
+
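+	/* Counter low byte would wrap within the next 14 blocks: byte-reflect the counter, apply the per-block increments with full 32-bit carries, and reflect it back. */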
+.L_16_blocks_overflow_xpqoqezlFcomfjA:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_xpqoqezlFcomfjA:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %ymm29,%ymm21,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_fexjdoDflollEzw
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_fexjdoDflollEzw
+.L_small_initial_partial_block_fexjdoDflollEzw:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_fexjdoDflollEzw:
+
+ orq %r8,%r8
+ je .L_after_reduction_fexjdoDflollEzw
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_fexjdoDflollEzw:
+ jmp .L_last_blocks_done_GGlifssooGvFomC
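+	/* Tail of exactly 15 blocks: three full ZMMs plus three blocks in a masked ZMM; the structure mirrors the 14-block case above. */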
+.L_last_num_blocks_is_15_GGlifssooGvFomC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_iupvxgCFjryaArw
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_iupvxgCFjryaArw
+
+.L_16_blocks_overflow_iupvxgCFjryaArw:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_iupvxgCFjryaArw:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_lxborjzgtwFghrg
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_lxborjzgtwFghrg
+.L_small_initial_partial_block_lxborjzgtwFghrg:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_lxborjzgtwFghrg:
+
+ orq %r8,%r8
+ je .L_after_reduction_lxborjzgtwFghrg
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_lxborjzgtwFghrg:
+ jmp .L_last_blocks_done_GGlifssooGvFomC
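+	/* Tail of exactly 16 blocks: four full ZMMs; the final block always goes through the partial-block bookkeeping below, since it may be short. */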
+.L_last_num_blocks_is_16_GGlifssooGvFomC:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_moDvkAftCFCxmvo
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_moDvkAftCFCxmvo
+
+.L_16_blocks_overflow_moDvkAftCFCxmvo:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_moDvkAftCFCxmvo:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_xrrskpkhizncrkw:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_xrrskpkhizncrkw:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_xrrskpkhizncrkw:
+ jmp .L_last_blocks_done_GGlifssooGvFomC
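+	/* No tail blocks left: only compute and reduce the deferred GHASH of the 16 blocks processed in the last main-loop iteration. */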
+.L_last_num_blocks_is_0_GGlifssooGvFomC:
+ vmovdqa64 1280(%rsp),%zmm13
+ vmovdqu64 512(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1344(%rsp),%zmm13
+ vmovdqu64 576(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1408(%rsp),%zmm13
+ vmovdqu64 640(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1472(%rsp),%zmm13
+ vmovdqu64 704(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_GGlifssooGvFomC:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_yiifChpfBbxhAhe
+
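+	/* Fewer than 32 blocks remain after the main loop: account for the 16 blocks (256 bytes) just consumed, then handle the remainder with the per-size tail code below. */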
+.L_message_below_32_blocks_yiifChpfBbxhAhe:
+
+
+ subq $256,%r8
+ addq $256,%r11
+ movl %r8d,%r10d
+ testq %r14,%r14
+ jnz .L_skip_hkeys_precomputation_ixpbnbdqqmnximo
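+	/* First time through: extend the on-stack table of GHASH key powers (four more ZMM-wide entries at 448..256(%rsp)) by carry-less multiplying the existing powers and reducing modulo the GCM polynomial. */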
+ vmovdqu64 640(%rsp),%zmm3
+
+
+ vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
+
+ vmovdqu64 576(%rsp),%zmm4
+ vmovdqu64 512(%rsp),%zmm5
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,448(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,384(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,320(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,256(%rsp)
+.L_skip_hkeys_precomputation_ixpbnbdqqmnximo:
+ movq $1,%r14
+ andl $~15,%r10d
+ movl $512,%ebx
+ subl %r10d,%ebx
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
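+	/* Dispatch on the number of remaining 16-byte blocks (%r10d, 0..16) through a small comparison tree to the matching tail handler. */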
+ je .L_last_num_blocks_is_0_GEDbrBwahgCtBua
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_GEDbrBwahgCtBua
+ jb .L_last_num_blocks_is_7_1_GEDbrBwahgCtBua
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_GEDbrBwahgCtBua
+ jb .L_last_num_blocks_is_11_9_GEDbrBwahgCtBua
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_GEDbrBwahgCtBua
+ ja .L_last_num_blocks_is_16_GEDbrBwahgCtBua
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_GEDbrBwahgCtBua
+ jmp .L_last_num_blocks_is_13_GEDbrBwahgCtBua
+
+.L_last_num_blocks_is_11_9_GEDbrBwahgCtBua:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_GEDbrBwahgCtBua
+ ja .L_last_num_blocks_is_11_GEDbrBwahgCtBua
+ jmp .L_last_num_blocks_is_9_GEDbrBwahgCtBua
+
+.L_last_num_blocks_is_7_1_GEDbrBwahgCtBua:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_GEDbrBwahgCtBua
+ jb .L_last_num_blocks_is_3_1_GEDbrBwahgCtBua
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_GEDbrBwahgCtBua
+ je .L_last_num_blocks_is_6_GEDbrBwahgCtBua
+ jmp .L_last_num_blocks_is_5_GEDbrBwahgCtBua
+
+.L_last_num_blocks_is_3_1_GEDbrBwahgCtBua:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_GEDbrBwahgCtBua
+ je .L_last_num_blocks_is_2_GEDbrBwahgCtBua
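+	/* Single-block tail: XMM-width AES rounds only, with the load and store masked by %k1 to cover a possibly partial final block. */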
+.L_last_num_blocks_is_1_GEDbrBwahgCtBua:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_uopvqADFnvomDpc
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_uopvqADFnvomDpc
+
+.L_16_blocks_overflow_uopvqADFnvomDpc:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_uopvqADFnvomDpc:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %xmm29,%xmm17,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_DnfzexoyiBDakur
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_DnfzexoyiBDakur
+.L_small_initial_partial_block_DnfzexoyiBDakur:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm0
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
+ vpslldq $8,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm3
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
+ vpsrldq $4,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm4,%xmm14
+
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_DnfzexoyiBDakur
+.L_small_initial_compute_done_DnfzexoyiBDakur:
+.L_after_reduction_DnfzexoyiBDakur:
+ jmp .L_last_blocks_done_GEDbrBwahgCtBua
+.L_last_num_blocks_is_2_GEDbrBwahgCtBua:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_frftcwjeGlwitcu
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_frftcwjeGlwitcu
+
+.L_16_blocks_overflow_frftcwjeGlwitcu:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_frftcwjeGlwitcu:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %ymm29,%ymm17,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ebldtywbExmpuki
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ebldtywbExmpuki
+.L_small_initial_partial_block_ebldtywbExmpuki:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ebldtywbExmpuki:
+
+ orq %r8,%r8
+ je .L_after_reduction_ebldtywbExmpuki
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ebldtywbExmpuki:
+ jmp .L_last_blocks_done_GEDbrBwahgCtBua
+.L_last_num_blocks_is_3_GEDbrBwahgCtBua:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_hAiudycBxwjzccs
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_hAiudycBxwjzccs
+
+.L_16_blocks_overflow_hAiudycBxwjzccs:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_hAiudycBxwjzccs:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_gkjuFBcoGtpvwjC
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_gkjuFBcoGtpvwjC
+.L_small_initial_partial_block_gkjuFBcoGtpvwjC:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_gkjuFBcoGtpvwjC:
+
+ orq %r8,%r8
+ je .L_after_reduction_gkjuFBcoGtpvwjC
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_gkjuFBcoGtpvwjC:
+ jmp .L_last_blocks_done_GEDbrBwahgCtBua
+.L_last_num_blocks_is_4_GEDbrBwahgCtBua:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_oahqGxwjdGuFmgl
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_oahqGxwjdGuFmgl
+
+.L_16_blocks_overflow_oahqGxwjdGuFmgl:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_oahqGxwjdGuFmgl:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_eiywasarDnqsmGr
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_eiywasarDnqsmGr
+.L_small_initial_partial_block_eiywasarDnqsmGr:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_eiywasarDnqsmGr:
+
+ orq %r8,%r8
+ je .L_after_reduction_eiywasarDnqsmGr
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_eiywasarDnqsmGr:
+ jmp .L_last_blocks_done_GEDbrBwahgCtBua
+.L_last_num_blocks_is_5_GEDbrBwahgCtBua:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_hnCCvmCdnDGyqwm
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_hnCCvmCdnDGyqwm
+
+.L_16_blocks_overflow_hnCCvmCdnDGyqwm:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_hnCCvmCdnDGyqwm:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %xmm29,%xmm19,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ClsDvmjDyaivejA
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ClsDvmjDyaivejA
+.L_small_initial_partial_block_ClsDvmjDyaivejA:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ClsDvmjDyaivejA:
+
+ orq %r8,%r8
+ je .L_after_reduction_ClsDvmjDyaivejA
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ClsDvmjDyaivejA:
+ jmp .L_last_blocks_done_GEDbrBwahgCtBua
+.L_last_num_blocks_is_6_GEDbrBwahgCtBua:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_wuftgpncuosGzzy
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_wuftgpncuosGzzy
+
+.L_16_blocks_overflow_wuftgpncuosGzzy:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_wuftgpncuosGzzy:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %ymm29,%ymm19,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_zFcpqFaCfaxEfGi
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_zFcpqFaCfaxEfGi
+.L_small_initial_partial_block_zFcpqFaCfaxEfGi:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_zFcpqFaCfaxEfGi:
+
+ orq %r8,%r8
+ je .L_after_reduction_zFcpqFaCfaxEfGi
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_zFcpqFaCfaxEfGi:
+ jmp .L_last_blocks_done_GEDbrBwahgCtBua
+.L_last_num_blocks_is_7_GEDbrBwahgCtBua:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_klwFEoGBGuBizdw
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_klwFEoGBGuBizdw
+
+.L_16_blocks_overflow_klwFEoGBGuBizdw:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_klwFEoGBGuBizdw:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_xbzdhFqEauEAyBq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_xbzdhFqEauEAyBq
+.L_small_initial_partial_block_xbzdhFqEauEAyBq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_xbzdhFqEauEAyBq:
+
+ orq %r8,%r8
+ je .L_after_reduction_xbzdhFqEauEAyBq
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_xbzdhFqEauEAyBq:
+ jmp .L_last_blocks_done_GEDbrBwahgCtBua
+.L_last_num_blocks_is_8_GEDbrBwahgCtBua:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_jAucrepCBmxevpC
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_jAucrepCBmxevpC
+
+.L_16_blocks_overflow_jAucrepCBmxevpC:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_jAucrepCBmxevpC:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_xBnzffrFrcfhxcA
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_xBnzffrFrcfhxcA
+.L_small_initial_partial_block_xBnzffrFrcfhxcA:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_xBnzffrFrcfhxcA:
+
+ orq %r8,%r8
+ je .L_after_reduction_xBnzffrFrcfhxcA
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_xBnzffrFrcfhxcA:
+ jmp .L_last_blocks_done_GEDbrBwahgCtBua
+.L_last_num_blocks_is_9_GEDbrBwahgCtBua:
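+/*
+ * Nine-block remainder: the mask index is the residual length less 128,
+ * eight counter blocks are carried in %zmm0/%zmm3 and the ninth in
+ * %xmm4, and only the final 64-byte chunk at 128(%rcx,%r11) is loaded
+ * and stored under the %k1 byte mask.
+ */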
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_lnAxGywxkpnspqj
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_lnAxGywxkpnspqj
+
+.L_16_blocks_overflow_lnAxGywxkpnspqj:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_lnAxGywxkpnspqj:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %xmm29,%xmm20,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_AFvqyugwjoGBwEa
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_AFvqyugwjoGBwEa
+.L_small_initial_partial_block_AFvqyugwjoGBwEa:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_AFvqyugwjoGBwEa:
+
+ orq %r8,%r8
+ je .L_after_reduction_AFvqyugwjoGBwEa
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_AFvqyugwjoGBwEa:
+ jmp .L_last_blocks_done_GEDbrBwahgCtBua
+.L_last_num_blocks_is_10_GEDbrBwahgCtBua:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_ffDgumCtogFyFDv
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_ffDgumCtogFyFDv
+
+.L_16_blocks_overflow_ffDgumCtogFyFDv:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_ffDgumCtogFyFDv:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %ymm29,%ymm20,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_erArFgBvhusaEfz
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_erArFgBvhusaEfz
+.L_small_initial_partial_block_erArFgBvhusaEfz:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_erArFgBvhusaEfz:
+
+ orq %r8,%r8
+ je .L_after_reduction_erArFgBvhusaEfz
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_erArFgBvhusaEfz:
+ jmp .L_last_blocks_done_GEDbrBwahgCtBua
+.L_last_num_blocks_is_11_GEDbrBwahgCtBua:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_bFwwBhxumkFGgCj
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_bFwwBhxumkFGgCj
+
+.L_16_blocks_overflow_bFwwBhxumkFGgCj:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_bFwwBhxumkFGgCj:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_GsrdkhxzEjDjspu
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_GsrdkhxzEjDjspu
+.L_small_initial_partial_block_GsrdkhxzEjDjspu:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_GsrdkhxzEjDjspu:
+
+ orq %r8,%r8
+ je .L_after_reduction_GsrdkhxzEjDjspu
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_GsrdkhxzEjDjspu:
+ jmp .L_last_blocks_done_GEDbrBwahgCtBua
+.L_last_num_blocks_is_12_GEDbrBwahgCtBua:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_EhylpkcoptuvDCF
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_EhylpkcoptuvDCF
+
+.L_16_blocks_overflow_EhylpkcoptuvDCF:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_EhylpkcoptuvDCF:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_rxjldaleyvljAtn
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_rxjldaleyvljAtn
+.L_small_initial_partial_block_rxjldaleyvljAtn:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_rxjldaleyvljAtn:
+
+ orq %r8,%r8
+ je .L_after_reduction_rxjldaleyvljAtn
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_rxjldaleyvljAtn:
+ jmp .L_last_blocks_done_GEDbrBwahgCtBua
+.L_last_num_blocks_is_13_GEDbrBwahgCtBua:
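+/*
+ * Thirteen-block remainder: the mask index is the residual length less
+ * 192, twelve counter blocks are carried in %zmm0/%zmm3/%zmm4 and the
+ * thirteenth in %xmm5, with the final 64-byte chunk at 192(%rcx,%r11)
+ * loaded and stored under the %k1 byte mask.
+ */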
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_fbDDAjuqhDzbgcz
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_fbDDAjuqhDzbgcz
+
+.L_16_blocks_overflow_fbDDAjuqhDzbgcz:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_fbDDAjuqhDzbgcz:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %xmm29,%xmm21,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_rvBgbcAEiGvppxE
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_rvBgbcAEiGvppxE
+.L_small_initial_partial_block_rvBgbcAEiGvppxE:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_rvBgbcAEiGvppxE:
+
+ orq %r8,%r8
+ je .L_after_reduction_rvBgbcAEiGvppxE
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_rvBgbcAEiGvppxE:
+ jmp .L_last_blocks_done_GEDbrBwahgCtBua
+.L_last_num_blocks_is_14_GEDbrBwahgCtBua:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_gqnBxnvCCiecpBb
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_gqnBxnvCCiecpBb
+
+.L_16_blocks_overflow_gqnBxnvCCiecpBb:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_gqnBxnvCCiecpBb:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %ymm29,%ymm21,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_eqvhEpqoCboGBGs
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_eqvhEpqoCboGBGs
+.L_small_initial_partial_block_eqvhEpqoCboGBGs:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_eqvhEpqoCboGBGs:
+
+ orq %r8,%r8
+ je .L_after_reduction_eqvhEpqoCboGBGs
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_eqvhEpqoCboGBGs:
+ jmp .L_last_blocks_done_GEDbrBwahgCtBua
+.L_last_num_blocks_is_15_GEDbrBwahgCtBua:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_dnxqlgAbmkEzAAl
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_dnxqlgAbmkEzAAl
+
+.L_16_blocks_overflow_dnxqlgAbmkEzAAl:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_dnxqlgAbmkEzAAl:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_vubecvzrvvmvkjn
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_vubecvzrvvmvkjn
+.L_small_initial_partial_block_vubecvzrvvmvkjn:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_vubecvzrvvmvkjn:
+
+ orq %r8,%r8
+ je .L_after_reduction_vubecvzrvvmvkjn
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_vubecvzrvvmvkjn:
+ jmp .L_last_blocks_done_GEDbrBwahgCtBua
+.L_last_num_blocks_is_16_GEDbrBwahgCtBua:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_CvkndtfiFrebkyC
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_CvkndtfiFrebkyC
+
+.L_16_blocks_overflow_CvkndtfiFrebkyC:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_CvkndtfiFrebkyC:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_lvDgrdjdyCeaixF:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_lvDgrdjdyCeaixF:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_lvDgrdjdyCeaixF:
+ jmp .L_last_blocks_done_GEDbrBwahgCtBua
+.L_last_num_blocks_is_0_GEDbrBwahgCtBua:
+ vmovdqa64 768(%rsp),%zmm13
+ vpxorq %zmm14,%zmm13,%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 832(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpxorq %zmm10,%zmm4,%zmm26
+ vpxorq %zmm6,%zmm0,%zmm24
+ vpxorq %zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 896(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 960(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_GEDbrBwahgCtBua:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_yiifChpfBbxhAhe
+
+.L_message_below_equal_16_blocks_yiifChpfBbxhAhe:
+
+
+ movl %r8d,%r12d
+ addl $15,%r12d
+ shrl $4,%r12d
+ cmpq $8,%r12
+ je .L_small_initial_num_blocks_is_8_mplqBbEupjaGmpE
+ jl .L_small_initial_num_blocks_is_7_1_mplqBbEupjaGmpE
+
+
+ cmpq $12,%r12
+ je .L_small_initial_num_blocks_is_12_mplqBbEupjaGmpE
+ jl .L_small_initial_num_blocks_is_11_9_mplqBbEupjaGmpE
+
+
+ cmpq $16,%r12
+ je .L_small_initial_num_blocks_is_16_mplqBbEupjaGmpE
+ cmpq $15,%r12
+ je .L_small_initial_num_blocks_is_15_mplqBbEupjaGmpE
+ cmpq $14,%r12
+ je .L_small_initial_num_blocks_is_14_mplqBbEupjaGmpE
+ jmp .L_small_initial_num_blocks_is_13_mplqBbEupjaGmpE
+
+.L_small_initial_num_blocks_is_11_9_mplqBbEupjaGmpE:
+
+ cmpq $11,%r12
+ je .L_small_initial_num_blocks_is_11_mplqBbEupjaGmpE
+ cmpq $10,%r12
+ je .L_small_initial_num_blocks_is_10_mplqBbEupjaGmpE
+ jmp .L_small_initial_num_blocks_is_9_mplqBbEupjaGmpE
+
+.L_small_initial_num_blocks_is_7_1_mplqBbEupjaGmpE:
+ cmpq $4,%r12
+ je .L_small_initial_num_blocks_is_4_mplqBbEupjaGmpE
+ jl .L_small_initial_num_blocks_is_3_1_mplqBbEupjaGmpE
+
+ cmpq $7,%r12
+ je .L_small_initial_num_blocks_is_7_mplqBbEupjaGmpE
+ cmpq $6,%r12
+ je .L_small_initial_num_blocks_is_6_mplqBbEupjaGmpE
+ jmp .L_small_initial_num_blocks_is_5_mplqBbEupjaGmpE
+
+.L_small_initial_num_blocks_is_3_1_mplqBbEupjaGmpE:
+
+ cmpq $3,%r12
+ je .L_small_initial_num_blocks_is_3_mplqBbEupjaGmpE
+ cmpq $2,%r12
+ je .L_small_initial_num_blocks_is_2_mplqBbEupjaGmpE
+
+
+
+
+
+.L_small_initial_num_blocks_is_1_mplqBbEupjaGmpE:
+ vmovdqa64 SHUF_MASK(%rip),%xmm29
+ vpaddd ONE(%rip),%xmm2,%xmm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vpshufb %xmm29,%xmm0,%xmm0
+ vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %xmm15,%xmm0,%xmm0
+ vpxorq %xmm6,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %xmm29,%xmm6,%xmm6
+ vextracti32x4 $0,%zmm6,%xmm13
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_nsFdAskshxaeupv
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_nsFdAskshxaeupv
+.L_small_initial_partial_block_nsFdAskshxaeupv:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm13,%xmm14,%xmm14
+
+ jmp .L_after_reduction_nsFdAskshxaeupv
+.L_small_initial_compute_done_nsFdAskshxaeupv:
+.L_after_reduction_nsFdAskshxaeupv:
+ jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE
+.L_small_initial_num_blocks_is_2_mplqBbEupjaGmpE:
+ vmovdqa64 SHUF_MASK(%rip),%ymm29
+ vshufi64x2 $0,%ymm2,%ymm2,%ymm0
+ vpaddd ddq_add_1234(%rip),%ymm0,%ymm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vpshufb %ymm29,%ymm0,%ymm0
+ vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %ymm15,%ymm0,%ymm0
+ vpxorq %ymm6,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %ymm29,%ymm6,%ymm6
+ vextracti32x4 $1,%zmm6,%xmm13
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_fCBepgtpwtinebu
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_fCBepgtpwtinebu
+.L_small_initial_partial_block_fCBepgtpwtinebu:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_fCBepgtpwtinebu:
+
+ orq %r8,%r8
+ je .L_after_reduction_fCBepgtpwtinebu
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_fCBepgtpwtinebu:
+ jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE
+.L_small_initial_num_blocks_is_3_mplqBbEupjaGmpE:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vpxorq %zmm6,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vextracti32x4 $2,%zmm6,%xmm13
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ofgdrgACzgoBoBr
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ofgdrgACzgoBoBr
+.L_small_initial_partial_block_ofgdrgACzgoBoBr:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ofgdrgACzgoBoBr:
+
+ orq %r8,%r8
+ je .L_after_reduction_ofgdrgACzgoBoBr
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_ofgdrgACzgoBoBr:
+ jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE
+.L_small_initial_num_blocks_is_4_mplqBbEupjaGmpE:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vpxorq %zmm6,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vextracti32x4 $3,%zmm6,%xmm13
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_dEtigFagnjrsGpg
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_dEtigFagnjrsGpg
+.L_small_initial_partial_block_dEtigFagnjrsGpg:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_dEtigFagnjrsGpg:
+
+ orq %r8,%r8
+ je .L_after_reduction_dEtigFagnjrsGpg
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_dEtigFagnjrsGpg:
+ jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE
+.L_small_initial_num_blocks_is_5_mplqBbEupjaGmpE:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %xmm15,%xmm3,%xmm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %xmm7,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %xmm29,%xmm7,%xmm7
+ vextracti32x4 $0,%zmm7,%xmm13
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_dCteGnCoiDfemGr
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_dCteGnCoiDfemGr
+.L_small_initial_partial_block_dCteGnCoiDfemGr:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_dCteGnCoiDfemGr:
+
+ orq %r8,%r8
+ je .L_after_reduction_dCteGnCoiDfemGr
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_dCteGnCoiDfemGr:
+ jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE
+.L_small_initial_num_blocks_is_6_mplqBbEupjaGmpE:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %ymm15,%ymm3,%ymm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %ymm7,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %ymm29,%ymm7,%ymm7
+ vextracti32x4 $1,%zmm7,%xmm13
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_bGkgeCcdmBAvnkd
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_bGkgeCcdmBAvnkd
+.L_small_initial_partial_block_bGkgeCcdmBAvnkd:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_bGkgeCcdmBAvnkd:
+
+ orq %r8,%r8
+ je .L_after_reduction_bGkgeCcdmBAvnkd
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_bGkgeCcdmBAvnkd:
+ jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE
+.L_small_initial_num_blocks_is_7_mplqBbEupjaGmpE:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vextracti32x4 $2,%zmm7,%xmm13
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_yFpypBfpEqGmDpc
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_yFpypBfpEqGmDpc
+.L_small_initial_partial_block_yFpypBfpEqGmDpc:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_yFpypBfpEqGmDpc:
+
+ orq %r8,%r8
+ je .L_after_reduction_yFpypBfpEqGmDpc
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_yFpypBfpEqGmDpc:
+ jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE
+.L_small_initial_num_blocks_is_8_mplqBbEupjaGmpE:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vextracti32x4 $3,%zmm7,%xmm13
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_hjijhggGtBGkmFD
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_hjijhggGtBGkmFD
+.L_small_initial_partial_block_hjijhggGtBGkmFD:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_hjijhggGtBGkmFD:
+
+ orq %r8,%r8
+ je .L_after_reduction_hjijhggGtBGkmFD
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_hjijhggGtBGkmFD:
+ jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE
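+/*
+ * From nine blocks up a third group of counters is formed by adding
+ * ddq_add_8888, and the extra ciphertext is loaded into %xmm10, %ymm10
+ * or %zmm10 depending on how many blocks fall into that lane.  The
+ * starting offset into the H-power table at (%rsi) drops by 16 bytes for
+ * each additional block so that every block is multiplied by its
+ * matching power of H before the shared reduction.
+ */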
+.L_small_initial_num_blocks_is_9_mplqBbEupjaGmpE:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %xmm15,%xmm4,%xmm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %xmm10,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %xmm29,%xmm10,%xmm10
+ vextracti32x4 $0,%zmm10,%xmm13
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_rEnEygbAhbwkuDv
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_rEnEygbAhbwkuDv
+.L_small_initial_partial_block_rEnEygbAhbwkuDv:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_rEnEygbAhbwkuDv:
+
+ orq %r8,%r8
+ je .L_after_reduction_rEnEygbAhbwkuDv
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_rEnEygbAhbwkuDv:
+ jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE
+.L_small_initial_num_blocks_is_10_mplqBbEupjaGmpE:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %ymm15,%ymm4,%ymm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %ymm10,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %ymm29,%ymm10,%ymm10
+ vextracti32x4 $1,%zmm10,%xmm13
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ycofttvCgGxDvfA
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ycofttvCgGxDvfA
+.L_small_initial_partial_block_ycofttvCgGxDvfA:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ycofttvCgGxDvfA:
+
+ orq %r8,%r8
+ je .L_after_reduction_ycofttvCgGxDvfA
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_ycofttvCgGxDvfA:
+ jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE
+.L_small_initial_num_blocks_is_11_mplqBbEupjaGmpE:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vextracti32x4 $2,%zmm10,%xmm13
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ltkvxnnCtyaDcot
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ltkvxnnCtyaDcot
+.L_small_initial_partial_block_ltkvxnnCtyaDcot:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ltkvxnnCtyaDcot:
+
+ orq %r8,%r8
+ je .L_after_reduction_ltkvxnnCtyaDcot
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_ltkvxnnCtyaDcot:
+ jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE
+.L_small_initial_num_blocks_is_12_mplqBbEupjaGmpE:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vextracti32x4 $3,%zmm10,%xmm13
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_zoBxutsDfgEkfdl
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 160(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_zoBxutsDfgEkfdl
+.L_small_initial_partial_block_zoBxutsDfgEkfdl:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_zoBxutsDfgEkfdl:
+
+ orq %r8,%r8
+ je .L_after_reduction_zoBxutsDfgEkfdl
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_zoBxutsDfgEkfdl:
+ jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE
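+/*
+ * Thirteen to sixteen blocks add a fourth lane: ddq_add_8888 applied to
+ * %zmm3 yields the extra counters in %zmm5, the tail ciphertext goes
+ * into %xmm11/%ymm11/%zmm11 under the %k1 mask, and the partial GHASH
+ * products are folded with three-way XORs (vpternlogq $0x96) before the
+ * usual POLY2 reduction.
+ */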
+.L_small_initial_num_blocks_is_13_mplqBbEupjaGmpE:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %xmm15,%xmm5,%xmm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %xmm11,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %xmm29,%xmm11,%xmm11
+ vextracti32x4 $0,%zmm11,%xmm13
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_fgsEocrdhfxmzmp
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 144(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_fgsEocrdhfxmzmp
+.L_small_initial_partial_block_fgsEocrdhfxmzmp:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 160(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_fgsEocrdhfxmzmp:
+
+ orq %r8,%r8
+ je .L_after_reduction_fgsEocrdhfxmzmp
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_fgsEocrdhfxmzmp:
+ jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE
+.L_small_initial_num_blocks_is_14_mplqBbEupjaGmpE:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %ymm15,%ymm5,%ymm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %ymm11,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %ymm29,%ymm11,%ymm11
+ vextracti32x4 $1,%zmm11,%xmm13
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_aBllprqbyydDmyj
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 128(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_aBllprqbyydDmyj
+.L_small_initial_partial_block_aBllprqbyydDmyj:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 144(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_aBllprqbyydDmyj:
+
+ orq %r8,%r8
+ je .L_after_reduction_aBllprqbyydDmyj
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_aBllprqbyydDmyj:
+ jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE
+.L_small_initial_num_blocks_is_15_mplqBbEupjaGmpE:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %zmm15,%zmm5,%zmm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %zmm11,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vextracti32x4 $2,%zmm11,%xmm13
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_AexewybgiAbCusw
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 112(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_AexewybgiAbCusw
+.L_small_initial_partial_block_AexewybgiAbCusw:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 128(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_AexewybgiAbCusw:
+
+ orq %r8,%r8
+ je .L_after_reduction_AexewybgiAbCusw
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_AexewybgiAbCusw:
+ jmp .L_small_initial_blocks_encrypted_mplqBbEupjaGmpE
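+/*
+ * The sixteen-block case below has no full-block shortcut: it always
+ * records partial-block state (residual byte count at (%rdx), last
+ * block at 16(%rsi)) before folding all four lanes into the hash and
+ * reducing.
+ */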
+.L_small_initial_num_blocks_is_16_mplqBbEupjaGmpE:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %zmm15,%zmm5,%zmm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %zmm11,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vextracti32x4 $3,%zmm11,%xmm13
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_wjciopnfEgwwghE:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 112(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_wjciopnfEgwwghE:
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_wjciopnfEgwwghE:
+.L_small_initial_blocks_encrypted_mplqBbEupjaGmpE:
+.L_ghash_done_yiifChpfBbxhAhe:
+ vmovdqu64 %xmm2,0(%rsi)
+ vmovdqu64 %xmm14,64(%rsi)
+.L_enc_dec_done_yiifChpfBbxhAhe:
+ jmp .Lexit_gcm_decrypt
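+/*
+ * AES-256 (14-round) GCM decrypt path.  Register usage in the generated
+ * code: %rdi = AES round keys, %rsi = GCM state (counter block at 0,
+ * GHASH digest at 64, key powers above that), %rdx -> pending
+ * partial-block byte count, %rcx = ciphertext, %r9 = plaintext,
+ * %r8 = byte count.  Bulk data is processed in 48-block (768-byte)
+ * stripes with the AES-CTR rounds interleaved with the GHASH multiplies.
+ */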
+.align 32
+.Laes_gcm_decrypt_256_avx512:
+ orq %r8,%r8
+ je .L_enc_dec_done_kgypzeldFqsBnqw
+ xorq %r14,%r14
+ vmovdqu64 64(%rsi),%xmm14
+
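+/*
+ * Finish a partial block carried over from a previous call: mask in the
+ * new ciphertext bytes, write the recovered plaintext, and once the
+ * block is complete fold it into the GHASH digest in %xmm14 (carry-less
+ * multiply plus POLY2 reduction); otherwise just update the count at
+ * (%rdx).
+ */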
+ movq (%rdx),%r11
+ orq %r11,%r11
+ je .L_partial_block_done_nggFpEjksmvdyrl
+ movl $16,%r10d
+ leaq byte_len_to_mask_table(%rip),%r12
+ cmpq %r10,%r8
+ cmovcq %r8,%r10
+ kmovw (%r12,%r10,2),%k1
+ vmovdqu8 (%rcx),%xmm0{%k1}{z}
+
+ vmovdqu64 16(%rsi),%xmm3
+ vmovdqu64 336(%rsi),%xmm4
+
+
+
+ leaq SHIFT_MASK(%rip),%r12
+ addq %r11,%r12
+ vmovdqu64 (%r12),%xmm5
+ vpshufb %xmm5,%xmm3,%xmm3
+
+ vmovdqa64 %xmm0,%xmm6
+ vpxorq %xmm0,%xmm3,%xmm3
+
+
+ leaq (%r8,%r11,1),%r13
+ subq $16,%r13
+ jge .L_no_extra_mask_nggFpEjksmvdyrl
+ subq %r13,%r12
+.L_no_extra_mask_nggFpEjksmvdyrl:
+
+
+
+ vmovdqu64 16(%r12),%xmm0
+ vpand %xmm0,%xmm3,%xmm3
+ vpand %xmm0,%xmm6,%xmm6
+ vpshufb SHUF_MASK(%rip),%xmm6,%xmm6
+ vpshufb %xmm5,%xmm6,%xmm6
+ vpxorq %xmm6,%xmm14,%xmm14
+ cmpq $0,%r13
+ jl .L_partial_incomplete_nggFpEjksmvdyrl
+
+ vpclmulqdq $0x11,%xmm4,%xmm14,%xmm7
+ vpclmulqdq $0x00,%xmm4,%xmm14,%xmm10
+ vpclmulqdq $0x01,%xmm4,%xmm14,%xmm11
+ vpclmulqdq $0x10,%xmm4,%xmm14,%xmm14
+ vpxorq %xmm11,%xmm14,%xmm14
+
+ vpsrldq $8,%xmm14,%xmm11
+ vpslldq $8,%xmm14,%xmm14
+ vpxorq %xmm11,%xmm7,%xmm7
+ vpxorq %xmm10,%xmm14,%xmm14
+
+
+
+ vmovdqu64 POLY2(%rip),%xmm11
+
+ vpclmulqdq $0x01,%xmm14,%xmm11,%xmm10
+ vpslldq $8,%xmm10,%xmm10
+ vpxorq %xmm10,%xmm14,%xmm14
+
+
+
+ vpclmulqdq $0x00,%xmm14,%xmm11,%xmm10
+ vpsrldq $4,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm14,%xmm11,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+
+ vpternlogq $0x96,%xmm10,%xmm7,%xmm14
+
+ movq $0,(%rdx)
+
+ movq %r11,%r12
+ movq $16,%r11
+ subq %r12,%r11
+ jmp .L_enc_dec_done_nggFpEjksmvdyrl
+
+.L_partial_incomplete_nggFpEjksmvdyrl:
+ addq %r8,(%rdx)
+ movq %r8,%r11
+
+.L_enc_dec_done_nggFpEjksmvdyrl:
+
+
+ leaq byte_len_to_mask_table(%rip),%r12
+ kmovw (%r12,%r11,2),%k1
+ vmovdqu64 %xmm14,64(%rsi)
+ movq %r9,%r12
+ vmovdqu8 %xmm3,(%r12){%k1}
+.L_partial_block_done_nggFpEjksmvdyrl:
+ vmovdqu64 0(%rsi),%xmm2
+ subq %r11,%r8
+ je .L_enc_dec_done_kgypzeldFqsBnqw
+ cmpq $256,%r8
+ jbe .L_message_below_equal_16_blocks_kgypzeldFqsBnqw
+
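+/*
+ * More than 16 blocks: set up the byte-swap mask (%zmm29) and the
+ * counter-increment constants (%zmm27/%zmm28); %r15b tracks the low
+ * counter byte for the wrap checks below.
+ */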
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vmovdqa64 ddq_addbe_4444(%rip),%zmm27
+ vmovdqa64 ddq_addbe_1234(%rip),%zmm28
+
+
+
+
+
+
+ vmovd %xmm2,%r15d
+ andl $255,%r15d
+
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpshufb %zmm29,%zmm2,%zmm2
+
+
+
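+/*
+ * Counter blocks are normally advanced with the pre-byte-swapped
+ * constants (ddq_addbe_*), which is valid only while no carry leaves
+ * the low counter byte; if %r15b shows the next 16 increments would
+ * wrap, the blocks are byte-swapped, advanced with ddq_add_*, and
+ * swapped back.
+ */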
+ cmpb $240,%r15b
+ jae .L_next_16_overflow_tAigrohrtcimtjt
+ vpaddd %zmm28,%zmm2,%zmm7
+ vpaddd %zmm27,%zmm7,%zmm10
+ vpaddd %zmm27,%zmm10,%zmm11
+ vpaddd %zmm27,%zmm11,%zmm12
+ jmp .L_next_16_ok_tAigrohrtcimtjt
+.L_next_16_overflow_tAigrohrtcimtjt:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm12
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
+ vpaddd %zmm12,%zmm7,%zmm10
+ vpaddd %zmm12,%zmm10,%zmm11
+ vpaddd %zmm12,%zmm11,%zmm12
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vpshufb %zmm29,%zmm12,%zmm12
+.L_next_16_ok_tAigrohrtcimtjt:
+ vshufi64x2 $255,%zmm12,%zmm12,%zmm2
+ addb $16,%r15b
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm0
+ vmovdqu8 64(%rcx,%r11,1),%zmm3
+ vmovdqu8 128(%rcx,%r11,1),%zmm4
+ vmovdqu8 192(%rcx,%r11,1),%zmm5
+
+
+ vbroadcastf64x2 0(%rdi),%zmm6
+ vpxorq %zmm6,%zmm7,%zmm7
+ vpxorq %zmm6,%zmm10,%zmm10
+ vpxorq %zmm6,%zmm11,%zmm11
+ vpxorq %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 16(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 32(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 48(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 64(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 80(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 96(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 112(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 128(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 144(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 160(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 176(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 192(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 208(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 224(%rdi),%zmm6
+ vaesenclast %zmm6,%zmm7,%zmm7
+ vaesenclast %zmm6,%zmm10,%zmm10
+ vaesenclast %zmm6,%zmm11,%zmm11
+ vaesenclast %zmm6,%zmm12,%zmm12
+
+
+ vpxorq %zmm0,%zmm7,%zmm7
+ vpxorq %zmm3,%zmm10,%zmm10
+ vpxorq %zmm4,%zmm11,%zmm11
+ vpxorq %zmm5,%zmm12,%zmm12
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm7,0(%r10,%r11,1)
+ vmovdqu8 %zmm10,64(%r10,%r11,1)
+ vmovdqu8 %zmm11,128(%r10,%r11,1)
+ vmovdqu8 %zmm12,192(%r10,%r11,1)
+
+ vpshufb %zmm29,%zmm0,%zmm7
+ vpshufb %zmm29,%zmm3,%zmm10
+ vpshufb %zmm29,%zmm4,%zmm11
+ vpshufb %zmm29,%zmm5,%zmm12
+ vmovdqa64 %zmm7,768(%rsp)
+ vmovdqa64 %zmm10,832(%rsp)
+ vmovdqa64 %zmm11,896(%rsp)
+ vmovdqa64 %zmm12,960(%rsp)
+ testq %r14,%r14
+ jnz .L_skip_hkeys_precomputation_ghxCyjhEqsFobgk
+
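+/*
+ * First pass only (%r14 is still zero): stage the precomputed GHASH key
+ * powers from the session state onto the stack for the stripe loop.
+ */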
+ vmovdqu64 288(%rsi),%zmm0
+ vmovdqu64 %zmm0,704(%rsp)
+
+ vmovdqu64 224(%rsi),%zmm3
+ vmovdqu64 %zmm3,640(%rsp)
+
+
+ vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
+
+ vmovdqu64 160(%rsi),%zmm4
+ vmovdqu64 %zmm4,576(%rsp)
+
+ vmovdqu64 96(%rsi),%zmm5
+ vmovdqu64 %zmm5,512(%rsp)
+.L_skip_hkeys_precomputation_ghxCyjhEqsFobgk:
+ cmpq $512,%r8
+ jb .L_message_below_32_blocks_kgypzeldFqsBnqw
+
+
+
+ cmpb $240,%r15b
+ jae .L_next_16_overflow_ChqoygvwrfptFdk
+ vpaddd %zmm28,%zmm2,%zmm7
+ vpaddd %zmm27,%zmm7,%zmm10
+ vpaddd %zmm27,%zmm10,%zmm11
+ vpaddd %zmm27,%zmm11,%zmm12
+ jmp .L_next_16_ok_ChqoygvwrfptFdk
+.L_next_16_overflow_ChqoygvwrfptFdk:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm12
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm7
+ vpaddd %zmm12,%zmm7,%zmm10
+ vpaddd %zmm12,%zmm10,%zmm11
+ vpaddd %zmm12,%zmm11,%zmm12
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vpshufb %zmm29,%zmm12,%zmm12
+.L_next_16_ok_ChqoygvwrfptFdk:
+ vshufi64x2 $255,%zmm12,%zmm12,%zmm2
+ addb $16,%r15b
+
+ vmovdqu8 256(%rcx,%r11,1),%zmm0
+ vmovdqu8 320(%rcx,%r11,1),%zmm3
+ vmovdqu8 384(%rcx,%r11,1),%zmm4
+ vmovdqu8 448(%rcx,%r11,1),%zmm5
+
+
+ vbroadcastf64x2 0(%rdi),%zmm6
+ vpxorq %zmm6,%zmm7,%zmm7
+ vpxorq %zmm6,%zmm10,%zmm10
+ vpxorq %zmm6,%zmm11,%zmm11
+ vpxorq %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 16(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 32(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 48(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 64(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 80(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 96(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 112(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 128(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 144(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 160(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 176(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 192(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 208(%rdi),%zmm6
+ vaesenc %zmm6,%zmm7,%zmm7
+ vaesenc %zmm6,%zmm10,%zmm10
+ vaesenc %zmm6,%zmm11,%zmm11
+ vaesenc %zmm6,%zmm12,%zmm12
+ vbroadcastf64x2 224(%rdi),%zmm6
+ vaesenclast %zmm6,%zmm7,%zmm7
+ vaesenclast %zmm6,%zmm10,%zmm10
+ vaesenclast %zmm6,%zmm11,%zmm11
+ vaesenclast %zmm6,%zmm12,%zmm12
+
+
+ vpxorq %zmm0,%zmm7,%zmm7
+ vpxorq %zmm3,%zmm10,%zmm10
+ vpxorq %zmm4,%zmm11,%zmm11
+ vpxorq %zmm5,%zmm12,%zmm12
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm7,256(%r10,%r11,1)
+ vmovdqu8 %zmm10,320(%r10,%r11,1)
+ vmovdqu8 %zmm11,384(%r10,%r11,1)
+ vmovdqu8 %zmm12,448(%r10,%r11,1)
+
+ vpshufb %zmm29,%zmm0,%zmm7
+ vpshufb %zmm29,%zmm3,%zmm10
+ vpshufb %zmm29,%zmm4,%zmm11
+ vpshufb %zmm29,%zmm5,%zmm12
+ vmovdqa64 %zmm7,1024(%rsp)
+ vmovdqa64 %zmm10,1088(%rsp)
+ vmovdqa64 %zmm11,1152(%rsp)
+ vmovdqa64 %zmm12,1216(%rsp)
+ testq %r14,%r14
+ jnz .L_skip_hkeys_precomputation_mmnytfEfrGqjjzv
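+/*
+ * Still the first pass: extend the stack-resident table of GHASH key
+ * powers.  Each block below carry-less-multiplies a previously stored
+ * group by the broadcast key power in %zmm3 and reduces the product
+ * with POLY2, filling 448(%rsp) down to 0(%rsp).
+ */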
+ vmovdqu64 640(%rsp),%zmm3
+
+
+ vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
+
+ vmovdqu64 576(%rsp),%zmm4
+ vmovdqu64 512(%rsp),%zmm5
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,448(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,384(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,320(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,256(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,192(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,128(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,64(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,0(%rsp)
+.L_skip_hkeys_precomputation_mmnytfEfrGqjjzv:
+ movq $1,%r14
+ addq $512,%r11
+ subq $512,%r8
+
+ cmpq $768,%r8
+ jb .L_no_more_big_nblocks_kgypzeldFqsBnqw
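+/*
+ * Main loop: each iteration decrypts 48 blocks (768 bytes) as three
+ * 16-block stripes, interleaving the AES rounds with GHASH multiplies
+ * of the previous iteration's buffered ciphertext; the current digest
+ * (%zmm14) is folded into the first buffered block (768(%rsp)).  Each
+ * stripe's ciphertext is byte-swapped and buffered on the stack for the
+ * next round of hashing.
+ */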
+.L_encrypt_big_nblocks_kgypzeldFqsBnqw:
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_eCBAbsCxcdjldmp
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_eCBAbsCxcdjldmp
+.L_16_blocks_overflow_eCBAbsCxcdjldmp:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_eCBAbsCxcdjldmp:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm15,%zmm10,%zmm26
+ vpxorq %zmm12,%zmm6,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1)
+ vpshufb %zmm29,%zmm17,%zmm0
+ vpshufb %zmm29,%zmm19,%zmm3
+ vpshufb %zmm29,%zmm20,%zmm4
+ vpshufb %zmm29,%zmm21,%zmm5
+ vmovdqa64 %zmm0,1280(%rsp)
+ vmovdqa64 %zmm3,1344(%rsp)
+ vmovdqa64 %zmm4,1408(%rsp)
+ vmovdqa64 %zmm5,1472(%rsp)
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_vakicEdockyEGlr
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_vakicEdockyEGlr
+.L_16_blocks_overflow_vakicEdockyEGlr:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_vakicEdockyEGlr:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 256(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 320(%rsp),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 384(%rsp),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 448(%rsp),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 256(%rcx,%r11,1),%zmm17
+ vmovdqu8 320(%rcx,%r11,1),%zmm19
+ vmovdqu8 384(%rcx,%r11,1),%zmm20
+ vmovdqu8 448(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vpternlogq $0x96,%zmm12,%zmm6,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,256(%r10,%r11,1)
+ vmovdqu8 %zmm3,320(%r10,%r11,1)
+ vmovdqu8 %zmm4,384(%r10,%r11,1)
+ vmovdqu8 %zmm5,448(%r10,%r11,1)
+ vpshufb %zmm29,%zmm17,%zmm0
+ vpshufb %zmm29,%zmm19,%zmm3
+ vpshufb %zmm29,%zmm20,%zmm4
+ vpshufb %zmm29,%zmm21,%zmm5
+ vmovdqa64 %zmm0,768(%rsp)
+ vmovdqa64 %zmm3,832(%rsp)
+ vmovdqa64 %zmm4,896(%rsp)
+ vmovdqa64 %zmm5,960(%rsp)
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_DpGlguFoEuofxlo
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_DpGlguFoEuofxlo
+.L_16_blocks_overflow_DpGlguFoEuofxlo:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_DpGlguFoEuofxlo:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 512(%rcx,%r11,1),%zmm17
+ vmovdqu8 576(%rcx,%r11,1),%zmm19
+ vmovdqu8 640(%rcx,%r11,1),%zmm20
+ vmovdqu8 704(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+
+
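+/*
+ * Fold the four 128-bit lanes of the GHASH accumulators and perform the
+ * two-step POLY2 reduction (the vpternlogq $0x96 ops are three-way
+ * XORs), interleaved with the remaining AES rounds; the reduced digest
+ * lands in %xmm6 and is copied to %zmm14 at the end of this stripe.
+ */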
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpternlogq $0x96,%zmm15,%zmm12,%zmm6
+ vpxorq %zmm24,%zmm6,%zmm6
+ vpternlogq $0x96,%zmm10,%zmm13,%zmm7
+ vpxorq %zmm25,%zmm7,%zmm7
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vextracti64x4 $1,%zmm6,%ymm12
+ vpxorq %ymm12,%ymm6,%ymm6
+ vextracti32x4 $1,%ymm6,%xmm12
+ vpxorq %xmm12,%xmm6,%xmm6
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm6
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,512(%r10,%r11,1)
+ vmovdqu8 %zmm3,576(%r10,%r11,1)
+ vmovdqu8 %zmm4,640(%r10,%r11,1)
+ vmovdqu8 %zmm5,704(%r10,%r11,1)
+ vpshufb %zmm29,%zmm17,%zmm0
+ vpshufb %zmm29,%zmm19,%zmm3
+ vpshufb %zmm29,%zmm20,%zmm4
+ vpshufb %zmm29,%zmm21,%zmm5
+ vmovdqa64 %zmm0,1024(%rsp)
+ vmovdqa64 %zmm3,1088(%rsp)
+ vmovdqa64 %zmm4,1152(%rsp)
+ vmovdqa64 %zmm5,1216(%rsp)
+ vmovdqa64 %zmm6,%zmm14
+
+ addq $768,%r11
+ subq $768,%r8
+ cmpq $768,%r8
+ jae .L_encrypt_big_nblocks_kgypzeldFqsBnqw
+
+.L_no_more_big_nblocks_kgypzeldFqsBnqw:
+
+ cmpq $512,%r8
+ jae .L_encrypt_32_blocks_kgypzeldFqsBnqw
+
+ cmpq $256,%r8
+ jae .L_encrypt_16_blocks_kgypzeldFqsBnqw
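+/*
+ * Fewer than 16 blocks remain.  Hash the ciphertext still buffered on
+ * the stack, indexing the key powers by %rbx so they line up with the
+ * total block count, then dispatch on the number of remaining blocks
+ * via the compare tree below.
+ */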
+.L_encrypt_0_blocks_ghash_32_kgypzeldFqsBnqw:
+ movl %r8d,%r10d
+ andl $~15,%r10d
+ movl $256,%ebx
+ subl %r10d,%ebx
+ vmovdqa64 768(%rsp),%zmm13
+ vpxorq %zmm14,%zmm13,%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 832(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpxorq %zmm10,%zmm4,%zmm26
+ vpxorq %zmm6,%zmm0,%zmm24
+ vpxorq %zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 896(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 960(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ addl $256,%ebx
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_ClvEnqtsgcyzxra
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_ClvEnqtsgcyzxra
+ jb .L_last_num_blocks_is_7_1_ClvEnqtsgcyzxra
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_ClvEnqtsgcyzxra
+ jb .L_last_num_blocks_is_11_9_ClvEnqtsgcyzxra
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_ClvEnqtsgcyzxra
+ ja .L_last_num_blocks_is_16_ClvEnqtsgcyzxra
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_ClvEnqtsgcyzxra
+ jmp .L_last_num_blocks_is_13_ClvEnqtsgcyzxra
+
+.L_last_num_blocks_is_11_9_ClvEnqtsgcyzxra:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_ClvEnqtsgcyzxra
+ ja .L_last_num_blocks_is_11_ClvEnqtsgcyzxra
+ jmp .L_last_num_blocks_is_9_ClvEnqtsgcyzxra
+
+.L_last_num_blocks_is_7_1_ClvEnqtsgcyzxra:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_ClvEnqtsgcyzxra
+ jb .L_last_num_blocks_is_3_1_ClvEnqtsgcyzxra
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_ClvEnqtsgcyzxra
+ je .L_last_num_blocks_is_6_ClvEnqtsgcyzxra
+ jmp .L_last_num_blocks_is_5_ClvEnqtsgcyzxra
+
+.L_last_num_blocks_is_3_1_ClvEnqtsgcyzxra:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_ClvEnqtsgcyzxra
+ je .L_last_num_blocks_is_2_ClvEnqtsgcyzxra
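+/*
+ * Each .L_last_num_blocks_is_N case below generates N final counter
+ * blocks (the last load/store goes through a byte mask for a partial
+ * block), decrypts them, and folds the ciphertext into GHASH together
+ * with the still-buffered blocks.  If the tail is not a full block, the
+ * leftover byte count is stored at (%rdx) and the in-progress block at
+ * 16(%rsi) for a later call.
+ */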
+.L_last_num_blocks_is_1_ClvEnqtsgcyzxra:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_kfstzqbddCmrAgf
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_kfstzqbddCmrAgf
+
+.L_16_blocks_overflow_kfstzqbddCmrAgf:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_kfstzqbddCmrAgf:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %xmm29,%xmm17,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_tzfDxgvlfbGFphv
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_tzfDxgvlfbGFphv
+.L_small_initial_partial_block_tzfDxgvlfbGFphv:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm0
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
+ vpslldq $8,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm3
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
+ vpsrldq $4,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm4,%xmm14
+
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_tzfDxgvlfbGFphv
+.L_small_initial_compute_done_tzfDxgvlfbGFphv:
+.L_after_reduction_tzfDxgvlfbGFphv:
+ jmp .L_last_blocks_done_ClvEnqtsgcyzxra
+.L_last_num_blocks_is_2_ClvEnqtsgcyzxra:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_rEDkqlsspBphEcE
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_rEDkqlsspBphEcE
+
+.L_16_blocks_overflow_rEDkqlsspBphEcE:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_rEDkqlsspBphEcE:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %ymm29,%ymm17,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ctfxgFaGttixvxc
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ctfxgFaGttixvxc
+.L_small_initial_partial_block_ctfxgFaGttixvxc:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ctfxgFaGttixvxc:
+
+ orq %r8,%r8
+ je .L_after_reduction_ctfxgFaGttixvxc
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ctfxgFaGttixvxc:
+ jmp .L_last_blocks_done_ClvEnqtsgcyzxra
+.L_last_num_blocks_is_3_ClvEnqtsgcyzxra:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_ghEEltEpFsCnyoi
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_ghEEltEpFsCnyoi
+
+.L_16_blocks_overflow_ghEEltEpFsCnyoi:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_ghEEltEpFsCnyoi:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_pdGCGzyrnusufbk
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_pdGCGzyrnusufbk
+.L_small_initial_partial_block_pdGCGzyrnusufbk:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_pdGCGzyrnusufbk:
+
+ orq %r8,%r8
+ je .L_after_reduction_pdGCGzyrnusufbk
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_pdGCGzyrnusufbk:
+ jmp .L_last_blocks_done_ClvEnqtsgcyzxra
+.L_last_num_blocks_is_4_ClvEnqtsgcyzxra:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_vrGynyzBBkFtoug
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_vrGynyzBBkFtoug
+
+.L_16_blocks_overflow_vrGynyzBBkFtoug:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_vrGynyzBBkFtoug:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_vbpuzolxwysglov
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_vbpuzolxwysglov
+.L_small_initial_partial_block_vbpuzolxwysglov:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_vbpuzolxwysglov:
+
+ orq %r8,%r8
+ je .L_after_reduction_vbpuzolxwysglov
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_vbpuzolxwysglov:
+ jmp .L_last_blocks_done_ClvEnqtsgcyzxra
+.L_last_num_blocks_is_5_ClvEnqtsgcyzxra:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_kkiaoGfqlrecpbg
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_kkiaoGfqlrecpbg
+
+.L_16_blocks_overflow_kkiaoGfqlrecpbg:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_kkiaoGfqlrecpbg:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %xmm29,%xmm19,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ephjiBFojtbqzgd
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ephjiBFojtbqzgd
+.L_small_initial_partial_block_ephjiBFojtbqzgd:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ephjiBFojtbqzgd:
+
+ orq %r8,%r8
+ je .L_after_reduction_ephjiBFojtbqzgd
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ephjiBFojtbqzgd:
+ jmp .L_last_blocks_done_ClvEnqtsgcyzxra
+.L_last_num_blocks_is_6_ClvEnqtsgcyzxra:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_BGjhpBrnvbegsga
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_BGjhpBrnvbegsga
+
+.L_16_blocks_overflow_BGjhpBrnvbegsga:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_BGjhpBrnvbegsga:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %ymm29,%ymm19,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_fcljjovquiEbomB
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_fcljjovquiEbomB
+.L_small_initial_partial_block_fcljjovquiEbomB:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_fcljjovquiEbomB:
+
+ orq %r8,%r8
+ je .L_after_reduction_fcljjovquiEbomB
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_fcljjovquiEbomB:
+ jmp .L_last_blocks_done_ClvEnqtsgcyzxra
+.L_last_num_blocks_is_7_ClvEnqtsgcyzxra:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_izrwrwtizdFmmop
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_izrwrwtizdFmmop
+
+.L_16_blocks_overflow_izrwrwtizdFmmop:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_izrwrwtizdFmmop:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_BGxuGiljxiGuGwj
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_BGxuGiljxiGuGwj
+.L_small_initial_partial_block_BGxuGiljxiGuGwj:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_BGxuGiljxiGuGwj:
+
+ orq %r8,%r8
+ je .L_after_reduction_BGxuGiljxiGuGwj
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_BGxuGiljxiGuGwj:
+ jmp .L_last_blocks_done_ClvEnqtsgcyzxra
+.L_last_num_blocks_is_8_ClvEnqtsgcyzxra:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_uokAwEtutqrxEoF
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_uokAwEtutqrxEoF
+
+.L_16_blocks_overflow_uokAwEtutqrxEoF:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_uokAwEtutqrxEoF:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_CannrFuxFceaxhk
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_CannrFuxFceaxhk
+.L_small_initial_partial_block_CannrFuxFceaxhk:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_CannrFuxFceaxhk:
+
+ orq %r8,%r8
+ je .L_after_reduction_CannrFuxFceaxhk
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_CannrFuxFceaxhk:
+ jmp .L_last_blocks_done_ClvEnqtsgcyzxra
+.L_last_num_blocks_is_9_ClvEnqtsgcyzxra:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_ydCuzccyysxjEtE
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_ydCuzccyysxjEtE
+
+.L_16_blocks_overflow_ydCuzccyysxjEtE:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_ydCuzccyysxjEtE:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %xmm29,%xmm20,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_hlxwfcoEeochjmF
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_hlxwfcoEeochjmF
+.L_small_initial_partial_block_hlxwfcoEeochjmF:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_hlxwfcoEeochjmF:
+
+ orq %r8,%r8
+ je .L_after_reduction_hlxwfcoEeochjmF
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_hlxwfcoEeochjmF:
+ jmp .L_last_blocks_done_ClvEnqtsgcyzxra
+.L_last_num_blocks_is_10_ClvEnqtsgcyzxra:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_uhxcibFtDluhCCB
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_uhxcibFtDluhCCB
+
+.L_16_blocks_overflow_uhxcibFtDluhCCB:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_uhxcibFtDluhCCB:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %ymm29,%ymm20,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_uwCCphGGeEaqtbf
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_uwCCphGGeEaqtbf
+.L_small_initial_partial_block_uwCCphGGeEaqtbf:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_uwCCphGGeEaqtbf:
+
+ orq %r8,%r8
+ je .L_after_reduction_uwCCphGGeEaqtbf
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_uwCCphGGeEaqtbf:
+ jmp .L_last_blocks_done_ClvEnqtsgcyzxra
+.L_last_num_blocks_is_11_ClvEnqtsgcyzxra:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_ndAbfmoGyFeFtFs
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_ndAbfmoGyFeFtFs
+
+.L_16_blocks_overflow_ndAbfmoGyFeFtFs:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_ndAbfmoGyFeFtFs:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_tojfqqaoGtkzuaq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_tojfqqaoGtkzuaq
+.L_small_initial_partial_block_tojfqqaoGtkzuaq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_tojfqqaoGtkzuaq:
+
+ orq %r8,%r8
+ je .L_after_reduction_tojfqqaoGtkzuaq
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_tojfqqaoGtkzuaq:
+ jmp .L_last_blocks_done_ClvEnqtsgcyzxra
+.L_last_num_blocks_is_12_ClvEnqtsgcyzxra:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_rwelfyvzphiDsjE
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_rwelfyvzphiDsjE
+
+.L_16_blocks_overflow_rwelfyvzphiDsjE:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_rwelfyvzphiDsjE:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_CzrAuaBADCucxbj
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_CzrAuaBADCucxbj
+.L_small_initial_partial_block_CzrAuaBADCucxbj:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_CzrAuaBADCucxbj:
+
+ orq %r8,%r8
+ je .L_after_reduction_CzrAuaBADCucxbj
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_CzrAuaBADCucxbj:
+ jmp .L_last_blocks_done_ClvEnqtsgcyzxra
+.L_last_num_blocks_is_13_ClvEnqtsgcyzxra:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_aizclGCjAeGBapi
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_aizclGCjAeGBapi
+
+.L_16_blocks_overflow_aizclGCjAeGBapi:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_aizclGCjAeGBapi:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %xmm29,%xmm21,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_rsvakfaFrrcdnmn
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_rsvakfaFrrcdnmn
+.L_small_initial_partial_block_rsvakfaFrrcdnmn:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_rsvakfaFrrcdnmn:
+
+ orq %r8,%r8
+ je .L_after_reduction_rsvakfaFrrcdnmn
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_rsvakfaFrrcdnmn:
+ jmp .L_last_blocks_done_ClvEnqtsgcyzxra
+.L_last_num_blocks_is_14_ClvEnqtsgcyzxra:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_CifFuwhmDnsajva
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_CifFuwhmDnsajva
+
+.L_16_blocks_overflow_CifFuwhmDnsajva:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_CifFuwhmDnsajva:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %ymm29,%ymm21,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_eAqADtqcmpkizGe
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_eAqADtqcmpkizGe
+.L_small_initial_partial_block_eAqADtqcmpkizGe:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_eAqADtqcmpkizGe:
+
+ orq %r8,%r8
+ je .L_after_reduction_eAqADtqcmpkizGe
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_eAqADtqcmpkizGe:
+ jmp .L_last_blocks_done_ClvEnqtsgcyzxra
+.L_last_num_blocks_is_15_ClvEnqtsgcyzxra:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_oiyvxmCxqthGqom
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_oiyvxmCxqthGqom
+
+.L_16_blocks_overflow_oiyvxmCxqthGqom:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_oiyvxmCxqthGqom:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ugFbqvmchjEBBBz
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ugFbqvmchjEBBBz
+.L_small_initial_partial_block_ugFbqvmchjEBBBz:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ugFbqvmchjEBBBz:
+
+ orq %r8,%r8
+ je .L_after_reduction_ugFbqvmchjEBBBz
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ugFbqvmchjEBBBz:
+ jmp .L_last_blocks_done_ClvEnqtsgcyzxra
+.L_last_num_blocks_is_16_ClvEnqtsgcyzxra:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_wCdnfleczoFcEzf
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_wCdnfleczoFcEzf
+
+.L_16_blocks_overflow_wCdnfleczoFcEzf:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_wCdnfleczoFcEzf:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm14,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_qkhBhqDFAyxsceq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_qkhBhqDFAyxsceq:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_qkhBhqDFAyxsceq:
+ jmp .L_last_blocks_done_ClvEnqtsgcyzxra
+.L_last_num_blocks_is_0_ClvEnqtsgcyzxra:
+ vmovdqa64 1024(%rsp),%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1088(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1152(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1216(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_ClvEnqtsgcyzxra:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_kgypzeldFqsBnqw
+.L_encrypt_32_blocks_kgypzeldFqsBnqw:
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_vGiehzfobkckAyi
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_vGiehzfobkckAyi
+.L_16_blocks_overflow_vGiehzfobkckAyi:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_vGiehzfobkckAyi:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm15,%zmm10,%zmm26
+ vpxorq %zmm12,%zmm6,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1)
+ vpshufb %zmm29,%zmm17,%zmm0
+ vpshufb %zmm29,%zmm19,%zmm3
+ vpshufb %zmm29,%zmm20,%zmm4
+ vpshufb %zmm29,%zmm21,%zmm5
+ vmovdqa64 %zmm0,1280(%rsp)
+ vmovdqa64 %zmm3,1344(%rsp)
+ vmovdqa64 %zmm4,1408(%rsp)
+ vmovdqa64 %zmm5,1472(%rsp)
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_aBfhhtmiojjovim
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_aBfhhtmiojjovim
+.L_16_blocks_overflow_aBfhhtmiojjovim:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_aBfhhtmiojjovim:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1024(%rsp),%zmm8
+ vmovdqu64 256(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 320(%rsp),%zmm18
+ vmovdqa64 1088(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 384(%rsp),%zmm1
+ vmovdqa64 1152(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 448(%rsp),%zmm18
+ vmovdqa64 1216(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 256(%rcx,%r11,1),%zmm17
+ vmovdqu8 320(%rcx,%r11,1),%zmm19
+ vmovdqu8 384(%rcx,%r11,1),%zmm20
+ vmovdqu8 448(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm15,%zmm10,%zmm26
+ vpternlogq $0x96,%zmm12,%zmm6,%zmm24
+ vpternlogq $0x96,%zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,256(%r10,%r11,1)
+ vmovdqu8 %zmm3,320(%r10,%r11,1)
+ vmovdqu8 %zmm4,384(%r10,%r11,1)
+ vmovdqu8 %zmm5,448(%r10,%r11,1)
+ vpshufb %zmm29,%zmm17,%zmm0
+ vpshufb %zmm29,%zmm19,%zmm3
+ vpshufb %zmm29,%zmm20,%zmm4
+ vpshufb %zmm29,%zmm21,%zmm5
+ vmovdqa64 %zmm0,768(%rsp)
+ vmovdqa64 %zmm3,832(%rsp)
+ vmovdqa64 %zmm4,896(%rsp)
+ vmovdqa64 %zmm5,960(%rsp)
+ vmovdqa64 1280(%rsp),%zmm13
+ vmovdqu64 512(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1344(%rsp),%zmm13
+ vmovdqu64 576(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1408(%rsp),%zmm13
+ vmovdqu64 640(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1472(%rsp),%zmm13
+ vmovdqu64 704(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+ subq $512,%r8
+ addq $512,%r11
+ movl %r8d,%r10d
+ andl $~15,%r10d
+ movl $512,%ebx
+ subl %r10d,%ebx
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_AwFklinDrcbFgzn
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_AwFklinDrcbFgzn
+ jb .L_last_num_blocks_is_7_1_AwFklinDrcbFgzn
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_AwFklinDrcbFgzn
+ jb .L_last_num_blocks_is_11_9_AwFklinDrcbFgzn
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_AwFklinDrcbFgzn
+ ja .L_last_num_blocks_is_16_AwFklinDrcbFgzn
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_AwFklinDrcbFgzn
+ jmp .L_last_num_blocks_is_13_AwFklinDrcbFgzn
+
+.L_last_num_blocks_is_11_9_AwFklinDrcbFgzn:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_AwFklinDrcbFgzn
+ ja .L_last_num_blocks_is_11_AwFklinDrcbFgzn
+ jmp .L_last_num_blocks_is_9_AwFklinDrcbFgzn
+
+.L_last_num_blocks_is_7_1_AwFklinDrcbFgzn:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_AwFklinDrcbFgzn
+ jb .L_last_num_blocks_is_3_1_AwFklinDrcbFgzn
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_AwFklinDrcbFgzn
+ je .L_last_num_blocks_is_6_AwFklinDrcbFgzn
+ jmp .L_last_num_blocks_is_5_AwFklinDrcbFgzn
+
+.L_last_num_blocks_is_3_1_AwFklinDrcbFgzn:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_AwFklinDrcbFgzn
+ je .L_last_num_blocks_is_2_AwFklinDrcbFgzn
+.L_last_num_blocks_is_1_AwFklinDrcbFgzn:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_FvFeevCgruEuomy
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_FvFeevCgruEuomy
+
+.L_16_blocks_overflow_FvFeevCgruEuomy:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_FvFeevCgruEuomy:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %xmm29,%xmm17,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_vocdDxlyexcAqgk
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_vocdDxlyexcAqgk
+.L_small_initial_partial_block_vocdDxlyexcAqgk:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm0
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
+ vpslldq $8,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm3
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
+ vpsrldq $4,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm4,%xmm14
+
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_vocdDxlyexcAqgk
+.L_small_initial_compute_done_vocdDxlyexcAqgk:
+.L_after_reduction_vocdDxlyexcAqgk:
+ jmp .L_last_blocks_done_AwFklinDrcbFgzn
+.L_last_num_blocks_is_2_AwFklinDrcbFgzn:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_rufCyEuzhyCcBum
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_rufCyEuzhyCcBum
+
+.L_16_blocks_overflow_rufCyEuzhyCcBum:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_rufCyEuzhyCcBum:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %ymm29,%ymm17,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_hFhwFAnywtirqFm
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_hFhwFAnywtirqFm
+.L_small_initial_partial_block_hFhwFAnywtirqFm:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_hFhwFAnywtirqFm:
+
+ orq %r8,%r8
+ je .L_after_reduction_hFhwFAnywtirqFm
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_hFhwFAnywtirqFm:
+ jmp .L_last_blocks_done_AwFklinDrcbFgzn
+.L_last_num_blocks_is_3_AwFklinDrcbFgzn:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_oiFAsBBekBeEcll
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_oiFAsBBekBeEcll
+
+.L_16_blocks_overflow_oiFAsBBekBeEcll:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_oiFAsBBekBeEcll:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_DakDxmbzhjsFccp
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_DakDxmbzhjsFccp
+.L_small_initial_partial_block_DakDxmbzhjsFccp:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_DakDxmbzhjsFccp:
+
+ orq %r8,%r8
+ je .L_after_reduction_DakDxmbzhjsFccp
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_DakDxmbzhjsFccp:
+ jmp .L_last_blocks_done_AwFklinDrcbFgzn
+.L_last_num_blocks_is_4_AwFklinDrcbFgzn:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_EeBjyjCzBemkiyn
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_EeBjyjCzBemkiyn
+
+.L_16_blocks_overflow_EeBjyjCzBemkiyn:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_EeBjyjCzBemkiyn:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_pkDoGcykctqxwtv
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_pkDoGcykctqxwtv
+.L_small_initial_partial_block_pkDoGcykctqxwtv:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_pkDoGcykctqxwtv:
+
+ orq %r8,%r8
+ je .L_after_reduction_pkDoGcykctqxwtv
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_pkDoGcykctqxwtv:
+ jmp .L_last_blocks_done_AwFklinDrcbFgzn
+.L_last_num_blocks_is_5_AwFklinDrcbFgzn:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_ygonEcumvGgxonp
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_ygonEcumvGgxonp
+
+.L_16_blocks_overflow_ygonEcumvGgxonp:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_ygonEcumvGgxonp:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %xmm29,%xmm19,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_FBDnovehzAhxoFz
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_FBDnovehzAhxoFz
+.L_small_initial_partial_block_FBDnovehzAhxoFz:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_FBDnovehzAhxoFz:
+
+ orq %r8,%r8
+ je .L_after_reduction_FBDnovehzAhxoFz
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_FBDnovehzAhxoFz:
+ jmp .L_last_blocks_done_AwFklinDrcbFgzn
+.L_last_num_blocks_is_6_AwFklinDrcbFgzn:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_zAwamddcsGuDbsw
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_zAwamddcsGuDbsw
+
+.L_16_blocks_overflow_zAwamddcsGuDbsw:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_zAwamddcsGuDbsw:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %ymm29,%ymm19,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_nBiEFoifDnlnCnA
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_nBiEFoifDnlnCnA
+.L_small_initial_partial_block_nBiEFoifDnlnCnA:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_nBiEFoifDnlnCnA:
+
+ orq %r8,%r8
+ je .L_after_reduction_nBiEFoifDnlnCnA
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_nBiEFoifDnlnCnA:
+ jmp .L_last_blocks_done_AwFklinDrcbFgzn
+.L_last_num_blocks_is_7_AwFklinDrcbFgzn:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_pwBmqBGFfnBFiBx
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_pwBmqBGFfnBFiBx
+
+.L_16_blocks_overflow_pwBmqBGFfnBFiBx:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_pwBmqBGFfnBFiBx:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_wChogqeEderiszq
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_wChogqeEderiszq
+.L_small_initial_partial_block_wChogqeEderiszq:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_wChogqeEderiszq:
+
+ orq %r8,%r8
+ je .L_after_reduction_wChogqeEderiszq
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_wChogqeEderiszq:
+ jmp .L_last_blocks_done_AwFklinDrcbFgzn
+.L_last_num_blocks_is_8_AwFklinDrcbFgzn:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_xgcteGoksvqdvwC
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_xgcteGoksvqdvwC
+
+.L_16_blocks_overflow_xgcteGoksvqdvwC:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_xgcteGoksvqdvwC:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_bwfvAfrqwqvnlGG
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_bwfvAfrqwqvnlGG
+.L_small_initial_partial_block_bwfvAfrqwqvnlGG:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_bwfvAfrqwqvnlGG:
+
+ orq %r8,%r8
+ je .L_after_reduction_bwfvAfrqwqvnlGG
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_bwfvAfrqwqvnlGG:
+ jmp .L_last_blocks_done_AwFklinDrcbFgzn
+.L_last_num_blocks_is_9_AwFklinDrcbFgzn:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_nGFogvFjmdjnsvt
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_nGFogvFjmdjnsvt
+
+.L_16_blocks_overflow_nGFogvFjmdjnsvt:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_nGFogvFjmdjnsvt:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %xmm29,%xmm20,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_pkinwzuhxhaEgCa
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_pkinwzuhxhaEgCa
+.L_small_initial_partial_block_pkinwzuhxhaEgCa:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_pkinwzuhxhaEgCa:
+
+ orq %r8,%r8
+ je .L_after_reduction_pkinwzuhxhaEgCa
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_pkinwzuhxhaEgCa:
+ jmp .L_last_blocks_done_AwFklinDrcbFgzn
+.L_last_num_blocks_is_10_AwFklinDrcbFgzn:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_ryszgunyrqgvyfB
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_ryszgunyrqgvyfB
+
+.L_16_blocks_overflow_ryszgunyrqgvyfB:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_ryszgunyrqgvyfB:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %ymm29,%ymm20,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_jypDCauhjquEuyb
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_jypDCauhjquEuyb
+.L_small_initial_partial_block_jypDCauhjquEuyb:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_jypDCauhjquEuyb:
+
+ orq %r8,%r8
+ je .L_after_reduction_jypDCauhjquEuyb
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_jypDCauhjquEuyb:
+ jmp .L_last_blocks_done_AwFklinDrcbFgzn
+.L_last_num_blocks_is_11_AwFklinDrcbFgzn:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_DvudExkamyfuGdv
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_DvudExkamyfuGdv
+
+.L_16_blocks_overflow_DvudExkamyfuGdv:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_DvudExkamyfuGdv:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_dlfpdlkfExhwjDu
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_dlfpdlkfExhwjDu
+.L_small_initial_partial_block_dlfpdlkfExhwjDu:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_dlfpdlkfExhwjDu:
+
+ orq %r8,%r8
+ je .L_after_reduction_dlfpdlkfExhwjDu
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_dlfpdlkfExhwjDu:
+ jmp .L_last_blocks_done_AwFklinDrcbFgzn
+.L_last_num_blocks_is_12_AwFklinDrcbFgzn:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_pycvwiovDfFylBw
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_pycvwiovDfFylBw
+
+.L_16_blocks_overflow_pycvwiovDfFylBw:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_pycvwiovDfFylBw:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_DazlrGdgfFiEaoe
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_DazlrGdgfFiEaoe
+.L_small_initial_partial_block_DazlrGdgfFiEaoe:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_DazlrGdgfFiEaoe:
+
+ orq %r8,%r8
+ je .L_after_reduction_DazlrGdgfFiEaoe
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_DazlrGdgfFiEaoe:
+ jmp .L_last_blocks_done_AwFklinDrcbFgzn
+.L_last_num_blocks_is_13_AwFklinDrcbFgzn:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_sFwEGaAnGxDowcc
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_sFwEGaAnGxDowcc
+
+.L_16_blocks_overflow_sFwEGaAnGxDowcc:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_sFwEGaAnGxDowcc:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %xmm29,%xmm21,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_tohyxsArdntzjGo
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_tohyxsArdntzjGo
+.L_small_initial_partial_block_tohyxsArdntzjGo:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_tohyxsArdntzjGo:
+
+ orq %r8,%r8
+ je .L_after_reduction_tohyxsArdntzjGo
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_tohyxsArdntzjGo:
+ jmp .L_last_blocks_done_AwFklinDrcbFgzn
+.L_last_num_blocks_is_14_AwFklinDrcbFgzn:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_fapGrcjmuhklgzo
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_fapGrcjmuhklgzo
+
+.L_16_blocks_overflow_fapGrcjmuhklgzo:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_fapGrcjmuhklgzo:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %ymm29,%ymm21,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_BeFutuwFnozaige
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_BeFutuwFnozaige
+.L_small_initial_partial_block_BeFutuwFnozaige:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_BeFutuwFnozaige:
+
+ orq %r8,%r8
+ je .L_after_reduction_BeFutuwFnozaige
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_BeFutuwFnozaige:
+ jmp .L_last_blocks_done_AwFklinDrcbFgzn
+.L_last_num_blocks_is_15_AwFklinDrcbFgzn:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_aByDeEDFBCjvqGx
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_aByDeEDFBCjvqGx
+
+.L_16_blocks_overflow_aByDeEDFBCjvqGx:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_aByDeEDFBCjvqGx:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_hAxtmivtdwAsvmz
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_hAxtmivtdwAsvmz
+.L_small_initial_partial_block_hAxtmivtdwAsvmz:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_hAxtmivtdwAsvmz:
+
+ orq %r8,%r8
+ je .L_after_reduction_hAxtmivtdwAsvmz
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_hAxtmivtdwAsvmz:
+ jmp .L_last_blocks_done_AwFklinDrcbFgzn
+.L_last_num_blocks_is_16_AwFklinDrcbFgzn:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_BwrcaiuzmxchdBE
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_BwrcaiuzmxchdBE
+
+.L_16_blocks_overflow_BwrcaiuzmxchdBE:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_BwrcaiuzmxchdBE:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_xniaaigktwmycDh:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_xniaaigktwmycDh:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_xniaaigktwmycDh:
+ jmp .L_last_blocks_done_AwFklinDrcbFgzn
+.L_last_num_blocks_is_0_AwFklinDrcbFgzn:
+ vmovdqa64 768(%rsp),%zmm13
+ vpxorq %zmm14,%zmm13,%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 832(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpxorq %zmm10,%zmm4,%zmm26
+ vpxorq %zmm6,%zmm0,%zmm24
+ vpxorq %zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 896(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 960(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_AwFklinDrcbFgzn:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_kgypzeldFqsBnqw
+.L_encrypt_16_blocks_kgypzeldFqsBnqw:
+ cmpb $240,%r15b
+ jae .L_16_blocks_overflow_itlreegehzzFvho
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_itlreegehzzFvho
+.L_16_blocks_overflow_itlreegehzzFvho:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_itlreegehzzFvho:
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp),%zmm1
+
+
+
+
+ vshufi64x2 $255,%zmm5,%zmm5,%zmm2
+ addb $16,%r15b
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+
+
+
+
+
+
+
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm6
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm6
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+
+
+
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21
+
+
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm15,%zmm10,%zmm26
+ vpxorq %zmm12,%zmm6,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+
+
+
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+
+
+
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1)
+ vpshufb %zmm29,%zmm17,%zmm0
+ vpshufb %zmm29,%zmm19,%zmm3
+ vpshufb %zmm29,%zmm20,%zmm4
+ vpshufb %zmm29,%zmm21,%zmm5
+ vmovdqa64 %zmm0,1280(%rsp)
+ vmovdqa64 %zmm3,1344(%rsp)
+ vmovdqa64 %zmm4,1408(%rsp)
+ vmovdqa64 %zmm5,1472(%rsp)
+ vmovdqa64 1024(%rsp),%zmm13
+ vmovdqu64 256(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1088(%rsp),%zmm13
+ vmovdqu64 320(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1152(%rsp),%zmm13
+ vmovdqu64 384(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1216(%rsp),%zmm13
+ vmovdqu64 448(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ subq $256,%r8
+ addq $256,%r11
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_xAfbdFbjfoyBlDz
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_xAfbdFbjfoyBlDz
+ jb .L_last_num_blocks_is_7_1_xAfbdFbjfoyBlDz
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_xAfbdFbjfoyBlDz
+ jb .L_last_num_blocks_is_11_9_xAfbdFbjfoyBlDz
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_xAfbdFbjfoyBlDz
+ ja .L_last_num_blocks_is_16_xAfbdFbjfoyBlDz
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_xAfbdFbjfoyBlDz
+ jmp .L_last_num_blocks_is_13_xAfbdFbjfoyBlDz
+
+.L_last_num_blocks_is_11_9_xAfbdFbjfoyBlDz:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_xAfbdFbjfoyBlDz
+ ja .L_last_num_blocks_is_11_xAfbdFbjfoyBlDz
+ jmp .L_last_num_blocks_is_9_xAfbdFbjfoyBlDz
+
+.L_last_num_blocks_is_7_1_xAfbdFbjfoyBlDz:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_xAfbdFbjfoyBlDz
+ jb .L_last_num_blocks_is_3_1_xAfbdFbjfoyBlDz
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_xAfbdFbjfoyBlDz
+ je .L_last_num_blocks_is_6_xAfbdFbjfoyBlDz
+ jmp .L_last_num_blocks_is_5_xAfbdFbjfoyBlDz
+
+.L_last_num_blocks_is_3_1_xAfbdFbjfoyBlDz:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_xAfbdFbjfoyBlDz
+ je .L_last_num_blocks_is_2_xAfbdFbjfoyBlDz
+.L_last_num_blocks_is_1_xAfbdFbjfoyBlDz:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_lapolqbccExufla
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_lapolqbccExufla
+
+.L_16_blocks_overflow_lapolqbccExufla:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_lapolqbccExufla:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %xmm31,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %xmm29,%xmm17,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_aksayyCEvBwkqCs
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_aksayyCEvBwkqCs
+.L_small_initial_partial_block_aksayyCEvBwkqCs:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_aksayyCEvBwkqCs
+.L_small_initial_compute_done_aksayyCEvBwkqCs:
+.L_after_reduction_aksayyCEvBwkqCs:
+ jmp .L_last_blocks_done_xAfbdFbjfoyBlDz
+.L_last_num_blocks_is_2_xAfbdFbjfoyBlDz:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_EnCCsEpwCxDywbA
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_EnCCsEpwCxDywbA
+
+.L_16_blocks_overflow_EnCCsEpwCxDywbA:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_EnCCsEpwCxDywbA:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %ymm31,%ymm0,%ymm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %ymm29,%ymm17,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_enwlcwbgseiBryB
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_enwlcwbgseiBryB
+.L_small_initial_partial_block_enwlcwbgseiBryB:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_enwlcwbgseiBryB:
+
+ orq %r8,%r8
+ je .L_after_reduction_enwlcwbgseiBryB
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_enwlcwbgseiBryB:
+ jmp .L_last_blocks_done_xAfbdFbjfoyBlDz
+.L_last_num_blocks_is_3_xAfbdFbjfoyBlDz:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_bEsbraEgeohwpzz
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_bEsbraEgeohwpzz
+
+.L_16_blocks_overflow_bEsbraEgeohwpzz:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_bEsbraEgeohwpzz:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_jrkEfawFjAdFFAw
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_jrkEfawFjAdFFAw
+.L_small_initial_partial_block_jrkEfawFjAdFFAw:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_jrkEfawFjAdFFAw:
+
+ orq %r8,%r8
+ je .L_after_reduction_jrkEfawFjAdFFAw
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_jrkEfawFjAdFFAw:
+ jmp .L_last_blocks_done_xAfbdFbjfoyBlDz
+.L_last_num_blocks_is_4_xAfbdFbjfoyBlDz:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_jxvxvtaszlAuveu
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_jxvxvtaszlAuveu
+
+.L_16_blocks_overflow_jxvxvtaszlAuveu:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_jxvxvtaszlAuveu:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_BoECtwduirkpGbd
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_BoECtwduirkpGbd
+.L_small_initial_partial_block_BoECtwduirkpGbd:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_BoECtwduirkpGbd:
+
+ orq %r8,%r8
+ je .L_after_reduction_BoECtwduirkpGbd
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_BoECtwduirkpGbd:
+ jmp .L_last_blocks_done_xAfbdFbjfoyBlDz
+.L_last_num_blocks_is_5_xAfbdFbjfoyBlDz:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_AemnsnzilvGaDvl
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_AemnsnzilvGaDvl
+
+.L_16_blocks_overflow_AemnsnzilvGaDvl:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_AemnsnzilvGaDvl:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %xmm29,%xmm19,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_AChbnzckEtGqvia
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_AChbnzckEtGqvia
+.L_small_initial_partial_block_AChbnzckEtGqvia:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_AChbnzckEtGqvia:
+
+ orq %r8,%r8
+ je .L_after_reduction_AChbnzckEtGqvia
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_AChbnzckEtGqvia:
+ jmp .L_last_blocks_done_xAfbdFbjfoyBlDz
+.L_last_num_blocks_is_6_xAfbdFbjfoyBlDz:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_pGnpmuquowsenAC
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_pGnpmuquowsenAC
+
+.L_16_blocks_overflow_pGnpmuquowsenAC:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_pGnpmuquowsenAC:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %ymm29,%ymm19,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_kcatvpdGCtefzAw
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_kcatvpdGCtefzAw
+.L_small_initial_partial_block_kcatvpdGCtefzAw:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_kcatvpdGCtefzAw:
+
+ orq %r8,%r8
+ je .L_after_reduction_kcatvpdGCtefzAw
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_kcatvpdGCtefzAw:
+ jmp .L_last_blocks_done_xAfbdFbjfoyBlDz
+.L_last_num_blocks_is_7_xAfbdFbjfoyBlDz:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_vBcFztzloamdDFg
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_vBcFztzloamdDFg
+
+.L_16_blocks_overflow_vBcFztzloamdDFg:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_vBcFztzloamdDFg:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_yfFcsqkvhbddwyy
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_yfFcsqkvhbddwyy
+.L_small_initial_partial_block_yfFcsqkvhbddwyy:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_yfFcsqkvhbddwyy:
+
+ orq %r8,%r8
+ je .L_after_reduction_yfFcsqkvhbddwyy
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_yfFcsqkvhbddwyy:
+ jmp .L_last_blocks_done_xAfbdFbjfoyBlDz
+.L_last_num_blocks_is_8_xAfbdFbjfoyBlDz:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_FdAnkzzirEtjwrb
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_FdAnkzzirEtjwrb
+
+.L_16_blocks_overflow_FdAnkzzirEtjwrb:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_FdAnkzzirEtjwrb:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_wvyqkgDlqezddls
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_wvyqkgDlqezddls
+.L_small_initial_partial_block_wvyqkgDlqezddls:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_wvyqkgDlqezddls:
+
+ orq %r8,%r8
+ je .L_after_reduction_wvyqkgDlqezddls
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_wvyqkgDlqezddls:
+ jmp .L_last_blocks_done_xAfbdFbjfoyBlDz
+.L_last_num_blocks_is_9_xAfbdFbjfoyBlDz:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_lhtDngmdlssnvDG
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_lhtDngmdlssnvDG
+
+.L_16_blocks_overflow_lhtDngmdlssnvDG:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_lhtDngmdlssnvDG:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %xmm29,%xmm20,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ptjDGBmufbAkAGG
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ptjDGBmufbAkAGG
+.L_small_initial_partial_block_ptjDGBmufbAkAGG:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ptjDGBmufbAkAGG:
+
+ orq %r8,%r8
+ je .L_after_reduction_ptjDGBmufbAkAGG
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ptjDGBmufbAkAGG:
+ jmp .L_last_blocks_done_xAfbdFbjfoyBlDz
+.L_last_num_blocks_is_10_xAfbdFbjfoyBlDz:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_wsaFiGmrqxypimt
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_wsaFiGmrqxypimt
+
+.L_16_blocks_overflow_wsaFiGmrqxypimt:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_wsaFiGmrqxypimt:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %ymm29,%ymm20,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_gnctxlhtglgbgvx
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_gnctxlhtglgbgvx
+.L_small_initial_partial_block_gnctxlhtglgbgvx:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_gnctxlhtglgbgvx:
+
+ orq %r8,%r8
+ je .L_after_reduction_gnctxlhtglgbgvx
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_gnctxlhtglgbgvx:
+ jmp .L_last_blocks_done_xAfbdFbjfoyBlDz
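+/*
+ * The cmpl $(256 - N),%r15d checks appear to guard against the low counter
+ * byte wrapping within this batch: the fast path increments the counter
+ * blocks in place with %zmm28/%zmm27, which is only safe while no byte-level
+ * carry occurs, while the overflow path byte-swaps them with %zmm29, adds
+ * ddq_add_1234/ddq_add_4444 so the carry propagates, and swaps the result
+ * back.
+ */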
+.L_last_num_blocks_is_11_xAfbdFbjfoyBlDz:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_neydhuxthowjDfe
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_neydhuxthowjDfe
+
+.L_16_blocks_overflow_neydhuxthowjDfe:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_neydhuxthowjDfe:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_btfsxwwBfubFEhw
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_btfsxwwBfubFEhw
+.L_small_initial_partial_block_btfsxwwBfubFEhw:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_btfsxwwBfubFEhw:
+
+ orq %r8,%r8
+ je .L_after_reduction_btfsxwwBfubFEhw
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_btfsxwwBfubFEhw:
+ jmp .L_last_blocks_done_xAfbdFbjfoyBlDz
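+/*
+ * Each tail has two finishes: when the last block is complete, a zero
+ * remainder is stored at (%rdx) and all N blocks are hashed; the
+ * .L_small_initial_partial_block_* path instead records the leftover byte
+ * count at (%rdx), stashes the last output block at 16(%rsi) for later
+ * completion, and hashes one block fewer (the hash-key-power loads from
+ * (%rsi) start 16 bytes further in).
+ */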
+.L_last_num_blocks_is_12_xAfbdFbjfoyBlDz:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_vmmvFmFAAqpDrjc
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_vmmvFmFAAqpDrjc
+
+.L_16_blocks_overflow_vmmvFmFAAqpDrjc:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_vmmvFmFAAqpDrjc:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_muxxrlxFvpCuucj
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_muxxrlxFvpCuucj
+.L_small_initial_partial_block_muxxrlxFvpCuucj:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_muxxrlxFvpCuucj:
+
+ orq %r8,%r8
+ je .L_after_reduction_muxxrlxFvpCuucj
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_muxxrlxFvpCuucj:
+ jmp .L_last_blocks_done_xAfbdFbjfoyBlDz
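+/*
+ * From 13 blocks up, a fourth counter register comes into play (%xmm5/%ymm5/
+ * %zmm5 for 13/14/15-16 blocks) and the byte64_len_to_mask_table index is
+ * taken from len - 192 rather than len - 128, since three full 64-byte stores
+ * now precede the masked one.
+ */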
+.L_last_num_blocks_is_13_xAfbdFbjfoyBlDz:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_BtCEtGboibyzmkz
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_BtCEtGboibyzmkz
+
+.L_16_blocks_overflow_BtCEtGboibyzmkz:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_BtCEtGboibyzmkz:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %xmm29,%xmm21,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_niubrurEemqlCeh
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_niubrurEemqlCeh
+.L_small_initial_partial_block_niubrurEemqlCeh:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_niubrurEemqlCeh:
+
+ orq %r8,%r8
+ je .L_after_reduction_niubrurEemqlCeh
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_niubrurEemqlCeh:
+ jmp .L_last_blocks_done_xAfbdFbjfoyBlDz
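+/*
+ * The round keys are fetched with vbroadcastf64x2 from (%rdi) and ping-pong
+ * between %zmm30 and %zmm31 so the next key load overlaps the current vaesenc
+ * round; the vaesenclast key at offset 224 (the 15th key) suggests this is
+ * the 14-round, AES-256 instance of the tail code.
+ */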
+.L_last_num_blocks_is_14_xAfbdFbjfoyBlDz:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_mybAsEhdaxgnGrE
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_mybAsEhdaxgnGrE
+
+.L_16_blocks_overflow_mybAsEhdaxgnGrE:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_mybAsEhdaxgnGrE:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %ymm29,%ymm21,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_qtDEunzdagagyyt
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_qtDEunzdagagyyt
+.L_small_initial_partial_block_qtDEunzdagagyyt:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_qtDEunzdagagyyt:
+
+ orq %r8,%r8
+ je .L_after_reduction_qtDEunzdagagyyt
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_qtDEunzdagagyyt:
+ jmp .L_last_blocks_done_xAfbdFbjfoyBlDz
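+/*
+ * GHASH reduction pattern used throughout: the high/low product halves are
+ * folded 512 -> 256 -> 128 bits with vextracti64x4/vextracti32x4 plus XOR,
+ * then reduced modulo the GHASH polynomial using carry-less multiplies by the
+ * POLY2 constant; vpternlogq $0x96 is a three-way XOR that merges the partial
+ * results.
+ */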
+.L_last_num_blocks_is_15_xAfbdFbjfoyBlDz:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_Bofftlllstcnhmp
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_Bofftlllstcnhmp
+
+.L_16_blocks_overflow_Bofftlllstcnhmp:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_Bofftlllstcnhmp:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ahcvvxeChlezaBm
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ahcvvxeChlezaBm
+.L_small_initial_partial_block_ahcvvxeChlezaBm:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ahcvvxeChlezaBm:
+
+ orq %r8,%r8
+ je .L_after_reduction_ahcvvxeChlezaBm
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_ahcvvxeChlezaBm:
+ jmp .L_last_blocks_done_xAfbdFbjfoyBlDz
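+/*
+ * The 16-block tail below has no separate "last block is complete" shortcut:
+ * after subtracting 16 * 15 from %r8 it always runs the partial-block
+ * bookkeeping, and the final XOR of %xmm7 into %xmm14 is unconditional.
+ */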
+.L_last_num_blocks_is_16_xAfbdFbjfoyBlDz:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_nowrnsGGyachzjc
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_nowrnsGGyachzjc
+
+.L_16_blocks_overflow_nowrnsGGyachzjc:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_nowrnsGGyachzjc:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vmovdqa64 1280(%rsp),%zmm8
+ vmovdqu64 512(%rsp),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 576(%rsp),%zmm18
+ vmovdqa64 1344(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 640(%rsp),%zmm1
+ vmovdqa64 1408(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 704(%rsp),%zmm18
+ vmovdqa64 1472(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpternlogq $0x96,%zmm12,%zmm24,%zmm14
+ vpternlogq $0x96,%zmm13,%zmm25,%zmm7
+ vpternlogq $0x96,%zmm15,%zmm26,%zmm10
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vpsrldq $8,%zmm10,%zmm15
+ vpslldq $8,%zmm10,%zmm10
+
+ vmovdqa64 POLY2(%rip),%xmm16
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vpxorq %zmm15,%zmm14,%zmm14
+ vpxorq %zmm10,%zmm7,%zmm7
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vextracti64x4 $1,%zmm14,%ymm12
+ vpxorq %ymm12,%ymm14,%ymm14
+ vextracti32x4 $1,%ymm14,%xmm12
+ vpxorq %xmm12,%xmm14,%xmm14
+ vextracti64x4 $1,%zmm7,%ymm13
+ vpxorq %ymm13,%ymm7,%ymm7
+ vextracti32x4 $1,%ymm7,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm7
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vpclmulqdq $0x01,%xmm7,%xmm16,%xmm13
+ vpslldq $8,%xmm13,%xmm13
+ vpxorq %xmm13,%xmm7,%xmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vpclmulqdq $0x00,%xmm13,%xmm16,%xmm12
+ vpsrldq $4,%xmm12,%xmm12
+ vpclmulqdq $0x10,%xmm13,%xmm16,%xmm15
+ vpslldq $4,%xmm15,%xmm15
+
+ vpternlogq $0x96,%xmm12,%xmm15,%xmm14
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_AoBCchcjotapvgu:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vpxorq %zmm14,%zmm17,%zmm17
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm31,%zmm5,%zmm5
+ vpxorq %zmm8,%zmm0,%zmm0
+ vpxorq %zmm22,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_AoBCchcjotapvgu:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_AoBCchcjotapvgu:
+ jmp .L_last_blocks_done_xAfbdFbjfoyBlDz
+.L_last_num_blocks_is_0_xAfbdFbjfoyBlDz:
+ vmovdqa64 1280(%rsp),%zmm13
+ vmovdqu64 512(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1344(%rsp),%zmm13
+ vmovdqu64 576(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 1408(%rsp),%zmm13
+ vmovdqu64 640(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 1472(%rsp),%zmm13
+ vmovdqu64 704(%rsp),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_xAfbdFbjfoyBlDz:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_kgypzeldFqsBnqw
+
+.L_message_below_32_blocks_kgypzeldFqsBnqw:
+
+
+ subq $256,%r8
+ addq $256,%r11
+ movl %r8d,%r10d
+ testq %r14,%r14
+ jnz .L_skip_hkeys_precomputation_qckdlimbBeqylyq
+ vmovdqu64 640(%rsp),%zmm3
+
+
+ vshufi64x2 $0x00,%zmm3,%zmm3,%zmm3
+
+ vmovdqu64 576(%rsp),%zmm4
+ vmovdqu64 512(%rsp),%zmm5
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,448(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,384(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm4,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm4,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm4,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm4,%zmm4
+
+ vpsrldq $8,%zmm4,%zmm10
+ vpslldq $8,%zmm4,%zmm4
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm4,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm4,%zmm4
+
+
+
+ vpclmulqdq $0x00,%zmm4,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm4,%zmm10,%zmm4
+ vpslldq $4,%zmm4,%zmm4
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm4
+
+ vmovdqu64 %zmm4,320(%rsp)
+
+ vpclmulqdq $0x11,%zmm3,%zmm5,%zmm6
+ vpclmulqdq $0x00,%zmm3,%zmm5,%zmm7
+ vpclmulqdq $0x01,%zmm3,%zmm5,%zmm10
+ vpclmulqdq $0x10,%zmm3,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm5,%zmm5
+
+ vpsrldq $8,%zmm5,%zmm10
+ vpslldq $8,%zmm5,%zmm5
+ vpxorq %zmm10,%zmm6,%zmm6
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vmovdqu64 POLY2(%rip),%zmm10
+
+ vpclmulqdq $0x01,%zmm5,%zmm10,%zmm7
+ vpslldq $8,%zmm7,%zmm7
+ vpxorq %zmm7,%zmm5,%zmm5
+
+
+
+ vpclmulqdq $0x00,%zmm5,%zmm10,%zmm7
+ vpsrldq $4,%zmm7,%zmm7
+ vpclmulqdq $0x10,%zmm5,%zmm10,%zmm5
+ vpslldq $4,%zmm5,%zmm5
+
+ vpternlogq $0x96,%zmm7,%zmm6,%zmm5
+
+ vmovdqu64 %zmm5,256(%rsp)
+.L_skip_hkeys_precomputation_qckdlimbBeqylyq:
+ movq $1,%r14
+ andl $~15,%r10d
+ movl $512,%ebx
+ subl %r10d,%ebx
+ movl %r8d,%r10d
+ addl $15,%r10d
+ shrl $4,%r10d
+ je .L_last_num_blocks_is_0_qdswuDcxyhGmasp
+
+ cmpl $8,%r10d
+ je .L_last_num_blocks_is_8_qdswuDcxyhGmasp
+ jb .L_last_num_blocks_is_7_1_qdswuDcxyhGmasp
+
+
+ cmpl $12,%r10d
+ je .L_last_num_blocks_is_12_qdswuDcxyhGmasp
+ jb .L_last_num_blocks_is_11_9_qdswuDcxyhGmasp
+
+
+ cmpl $15,%r10d
+ je .L_last_num_blocks_is_15_qdswuDcxyhGmasp
+ ja .L_last_num_blocks_is_16_qdswuDcxyhGmasp
+ cmpl $14,%r10d
+ je .L_last_num_blocks_is_14_qdswuDcxyhGmasp
+ jmp .L_last_num_blocks_is_13_qdswuDcxyhGmasp
+
+.L_last_num_blocks_is_11_9_qdswuDcxyhGmasp:
+
+ cmpl $10,%r10d
+ je .L_last_num_blocks_is_10_qdswuDcxyhGmasp
+ ja .L_last_num_blocks_is_11_qdswuDcxyhGmasp
+ jmp .L_last_num_blocks_is_9_qdswuDcxyhGmasp
+
+.L_last_num_blocks_is_7_1_qdswuDcxyhGmasp:
+ cmpl $4,%r10d
+ je .L_last_num_blocks_is_4_qdswuDcxyhGmasp
+ jb .L_last_num_blocks_is_3_1_qdswuDcxyhGmasp
+
+ cmpl $6,%r10d
+ ja .L_last_num_blocks_is_7_qdswuDcxyhGmasp
+ je .L_last_num_blocks_is_6_qdswuDcxyhGmasp
+ jmp .L_last_num_blocks_is_5_qdswuDcxyhGmasp
+
+.L_last_num_blocks_is_3_1_qdswuDcxyhGmasp:
+
+ cmpl $2,%r10d
+ ja .L_last_num_blocks_is_3_qdswuDcxyhGmasp
+ je .L_last_num_blocks_is_2_qdswuDcxyhGmasp
+.L_last_num_blocks_is_1_qdswuDcxyhGmasp:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $255,%r15d
+ jae .L_16_blocks_overflow_AqvkjwfuBmvGzFo
+ vpaddd %xmm28,%xmm2,%xmm0
+ jmp .L_16_blocks_ok_AqvkjwfuBmvGzFo
+
+.L_16_blocks_overflow_AqvkjwfuBmvGzFo:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %xmm29,%xmm0,%xmm0
+.L_16_blocks_ok_AqvkjwfuBmvGzFo:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%xmm17{%k1}{z}
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %xmm30,%xmm0,%xmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %xmm31,%xmm0,%xmm0
+ vaesenclast %xmm30,%xmm0,%xmm0
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %xmm29,%xmm17,%xmm17
+ vextracti32x4 $0,%zmm17,%xmm7
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_zDugdiozxlCaAFc
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_zDugdiozxlCaAFc
+.L_small_initial_partial_block_zDugdiozxlCaAFc:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm0
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm0,%xmm3
+ vpslldq $8,%xmm3,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm3
+
+
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm4
+ vpsrldq $4,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm4,%xmm14
+
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm7,%xmm14,%xmm14
+
+ jmp .L_after_reduction_zDugdiozxlCaAFc
+.L_small_initial_compute_done_zDugdiozxlCaAFc:
+.L_after_reduction_zDugdiozxlCaAFc:
+ jmp .L_last_blocks_done_qdswuDcxyhGmasp
+.L_last_num_blocks_is_2_qdswuDcxyhGmasp:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $254,%r15d
+ jae .L_16_blocks_overflow_BFBqcyfExFAkGzj
+ vpaddd %ymm28,%ymm2,%ymm0
+ jmp .L_16_blocks_ok_BFBqcyfExFAkGzj
+
+.L_16_blocks_overflow_BFBqcyfExFAkGzj:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %ymm29,%ymm0,%ymm0
+.L_16_blocks_ok_BFBqcyfExFAkGzj:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%ymm17{%k1}{z}
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %ymm30,%ymm0,%ymm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %ymm31,%ymm0,%ymm0
+ vaesenclast %ymm30,%ymm0,%ymm0
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %ymm29,%ymm17,%ymm17
+ vextracti32x4 $1,%zmm17,%xmm7
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_bgisyxAEeEpkobG
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_bgisyxAEeEpkobG
+.L_small_initial_partial_block_bgisyxAEeEpkobG:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm17,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm17,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm17,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm17,%xmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_bgisyxAEeEpkobG:
+
+ orq %r8,%r8
+ je .L_after_reduction_bgisyxAEeEpkobG
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_bgisyxAEeEpkobG:
+ jmp .L_last_blocks_done_qdswuDcxyhGmasp
+.L_last_num_blocks_is_3_qdswuDcxyhGmasp:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $253,%r15d
+ jae .L_16_blocks_overflow_yizvcDtiefGCDev
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_yizvcDtiefGCDev
+
+.L_16_blocks_overflow_yizvcDtiefGCDev:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_yizvcDtiefGCDev:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $2,%zmm17,%xmm7
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_fegyzcDscsgdCgo
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_fegyzcDscsgdCgo
+.L_small_initial_partial_block_fegyzcDscsgdCgo:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm17,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm17,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm17,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm17,%ymm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_fegyzcDscsgdCgo:
+
+ orq %r8,%r8
+ je .L_after_reduction_fegyzcDscsgdCgo
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_fegyzcDscsgdCgo:
+ jmp .L_last_blocks_done_qdswuDcxyhGmasp
+.L_last_num_blocks_is_4_qdswuDcxyhGmasp:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $252,%r15d
+ jae .L_16_blocks_overflow_DGjzymFiusiuxvc
+ vpaddd %zmm28,%zmm2,%zmm0
+ jmp .L_16_blocks_ok_DGjzymFiusiuxvc
+
+.L_16_blocks_overflow_DGjzymFiusiuxvc:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpshufb %zmm29,%zmm0,%zmm0
+.L_16_blocks_ok_DGjzymFiusiuxvc:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vpxorq %zmm17,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm17,%zmm17{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vextracti32x4 $3,%zmm17,%xmm7
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_DyGAAdrBpclAjrf
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_DyGAAdrBpclAjrf
+.L_small_initial_partial_block_DyGAAdrBpclAjrf:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpxorq %zmm26,%zmm4,%zmm4
+ vpxorq %zmm24,%zmm0,%zmm0
+ vpxorq %zmm25,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_DyGAAdrBpclAjrf:
+
+ orq %r8,%r8
+ je .L_after_reduction_DyGAAdrBpclAjrf
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_DyGAAdrBpclAjrf:
+ jmp .L_last_blocks_done_qdswuDcxyhGmasp
+.L_last_num_blocks_is_5_qdswuDcxyhGmasp:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $251,%r15d
+ jae .L_16_blocks_overflow_qmnbjAabAnlrekx
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %xmm27,%xmm0,%xmm3
+ jmp .L_16_blocks_ok_qmnbjAabAnlrekx
+
+.L_16_blocks_overflow_qmnbjAabAnlrekx:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+.L_16_blocks_ok_qmnbjAabAnlrekx:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%xmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %xmm30,%xmm3,%xmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %xmm31,%xmm3,%xmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %xmm30,%xmm3,%xmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %xmm19,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %xmm29,%xmm19,%xmm19
+ vextracti32x4 $0,%zmm19,%xmm7
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_qdgqavzegrGAAjz
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_qdgqavzegrGAAjz
+.L_small_initial_partial_block_qdgqavzegrGAAjz:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_qdgqavzegrGAAjz:
+
+ orq %r8,%r8
+ je .L_after_reduction_qdgqavzegrGAAjz
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_qdgqavzegrGAAjz:
+ jmp .L_last_blocks_done_qdswuDcxyhGmasp
+.L_last_num_blocks_is_6_qdswuDcxyhGmasp:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $250,%r15d
+ jae .L_16_blocks_overflow_AkAddilhnCabyyf
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %ymm27,%ymm0,%ymm3
+ jmp .L_16_blocks_ok_AkAddilhnCabyyf
+
+.L_16_blocks_overflow_AkAddilhnCabyyf:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+.L_16_blocks_ok_AkAddilhnCabyyf:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%ymm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %ymm30,%ymm3,%ymm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %ymm31,%ymm3,%ymm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %ymm30,%ymm3,%ymm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %ymm29,%ymm19,%ymm19
+ vextracti32x4 $1,%zmm19,%xmm7
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_iibprCbqDlikAnd
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_iibprCbqDlikAnd
+.L_small_initial_partial_block_iibprCbqDlikAnd:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm19,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm19,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm19,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm19,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_iibprCbqDlikAnd:
+
+ orq %r8,%r8
+ je .L_after_reduction_iibprCbqDlikAnd
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_iibprCbqDlikAnd:
+ jmp .L_last_blocks_done_qdswuDcxyhGmasp
+.L_last_num_blocks_is_7_qdswuDcxyhGmasp:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $249,%r15d
+ jae .L_16_blocks_overflow_lxvhGbsbefzGdxF
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_lxvhGbsbefzGdxF
+
+.L_16_blocks_overflow_lxvhGbsbefzGdxF:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_lxvhGbsbefzGdxF:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $2,%zmm19,%xmm7
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_GthoECEdfcnGsvc
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_GthoECEdfcnGsvc
+.L_small_initial_partial_block_GthoECEdfcnGsvc:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm19,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm19,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm19,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm19,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_GthoECEdfcnGsvc:
+
+ orq %r8,%r8
+ je .L_after_reduction_GthoECEdfcnGsvc
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_GthoECEdfcnGsvc:
+ jmp .L_last_blocks_done_qdswuDcxyhGmasp
+.L_last_num_blocks_is_8_qdswuDcxyhGmasp:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $64,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $248,%r15d
+ jae .L_16_blocks_overflow_qwiyktwmAFnlrAv
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ jmp .L_16_blocks_ok_qwiyktwmAFnlrAv
+
+.L_16_blocks_overflow_qwiyktwmAFnlrAv:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+.L_16_blocks_ok_qwiyktwmAFnlrAv:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm19,%zmm19{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vextracti32x4 $3,%zmm19,%xmm7
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_hBGcauuiubbhsmg
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_hBGcauuiubbhsmg
+.L_small_initial_partial_block_hBGcauuiubbhsmg:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_hBGcauuiubbhsmg:
+
+ orq %r8,%r8
+ je .L_after_reduction_hBGcauuiubbhsmg
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_hBGcauuiubbhsmg:
+ jmp .L_last_blocks_done_qdswuDcxyhGmasp
+.L_last_num_blocks_is_9_qdswuDcxyhGmasp:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $247,%r15d
+ jae .L_16_blocks_overflow_Aahazrycncacmjd
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %xmm27,%xmm3,%xmm4
+ jmp .L_16_blocks_ok_Aahazrycncacmjd
+
+.L_16_blocks_overflow_Aahazrycncacmjd:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+.L_16_blocks_ok_Aahazrycncacmjd:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%xmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %xmm30,%xmm4,%xmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %xmm31,%xmm4,%xmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %xmm30,%xmm4,%xmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %xmm20,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %xmm29,%xmm20,%xmm20
+ vextracti32x4 $0,%zmm20,%xmm7
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_xijDGphAfrrjvcn
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_xijDGphAfrrjvcn
+.L_small_initial_partial_block_xijDGphAfrrjvcn:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_xijDGphAfrrjvcn:
+
+ orq %r8,%r8
+ je .L_after_reduction_xijDGphAfrrjvcn
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_xijDGphAfrrjvcn:
+ jmp .L_last_blocks_done_qdswuDcxyhGmasp
+.L_last_num_blocks_is_10_qdswuDcxyhGmasp:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $246,%r15d
+ jae .L_16_blocks_overflow_hkbadvpbxvroayG
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %ymm27,%ymm3,%ymm4
+ jmp .L_16_blocks_ok_hkbadvpbxvroayG
+
+.L_16_blocks_overflow_hkbadvpbxvroayG:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+.L_16_blocks_ok_hkbadvpbxvroayG:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%ymm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %ymm30,%ymm4,%ymm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %ymm31,%ymm4,%ymm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %ymm30,%ymm4,%ymm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %ymm20,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %ymm29,%ymm20,%ymm20
+ vextracti32x4 $1,%zmm20,%xmm7
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_oahmBbxzjdosefa
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_oahmBbxzjdosefa
+.L_small_initial_partial_block_oahmBbxzjdosefa:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm20,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm20,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm20,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm20,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_oahmBbxzjdosefa:
+
+ orq %r8,%r8
+ je .L_after_reduction_oahmBbxzjdosefa
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_oahmBbxzjdosefa:
+ jmp .L_last_blocks_done_qdswuDcxyhGmasp
+.L_last_num_blocks_is_11_qdswuDcxyhGmasp:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $245,%r15d
+ jae .L_16_blocks_overflow_FsdwrjvehsptDBd
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_FsdwrjvehsptDBd
+
+.L_16_blocks_overflow_FsdwrjvehsptDBd:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_FsdwrjvehsptDBd:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $2,%zmm20,%xmm7
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_yodgBeqbEhheCDd
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_yodgBeqbEhheCDd
+.L_small_initial_partial_block_yodgBeqbEhheCDd:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm20,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm20,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm20,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm20,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_yodgBeqbEhheCDd:
+
+ orq %r8,%r8
+ je .L_after_reduction_yodgBeqbEhheCDd
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_yodgBeqbEhheCDd:
+ jmp .L_last_blocks_done_qdswuDcxyhGmasp
+.L_last_num_blocks_is_12_qdswuDcxyhGmasp:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $128,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $244,%r15d
+ jae .L_16_blocks_overflow_thkeiGylBuuojur
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ jmp .L_16_blocks_ok_thkeiGylBuuojur
+
+.L_16_blocks_overflow_thkeiGylBuuojur:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+.L_16_blocks_ok_thkeiGylBuuojur:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm20,%zmm20{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vextracti32x4 $3,%zmm20,%xmm7
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_yzbzfadAzvvaytc
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_yzbzfadAzvvaytc
+.L_small_initial_partial_block_yzbzfadAzvvaytc:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vpxorq %zmm8,%zmm0,%zmm8
+ vpxorq %zmm22,%zmm3,%zmm22
+ vpxorq %zmm30,%zmm4,%zmm30
+ vpxorq %zmm31,%zmm5,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_yzbzfadAzvvaytc:
+
+ orq %r8,%r8
+ je .L_after_reduction_yzbzfadAzvvaytc
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_yzbzfadAzvvaytc:
+ jmp .L_last_blocks_done_qdswuDcxyhGmasp
+.L_last_num_blocks_is_13_qdswuDcxyhGmasp:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $243,%r15d
+ jae .L_16_blocks_overflow_eFxvoygBEBGohmA
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %xmm27,%xmm4,%xmm5
+ jmp .L_16_blocks_ok_eFxvoygBEBGohmA
+
+.L_16_blocks_overflow_eFxvoygBEBGohmA:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+.L_16_blocks_ok_eFxvoygBEBGohmA:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%xmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %xmm30,%xmm5,%xmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %xmm31,%xmm5,%xmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %xmm30,%xmm5,%xmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %xmm21,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %xmm29,%xmm21,%xmm21
+ vextracti32x4 $0,%zmm21,%xmm7
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_zzewAuyevyjoCwC
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_zzewAuyevyjoCwC
+.L_small_initial_partial_block_zzewAuyevyjoCwC:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 160(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 224(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 288(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+
+ vpxorq %zmm26,%zmm30,%zmm30
+ vpxorq %zmm24,%zmm8,%zmm8
+ vpxorq %zmm25,%zmm22,%zmm22
+
+ vpxorq %zmm31,%zmm30,%zmm30
+ vpsrldq $8,%zmm30,%zmm4
+ vpslldq $8,%zmm30,%zmm5
+ vpxorq %zmm4,%zmm8,%zmm0
+ vpxorq %zmm5,%zmm22,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_zzewAuyevyjoCwC:
+
+ orq %r8,%r8
+ je .L_after_reduction_zzewAuyevyjoCwC
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_zzewAuyevyjoCwC:
+ jmp .L_last_blocks_done_qdswuDcxyhGmasp
+.L_last_num_blocks_is_14_qdswuDcxyhGmasp:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $242,%r15d
+ jae .L_16_blocks_overflow_wcubmfDtExvnDlb
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %ymm27,%ymm4,%ymm5
+ jmp .L_16_blocks_ok_wcubmfDtExvnDlb
+
+.L_16_blocks_overflow_wcubmfDtExvnDlb:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+.L_16_blocks_ok_wcubmfDtExvnDlb:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%ymm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %ymm30,%ymm5,%ymm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %ymm31,%ymm5,%ymm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %ymm30,%ymm5,%ymm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %ymm21,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %ymm29,%ymm21,%ymm21
+ vextracti32x4 $1,%zmm21,%xmm7
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_wbcvGrEDxndwxqw
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_wbcvGrEDxndwxqw
+.L_small_initial_partial_block_wbcvGrEDxndwxqw:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 144(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 208(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 272(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 336(%rsi),%xmm1
+ vpclmulqdq $0x01,%xmm1,%xmm21,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm21,%xmm5
+ vpclmulqdq $0x11,%xmm1,%xmm21,%xmm0
+ vpclmulqdq $0x00,%xmm1,%xmm21,%xmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_wbcvGrEDxndwxqw:
+
+ orq %r8,%r8
+ je .L_after_reduction_wbcvGrEDxndwxqw
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_wbcvGrEDxndwxqw:
+ jmp .L_last_blocks_done_qdswuDcxyhGmasp
+.L_last_num_blocks_is_15_qdswuDcxyhGmasp:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $241,%r15d
+ jae .L_16_blocks_overflow_hDvByfpahyymzEv
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_hDvByfpahyymzEv
+
+.L_16_blocks_overflow_hDvByfpahyymzEv:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_hDvByfpahyymzEv:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $2,%zmm21,%xmm7
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_uAckhsjfbEBxdkE
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_uAckhsjfbEBxdkE
+.L_small_initial_partial_block_uAckhsjfbEBxdkE:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 128(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 192(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 256(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 320(%rsi),%ymm1
+ vpclmulqdq $0x01,%ymm1,%ymm21,%ymm4
+ vpclmulqdq $0x10,%ymm1,%ymm21,%ymm5
+ vpclmulqdq $0x11,%ymm1,%ymm21,%ymm0
+ vpclmulqdq $0x00,%ymm1,%ymm21,%ymm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_uAckhsjfbEBxdkE:
+
+ orq %r8,%r8
+ je .L_after_reduction_uAckhsjfbEBxdkE
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_uAckhsjfbEBxdkE:
+ jmp .L_last_blocks_done_qdswuDcxyhGmasp
+.L_last_num_blocks_is_16_qdswuDcxyhGmasp:
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%rax
+ subq $192,%rax
+ kmovq (%r10,%rax,8),%k1
+ cmpl $240,%r15d
+ jae .L_16_blocks_overflow_rnhelBbtegFkzjj
+ vpaddd %zmm28,%zmm2,%zmm0
+ vpaddd %zmm27,%zmm0,%zmm3
+ vpaddd %zmm27,%zmm3,%zmm4
+ vpaddd %zmm27,%zmm4,%zmm5
+ jmp .L_16_blocks_ok_rnhelBbtegFkzjj
+
+.L_16_blocks_overflow_rnhelBbtegFkzjj:
+ vpshufb %zmm29,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vmovdqa64 ddq_add_4444(%rip),%zmm5
+ vpaddd %zmm5,%zmm0,%zmm3
+ vpaddd %zmm5,%zmm3,%zmm4
+ vpaddd %zmm5,%zmm4,%zmm5
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+.L_16_blocks_ok_rnhelBbtegFkzjj:
+
+
+
+
+ vbroadcastf64x2 0(%rdi),%zmm30
+ vpxorq 768(%rsp),%zmm14,%zmm8
+ vmovdqu64 0(%rsp,%rbx,1),%zmm1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+
+
+ vbroadcastf64x2 16(%rdi),%zmm31
+ vmovdqu64 64(%rsp,%rbx,1),%zmm18
+ vmovdqa64 832(%rsp),%zmm22
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm30,%zmm3,%zmm3
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpxorq %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm30
+
+
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm14
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm7
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm10
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm11
+ vmovdqu64 128(%rsp,%rbx,1),%zmm1
+ vmovdqa64 896(%rsp),%zmm8
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm18
+ vmovdqa64 960(%rsp),%zmm22
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm30
+
+
+ vpclmulqdq $0x10,%zmm1,%zmm8,%zmm20
+ vpclmulqdq $0x01,%zmm1,%zmm8,%zmm21
+ vpclmulqdq $0x11,%zmm1,%zmm8,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm8,%zmm19
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm31
+
+
+ vpternlogq $0x96,%zmm17,%zmm12,%zmm14
+ vpternlogq $0x96,%zmm19,%zmm13,%zmm7
+ vpternlogq $0x96,%zmm21,%zmm16,%zmm11
+ vpternlogq $0x96,%zmm20,%zmm15,%zmm10
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm30
+ vmovdqu8 0(%rcx,%r11,1),%zmm17
+ vmovdqu8 64(%rcx,%r11,1),%zmm19
+ vmovdqu8 128(%rcx,%r11,1),%zmm20
+ vmovdqu8 192(%rcx,%r11,1),%zmm21{%k1}{z}
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm31
+
+
+ vpclmulqdq $0x10,%zmm18,%zmm22,%zmm15
+ vpclmulqdq $0x01,%zmm18,%zmm22,%zmm16
+ vpclmulqdq $0x11,%zmm18,%zmm22,%zmm12
+ vpclmulqdq $0x00,%zmm18,%zmm22,%zmm13
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm30
+ vpternlogq $0x96,%zmm16,%zmm11,%zmm10
+ vpxorq %zmm12,%zmm14,%zmm24
+ vpxorq %zmm13,%zmm7,%zmm25
+ vpxorq %zmm15,%zmm10,%zmm26
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm31
+ vaesenc %zmm30,%zmm0,%zmm0
+ vaesenc %zmm30,%zmm3,%zmm3
+ vaesenc %zmm30,%zmm4,%zmm4
+ vaesenc %zmm30,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm30
+ vaesenc %zmm31,%zmm0,%zmm0
+ vaesenc %zmm31,%zmm3,%zmm3
+ vaesenc %zmm31,%zmm4,%zmm4
+ vaesenc %zmm31,%zmm5,%zmm5
+ vaesenclast %zmm30,%zmm0,%zmm0
+ vaesenclast %zmm30,%zmm3,%zmm3
+ vaesenclast %zmm30,%zmm4,%zmm4
+ vaesenclast %zmm30,%zmm5,%zmm5
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vpxorq %zmm20,%zmm4,%zmm4
+ vpxorq %zmm21,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm11
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm21,%zmm21{%k1}{z}
+ vpshufb %zmm29,%zmm17,%zmm17
+ vpshufb %zmm29,%zmm19,%zmm19
+ vpshufb %zmm29,%zmm20,%zmm20
+ vpshufb %zmm29,%zmm21,%zmm21
+ vextracti32x4 $3,%zmm21,%xmm7
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_wEgqnyhjgyEjfkm:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm11,16(%rsi)
+ vmovdqu64 112(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm17,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm17,%zmm3
+ vpclmulqdq $0x01,%zmm1,%zmm17,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm17,%zmm5
+ vmovdqu64 176(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm19,%zmm8
+ vpclmulqdq $0x00,%zmm1,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm19,%zmm30
+ vpclmulqdq $0x10,%zmm1,%zmm19,%zmm31
+ vmovdqu64 240(%rsi),%zmm1
+ vpclmulqdq $0x11,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x00,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm0,%zmm17,%zmm8
+ vpternlogq $0x96,%zmm3,%zmm19,%zmm22
+ vpclmulqdq $0x01,%zmm1,%zmm20,%zmm17
+ vpclmulqdq $0x10,%zmm1,%zmm20,%zmm19
+ vpternlogq $0x96,%zmm4,%zmm17,%zmm30
+ vpternlogq $0x96,%zmm5,%zmm19,%zmm31
+ vmovdqu64 304(%rsi),%ymm1
+ vinserti64x2 $2,336(%rsi),%zmm1,%zmm1
+ vpclmulqdq $0x01,%zmm1,%zmm21,%zmm4
+ vpclmulqdq $0x10,%zmm1,%zmm21,%zmm5
+ vpclmulqdq $0x11,%zmm1,%zmm21,%zmm0
+ vpclmulqdq $0x00,%zmm1,%zmm21,%zmm3
+
+ vpxorq %zmm30,%zmm4,%zmm4
+ vpternlogq $0x96,%zmm31,%zmm26,%zmm5
+ vpternlogq $0x96,%zmm8,%zmm24,%zmm0
+ vpternlogq $0x96,%zmm22,%zmm25,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm30
+ vpslldq $8,%zmm4,%zmm31
+ vpxorq %zmm30,%zmm0,%zmm0
+ vpxorq %zmm31,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm30
+ vpxorq %ymm30,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm30
+ vpxorq %xmm30,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm31
+ vpxorq %ymm31,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm31
+ vpxorq %xmm31,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm1
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm1,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm1,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm1,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_wEgqnyhjgyEjfkm:
+ vpxorq %xmm7,%xmm14,%xmm14
+.L_after_reduction_wEgqnyhjgyEjfkm:
+ jmp .L_last_blocks_done_qdswuDcxyhGmasp
+.L_last_num_blocks_is_0_qdswuDcxyhGmasp:
+ vmovdqa64 768(%rsp),%zmm13
+ vpxorq %zmm14,%zmm13,%zmm13
+ vmovdqu64 0(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 832(%rsp),%zmm13
+ vmovdqu64 64(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+ vpxorq %zmm10,%zmm4,%zmm26
+ vpxorq %zmm6,%zmm0,%zmm24
+ vpxorq %zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+ vmovdqa64 896(%rsp),%zmm13
+ vmovdqu64 128(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm0
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm3
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm4
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm5
+ vmovdqa64 960(%rsp),%zmm13
+ vmovdqu64 192(%rsp,%rbx,1),%zmm12
+ vpclmulqdq $0x11,%zmm12,%zmm13,%zmm6
+ vpclmulqdq $0x00,%zmm12,%zmm13,%zmm7
+ vpclmulqdq $0x01,%zmm12,%zmm13,%zmm10
+ vpclmulqdq $0x10,%zmm12,%zmm13,%zmm11
+
+ vpternlogq $0x96,%zmm10,%zmm4,%zmm26
+ vpternlogq $0x96,%zmm6,%zmm0,%zmm24
+ vpternlogq $0x96,%zmm7,%zmm3,%zmm25
+ vpternlogq $0x96,%zmm11,%zmm5,%zmm26
+
+ vpsrldq $8,%zmm26,%zmm0
+ vpslldq $8,%zmm26,%zmm3
+ vpxorq %zmm0,%zmm24,%zmm24
+ vpxorq %zmm3,%zmm25,%zmm25
+ vextracti64x4 $1,%zmm24,%ymm0
+ vpxorq %ymm0,%ymm24,%ymm24
+ vextracti32x4 $1,%ymm24,%xmm0
+ vpxorq %xmm0,%xmm24,%xmm24
+ vextracti64x4 $1,%zmm25,%ymm3
+ vpxorq %ymm3,%ymm25,%ymm25
+ vextracti32x4 $1,%ymm25,%xmm3
+ vpxorq %xmm3,%xmm25,%xmm25
+ vmovdqa64 POLY2(%rip),%xmm4
+
+
+ vpclmulqdq $0x01,%xmm25,%xmm4,%xmm0
+ vpslldq $8,%xmm0,%xmm0
+ vpxorq %xmm0,%xmm25,%xmm0
+
+
+ vpclmulqdq $0x00,%xmm0,%xmm4,%xmm3
+ vpsrldq $4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm0,%xmm4,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm24,%xmm3,%xmm14
+
+.L_last_blocks_done_qdswuDcxyhGmasp:
+ vpshufb %xmm29,%xmm2,%xmm2
+ jmp .L_ghash_done_kgypzeldFqsBnqw
+
+.L_message_below_equal_16_blocks_kgypzeldFqsBnqw:
+
+
+ movl %r8d,%r12d
+ addl $15,%r12d
+ shrl $4,%r12d
+ cmpq $8,%r12
+ je .L_small_initial_num_blocks_is_8_uBFzjxzanCsxGGe
+ jl .L_small_initial_num_blocks_is_7_1_uBFzjxzanCsxGGe
+
+
+ cmpq $12,%r12
+ je .L_small_initial_num_blocks_is_12_uBFzjxzanCsxGGe
+ jl .L_small_initial_num_blocks_is_11_9_uBFzjxzanCsxGGe
+
+
+ cmpq $16,%r12
+ je .L_small_initial_num_blocks_is_16_uBFzjxzanCsxGGe
+ cmpq $15,%r12
+ je .L_small_initial_num_blocks_is_15_uBFzjxzanCsxGGe
+ cmpq $14,%r12
+ je .L_small_initial_num_blocks_is_14_uBFzjxzanCsxGGe
+ jmp .L_small_initial_num_blocks_is_13_uBFzjxzanCsxGGe
+
+.L_small_initial_num_blocks_is_11_9_uBFzjxzanCsxGGe:
+
+ cmpq $11,%r12
+ je .L_small_initial_num_blocks_is_11_uBFzjxzanCsxGGe
+ cmpq $10,%r12
+ je .L_small_initial_num_blocks_is_10_uBFzjxzanCsxGGe
+ jmp .L_small_initial_num_blocks_is_9_uBFzjxzanCsxGGe
+
+.L_small_initial_num_blocks_is_7_1_uBFzjxzanCsxGGe:
+ cmpq $4,%r12
+ je .L_small_initial_num_blocks_is_4_uBFzjxzanCsxGGe
+ jl .L_small_initial_num_blocks_is_3_1_uBFzjxzanCsxGGe
+
+ cmpq $7,%r12
+ je .L_small_initial_num_blocks_is_7_uBFzjxzanCsxGGe
+ cmpq $6,%r12
+ je .L_small_initial_num_blocks_is_6_uBFzjxzanCsxGGe
+ jmp .L_small_initial_num_blocks_is_5_uBFzjxzanCsxGGe
+
+.L_small_initial_num_blocks_is_3_1_uBFzjxzanCsxGGe:
+
+ cmpq $3,%r12
+ je .L_small_initial_num_blocks_is_3_uBFzjxzanCsxGGe
+ cmpq $2,%r12
+ je .L_small_initial_num_blocks_is_2_uBFzjxzanCsxGGe
+
+
+
+
+
+.L_small_initial_num_blocks_is_1_uBFzjxzanCsxGGe:
+ vmovdqa64 SHUF_MASK(%rip),%xmm29
+ vpaddd ONE(%rip),%xmm2,%xmm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm0,%xmm2
+ vpshufb %xmm29,%xmm0,%xmm0
+ vmovdqu8 0(%rcx,%r11,1),%xmm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %xmm15,%xmm0,%xmm0
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %xmm15,%xmm0,%xmm0
+ vpxorq %xmm6,%xmm0,%xmm0
+ vextracti32x4 $0,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %xmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %xmm29,%xmm6,%xmm6
+ vextracti32x4 $0,%zmm6,%xmm13
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_usvkeoywsioAnfD
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_usvkeoywsioAnfD
+.L_small_initial_partial_block_usvkeoywsioAnfD:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+
+
+
+
+
+
+
+
+
+
+
+ vpxorq %xmm13,%xmm14,%xmm14
+
+ jmp .L_after_reduction_usvkeoywsioAnfD
+.L_small_initial_compute_done_usvkeoywsioAnfD:
+.L_after_reduction_usvkeoywsioAnfD:
+ jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe
+.L_small_initial_num_blocks_is_2_uBFzjxzanCsxGGe:
+ vmovdqa64 SHUF_MASK(%rip),%ymm29
+ vshufi64x2 $0,%ymm2,%ymm2,%ymm0
+ vpaddd ddq_add_1234(%rip),%ymm0,%ymm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm0,%xmm2
+ vpshufb %ymm29,%ymm0,%ymm0
+ vmovdqu8 0(%rcx,%r11,1),%ymm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %ymm15,%ymm0,%ymm0
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %ymm15,%ymm0,%ymm0
+ vpxorq %ymm6,%ymm0,%ymm0
+ vextracti32x4 $1,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %ymm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %ymm29,%ymm6,%ymm6
+ vextracti32x4 $1,%zmm6,%xmm13
+ subq $16 * (2 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_yvjeqFrhsrkxcss
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_yvjeqFrhsrkxcss
+.L_small_initial_partial_block_yvjeqFrhsrkxcss:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm6,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm6,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm6,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm6,%xmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_yvjeqFrhsrkxcss:
+
+ orq %r8,%r8
+ je .L_after_reduction_yvjeqFrhsrkxcss
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_yvjeqFrhsrkxcss:
+ jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe
+.L_small_initial_num_blocks_is_3_uBFzjxzanCsxGGe:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm0,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vpxorq %zmm6,%zmm0,%zmm0
+ vextracti32x4 $2,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vextracti32x4 $2,%zmm6,%xmm13
+ subq $16 * (3 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_mvdynCrzwGwegAr
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_mvdynCrzwGwegAr
+.L_small_initial_partial_block_mvdynCrzwGwegAr:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm6,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm6,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm6,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm6,%ymm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_mvdynCrzwGwegAr:
+
+ orq %r8,%r8
+ je .L_after_reduction_mvdynCrzwGwegAr
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_mvdynCrzwGwegAr:
+ jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe
+.L_small_initial_num_blocks_is_4_uBFzjxzanCsxGGe:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm0,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vmovdqu8 0(%rcx,%r11,1),%zmm6{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vpxorq %zmm6,%zmm0,%zmm0
+ vextracti32x4 $3,%zmm0,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm0,%zmm0{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vextracti32x4 $3,%zmm6,%xmm13
+ subq $16 * (4 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_pjDzAfyivuABgdr
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_pjDzAfyivuABgdr
+.L_small_initial_partial_block_pjDzAfyivuABgdr:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_pjDzAfyivuABgdr:
+
+ orq %r8,%r8
+ je .L_after_reduction_pjDzAfyivuABgdr
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_pjDzAfyivuABgdr:
+ jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe
+.L_small_initial_num_blocks_is_5_uBFzjxzanCsxGGe:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %xmm29,%xmm3,%xmm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%xmm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %xmm15,%xmm3,%xmm3
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %xmm15,%xmm3,%xmm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %xmm7,%xmm3,%xmm3
+ vextracti32x4 $0,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %xmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %xmm29,%xmm7,%xmm7
+ vextracti32x4 $0,%zmm7,%xmm13
+ subq $16 * (5 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_fcBludqftzBwbAa
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_fcBludqftzBwbAa
+.L_small_initial_partial_block_fcBludqftzBwbAa:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_fcBludqftzBwbAa:
+
+ orq %r8,%r8
+ je .L_after_reduction_fcBludqftzBwbAa
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_fcBludqftzBwbAa:
+ jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe
+.L_small_initial_num_blocks_is_6_uBFzjxzanCsxGGe:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %ymm29,%ymm3,%ymm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%ymm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %ymm15,%ymm3,%ymm3
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %ymm15,%ymm3,%ymm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %ymm7,%ymm3,%ymm3
+ vextracti32x4 $1,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %ymm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %ymm29,%ymm7,%ymm7
+ vextracti32x4 $1,%zmm7,%xmm13
+ subq $16 * (6 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_gpklsvBmbaGumBx
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_gpklsvBmbaGumBx
+.L_small_initial_partial_block_gpklsvBmbaGumBx:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm7,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm7,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm7,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm7,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_gpklsvBmbaGumBx:
+
+ orq %r8,%r8
+ je .L_after_reduction_gpklsvBmbaGumBx
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_gpklsvBmbaGumBx:
+ jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe
+.L_small_initial_num_blocks_is_7_uBFzjxzanCsxGGe:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vextracti32x4 $2,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vextracti32x4 $2,%zmm7,%xmm13
+ subq $16 * (7 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_fFxDDorEtzfbsCi
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_fFxDDorEtzfbsCi
+.L_small_initial_partial_block_fFxDDorEtzfbsCi:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm7,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm7,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm7,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm7,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_fFxDDorEtzfbsCi:
+
+ orq %r8,%r8
+ je .L_after_reduction_fFxDDorEtzfbsCi
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_fFxDDorEtzfbsCi:
+ jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe
+.L_small_initial_num_blocks_is_8_uBFzjxzanCsxGGe:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $64,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm3,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vextracti32x4 $3,%zmm3,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm3,%zmm3{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vextracti32x4 $3,%zmm7,%xmm13
+ subq $16 * (8 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_mhgromrjcFpqAxA
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_mhgromrjcFpqAxA
+.L_small_initial_partial_block_mhgromrjcFpqAxA:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_mhgromrjcFpqAxA:
+
+ orq %r8,%r8
+ je .L_after_reduction_mhgromrjcFpqAxA
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_mhgromrjcFpqAxA:
+ jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe
+.L_small_initial_num_blocks_is_9_uBFzjxzanCsxGGe:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %xmm29,%xmm4,%xmm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%xmm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %xmm15,%xmm4,%xmm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %xmm10,%xmm4,%xmm4
+ vextracti32x4 $0,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %xmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %xmm29,%xmm10,%xmm10
+ vextracti32x4 $0,%zmm10,%xmm13
+ subq $16 * (9 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_eghzedifwilpnEF
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_eghzedifwilpnEF
+.L_small_initial_partial_block_eghzedifwilpnEF:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_eghzedifwilpnEF:
+
+ orq %r8,%r8
+ je .L_after_reduction_eghzedifwilpnEF
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_eghzedifwilpnEF:
+ jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe
+.L_small_initial_num_blocks_is_10_uBFzjxzanCsxGGe:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %ymm29,%ymm4,%ymm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%ymm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %ymm15,%ymm4,%ymm4
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %ymm15,%ymm4,%ymm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %ymm10,%ymm4,%ymm4
+ vextracti32x4 $1,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %ymm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %ymm29,%ymm10,%ymm10
+ vextracti32x4 $1,%zmm10,%xmm13
+ subq $16 * (10 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_aBEqcFFmwBplgFE
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_aBEqcFFmwBplgFE
+.L_small_initial_partial_block_aBEqcFFmwBplgFE:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm10,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm10,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm10,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm10,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_aBEqcFFmwBplgFE:
+
+ orq %r8,%r8
+ je .L_after_reduction_aBEqcFFmwBplgFE
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_aBEqcFFmwBplgFE:
+ jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe
+.L_small_initial_num_blocks_is_11_uBFzjxzanCsxGGe:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vextracti32x4 $2,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vextracti32x4 $2,%zmm10,%xmm13
+ subq $16 * (11 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_ozteDdAwrbobDia
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_ozteDdAwrbobDia
+.L_small_initial_partial_block_ozteDdAwrbobDia:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm10,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm10,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm10,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm10,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_ozteDdAwrbobDia:
+
+ orq %r8,%r8
+ je .L_after_reduction_ozteDdAwrbobDia
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_ozteDdAwrbobDia:
+ jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe
+.L_small_initial_num_blocks_is_12_uBFzjxzanCsxGGe:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $128,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm4,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vextracti32x4 $3,%zmm4,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm4,%zmm4{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vextracti32x4 $3,%zmm10,%xmm13
+ subq $16 * (12 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_xaldGCCAFmcudnD
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 160(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_xaldGCCAFmcudnD
+.L_small_initial_partial_block_xaldGCCAFmcudnD:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vpxorq %zmm15,%zmm0,%zmm15
+ vpxorq %zmm16,%zmm3,%zmm16
+ vpxorq %zmm17,%zmm4,%zmm17
+ vpxorq %zmm19,%zmm5,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_xaldGCCAFmcudnD:
+
+ orq %r8,%r8
+ je .L_after_reduction_xaldGCCAFmcudnD
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_xaldGCCAFmcudnD:
+ jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe
+.L_small_initial_num_blocks_is_13_uBFzjxzanCsxGGe:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $0,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %xmm29,%xmm5,%xmm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%xmm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %xmm15,%xmm5,%xmm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %xmm11,%xmm5,%xmm5
+ vextracti32x4 $0,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %xmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %xmm29,%xmm11,%xmm11
+ vextracti32x4 $0,%zmm11,%xmm13
+ subq $16 * (13 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_txhExvepwglFbiC
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 144(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_txhExvepwglFbiC
+.L_small_initial_partial_block_txhExvepwglFbiC:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 160(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 224(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 288(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+
+ vpxorq %zmm19,%zmm17,%zmm17
+ vpsrldq $8,%zmm17,%zmm4
+ vpslldq $8,%zmm17,%zmm5
+ vpxorq %zmm4,%zmm15,%zmm0
+ vpxorq %zmm5,%zmm16,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_txhExvepwglFbiC:
+
+ orq %r8,%r8
+ je .L_after_reduction_txhExvepwglFbiC
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_txhExvepwglFbiC:
+ jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe
+.L_small_initial_num_blocks_is_14_uBFzjxzanCsxGGe:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $1,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %ymm29,%ymm5,%ymm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%ymm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %ymm15,%ymm5,%ymm5
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %ymm15,%ymm5,%ymm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %ymm11,%ymm5,%ymm5
+ vextracti32x4 $1,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %ymm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %ymm29,%ymm11,%ymm11
+ vextracti32x4 $1,%zmm11,%xmm13
+ subq $16 * (14 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_usDayEFvfwmlydb
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 128(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_usDayEFvfwmlydb
+.L_small_initial_partial_block_usDayEFvfwmlydb:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 144(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 208(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 272(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 336(%rsi),%xmm20
+ vpclmulqdq $0x01,%xmm20,%xmm11,%xmm4
+ vpclmulqdq $0x10,%xmm20,%xmm11,%xmm5
+ vpclmulqdq $0x11,%xmm20,%xmm11,%xmm0
+ vpclmulqdq $0x00,%xmm20,%xmm11,%xmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_usDayEFvfwmlydb:
+
+ orq %r8,%r8
+ je .L_after_reduction_usDayEFvfwmlydb
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_usDayEFvfwmlydb:
+ jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe
+.L_small_initial_num_blocks_is_15_uBFzjxzanCsxGGe:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $2,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %zmm15,%zmm5,%zmm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %zmm11,%zmm5,%zmm5
+ vextracti32x4 $2,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vextracti32x4 $2,%zmm11,%xmm13
+ subq $16 * (15 - 1),%r8
+
+
+ cmpq $16,%r8
+ jl .L_small_initial_partial_block_DrCACnmarBwymye
+
+
+
+
+
+ subq $16,%r8
+ movq $0,(%rdx)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 112(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+ jmp .L_small_initial_compute_done_DrCACnmarBwymye
+.L_small_initial_partial_block_DrCACnmarBwymye:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 128(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 192(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 256(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 320(%rsi),%ymm20
+ vpclmulqdq $0x01,%ymm20,%ymm11,%ymm4
+ vpclmulqdq $0x10,%ymm20,%ymm11,%ymm5
+ vpclmulqdq $0x11,%ymm20,%ymm11,%ymm0
+ vpclmulqdq $0x00,%ymm20,%ymm11,%ymm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_DrCACnmarBwymye:
+
+ orq %r8,%r8
+ je .L_after_reduction_DrCACnmarBwymye
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_DrCACnmarBwymye:
+ jmp .L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe
+.L_small_initial_num_blocks_is_16_uBFzjxzanCsxGGe:
+ vmovdqa64 SHUF_MASK(%rip),%zmm29
+ vshufi64x2 $0,%zmm2,%zmm2,%zmm2
+ vpaddd ddq_add_1234(%rip),%zmm2,%zmm0
+ vpaddd ddq_add_5678(%rip),%zmm2,%zmm3
+ vpaddd ddq_add_8888(%rip),%zmm0,%zmm4
+ vpaddd ddq_add_8888(%rip),%zmm3,%zmm5
+ leaq byte64_len_to_mask_table(%rip),%r10
+ movq %r8,%r15
+ subq $192,%r15
+ kmovq (%r10,%r15,8),%k1
+ vextracti32x4 $3,%zmm5,%xmm2
+ vpshufb %zmm29,%zmm0,%zmm0
+ vpshufb %zmm29,%zmm3,%zmm3
+ vpshufb %zmm29,%zmm4,%zmm4
+ vpshufb %zmm29,%zmm5,%zmm5
+ vmovdqu8 0(%rcx,%r11,1),%zmm6
+ vmovdqu8 64(%rcx,%r11,1),%zmm7
+ vmovdqu8 128(%rcx,%r11,1),%zmm10
+ vmovdqu8 192(%rcx,%r11,1),%zmm11{%k1}{z}
+ vbroadcastf64x2 0(%rdi),%zmm15
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm15,%zmm3,%zmm3
+ vpxorq %zmm15,%zmm4,%zmm4
+ vpxorq %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 16(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 32(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 48(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 64(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 80(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 96(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 112(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 128(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 144(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 160(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 176(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 192(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 208(%rdi),%zmm15
+ vaesenc %zmm15,%zmm0,%zmm0
+ vaesenc %zmm15,%zmm3,%zmm3
+ vaesenc %zmm15,%zmm4,%zmm4
+ vaesenc %zmm15,%zmm5,%zmm5
+ vbroadcastf64x2 224(%rdi),%zmm15
+ vaesenclast %zmm15,%zmm0,%zmm0
+ vaesenclast %zmm15,%zmm3,%zmm3
+ vaesenclast %zmm15,%zmm4,%zmm4
+ vaesenclast %zmm15,%zmm5,%zmm5
+ vpxorq %zmm6,%zmm0,%zmm0
+ vpxorq %zmm7,%zmm3,%zmm3
+ vpxorq %zmm10,%zmm4,%zmm4
+ vpxorq %zmm11,%zmm5,%zmm5
+ vextracti32x4 $3,%zmm5,%xmm12
+ movq %r9,%r10
+ vmovdqu8 %zmm0,0(%r10,%r11,1)
+ vmovdqu8 %zmm3,64(%r10,%r11,1)
+ vmovdqu8 %zmm4,128(%r10,%r11,1)
+ vmovdqu8 %zmm5,192(%r10,%r11,1){%k1}
+ vmovdqu8 %zmm5,%zmm5{%k1}{z}
+ vpshufb %zmm29,%zmm6,%zmm6
+ vpshufb %zmm29,%zmm7,%zmm7
+ vpshufb %zmm29,%zmm10,%zmm10
+ vpshufb %zmm29,%zmm11,%zmm11
+ vextracti32x4 $3,%zmm11,%xmm13
+ subq $16 * (16 - 1),%r8
+.L_small_initial_partial_block_khwfpcqckgAmFnr:
+
+
+
+
+
+
+
+
+ movq %r8,(%rdx)
+ vmovdqu64 %xmm12,16(%rsi)
+ vpxorq %zmm14,%zmm6,%zmm6
+ vmovdqu64 112(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm6,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm6,%zmm3
+ vpclmulqdq $0x01,%zmm20,%zmm6,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm6,%zmm5
+ vmovdqu64 176(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm7,%zmm15
+ vpclmulqdq $0x00,%zmm20,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm7,%zmm17
+ vpclmulqdq $0x10,%zmm20,%zmm7,%zmm19
+ vmovdqu64 240(%rsi),%zmm20
+ vpclmulqdq $0x11,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x00,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm0,%zmm6,%zmm15
+ vpternlogq $0x96,%zmm3,%zmm7,%zmm16
+ vpclmulqdq $0x01,%zmm20,%zmm10,%zmm6
+ vpclmulqdq $0x10,%zmm20,%zmm10,%zmm7
+ vpternlogq $0x96,%zmm4,%zmm6,%zmm17
+ vpternlogq $0x96,%zmm5,%zmm7,%zmm19
+ vmovdqu64 304(%rsi),%ymm20
+ vinserti64x2 $2,336(%rsi),%zmm20,%zmm20
+ vpclmulqdq $0x01,%zmm20,%zmm11,%zmm4
+ vpclmulqdq $0x10,%zmm20,%zmm11,%zmm5
+ vpclmulqdq $0x11,%zmm20,%zmm11,%zmm0
+ vpclmulqdq $0x00,%zmm20,%zmm11,%zmm3
+
+ vpxorq %zmm17,%zmm4,%zmm4
+ vpxorq %zmm19,%zmm5,%zmm5
+ vpxorq %zmm15,%zmm0,%zmm0
+ vpxorq %zmm16,%zmm3,%zmm3
+
+ vpxorq %zmm5,%zmm4,%zmm4
+ vpsrldq $8,%zmm4,%zmm17
+ vpslldq $8,%zmm4,%zmm19
+ vpxorq %zmm17,%zmm0,%zmm0
+ vpxorq %zmm19,%zmm3,%zmm3
+ vextracti64x4 $1,%zmm0,%ymm17
+ vpxorq %ymm17,%ymm0,%ymm0
+ vextracti32x4 $1,%ymm0,%xmm17
+ vpxorq %xmm17,%xmm0,%xmm0
+ vextracti64x4 $1,%zmm3,%ymm19
+ vpxorq %ymm19,%ymm3,%ymm3
+ vextracti32x4 $1,%ymm3,%xmm19
+ vpxorq %xmm19,%xmm3,%xmm3
+ vmovdqa64 POLY2(%rip),%xmm20
+
+
+ vpclmulqdq $0x01,%xmm3,%xmm20,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm3,%xmm4
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm20,%xmm5
+ vpsrldq $4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm4,%xmm20,%xmm14
+ vpslldq $4,%xmm14,%xmm14
+ vpternlogq $0x96,%xmm0,%xmm5,%xmm14
+
+.L_small_initial_compute_done_khwfpcqckgAmFnr:
+ vpxorq %xmm13,%xmm14,%xmm14
+.L_after_reduction_khwfpcqckgAmFnr:
+.L_small_initial_blocks_encrypted_uBFzjxzanCsxGGe:
+.L_ghash_done_kgypzeldFqsBnqw:
+ vmovdqu64 %xmm2,0(%rsi)
+ vmovdqu64 %xmm14,64(%rsi)
+.L_enc_dec_done_kgypzeldFqsBnqw:
+ jmp .Lexit_gcm_decrypt
+.Lexit_gcm_decrypt:
+ cmpq $256,%r8
+ jbe .Lskip_hkeys_cleanup_cdrboBdzwmggbeq
+ vpxor %xmm0,%xmm0,%xmm0
+ vmovdqa64 %zmm0,0(%rsp)
+ vmovdqa64 %zmm0,64(%rsp)
+ vmovdqa64 %zmm0,128(%rsp)
+ vmovdqa64 %zmm0,192(%rsp)
+ vmovdqa64 %zmm0,256(%rsp)
+ vmovdqa64 %zmm0,320(%rsp)
+ vmovdqa64 %zmm0,384(%rsp)
+ vmovdqa64 %zmm0,448(%rsp)
+ vmovdqa64 %zmm0,512(%rsp)
+ vmovdqa64 %zmm0,576(%rsp)
+ vmovdqa64 %zmm0,640(%rsp)
+ vmovdqa64 %zmm0,704(%rsp)
+.Lskip_hkeys_cleanup_cdrboBdzwmggbeq:
+ vzeroupper
+ leaq (%rbp),%rsp
+.cfi_def_cfa_register %rsp
+ popq %r15
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r15
+ popq %r14
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r14
+ popq %r13
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r13
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ popq %rbp
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbp
+ popq %rbx
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbx
+ .byte 0xf3,0xc3
+.Ldecrypt_seh_end:
+.cfi_endproc
+.size ossl_aes_gcm_decrypt_avx512, .-ossl_aes_gcm_decrypt_avx512
+.globl ossl_aes_gcm_finalize_avx512
+.type ossl_aes_gcm_finalize_avx512,@function
+.align 32
+ossl_aes_gcm_finalize_avx512:
+.cfi_startproc
+.byte 243,15,30,250
+ vmovdqu 336(%rdi),%xmm2
+ vmovdqu 32(%rdi),%xmm3
+ vmovdqu 64(%rdi),%xmm4
+
+
+ cmpq $0,%rsi
+ je .L_partial_done_sAyBcyeiDCmpxul
+
+ vpclmulqdq $0x11,%xmm2,%xmm4,%xmm0
+ vpclmulqdq $0x00,%xmm2,%xmm4,%xmm16
+ vpclmulqdq $0x01,%xmm2,%xmm4,%xmm17
+ vpclmulqdq $0x10,%xmm2,%xmm4,%xmm4
+ vpxorq %xmm17,%xmm4,%xmm4
+
+ vpsrldq $8,%xmm4,%xmm17
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm17,%xmm0,%xmm0
+ vpxorq %xmm16,%xmm4,%xmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%xmm17
+
+ vpclmulqdq $0x01,%xmm4,%xmm17,%xmm16
+ vpslldq $8,%xmm16,%xmm16
+ vpxorq %xmm16,%xmm4,%xmm4
+
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm17,%xmm16
+ vpsrldq $4,%xmm16,%xmm16
+ vpclmulqdq $0x10,%xmm4,%xmm17,%xmm4
+ vpslldq $4,%xmm4,%xmm4
+
+ vpternlogq $0x96,%xmm16,%xmm0,%xmm4
+
+.L_partial_done_sAyBcyeiDCmpxul:
+ vmovq 56(%rdi),%xmm5
+ vpinsrq $1,48(%rdi),%xmm5,%xmm5
+ vpsllq $3,%xmm5,%xmm5
+
+ vpxor %xmm5,%xmm4,%xmm4
+
+ vpclmulqdq $0x11,%xmm2,%xmm4,%xmm0
+ vpclmulqdq $0x00,%xmm2,%xmm4,%xmm16
+ vpclmulqdq $0x01,%xmm2,%xmm4,%xmm17
+ vpclmulqdq $0x10,%xmm2,%xmm4,%xmm4
+ vpxorq %xmm17,%xmm4,%xmm4
+
+ vpsrldq $8,%xmm4,%xmm17
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm17,%xmm0,%xmm0
+ vpxorq %xmm16,%xmm4,%xmm4
+
+
+
+ vmovdqu64 POLY2(%rip),%xmm17
+
+ vpclmulqdq $0x01,%xmm4,%xmm17,%xmm16
+ vpslldq $8,%xmm16,%xmm16
+ vpxorq %xmm16,%xmm4,%xmm4
+
+
+
+ vpclmulqdq $0x00,%xmm4,%xmm17,%xmm16
+ vpsrldq $4,%xmm16,%xmm16
+ vpclmulqdq $0x10,%xmm4,%xmm17,%xmm4
+ vpslldq $4,%xmm4,%xmm4
+
+ vpternlogq $0x96,%xmm16,%xmm0,%xmm4
+
+ vpshufb SHUF_MASK(%rip),%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+.L_return_T_sAyBcyeiDCmpxul:
+ vmovdqu %xmm3,64(%rdi)
+.Labort_finalize:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ossl_aes_gcm_finalize_avx512, .-ossl_aes_gcm_finalize_avx512
+.globl ossl_gcm_gmult_avx512
+.hidden ossl_gcm_gmult_avx512
+.type ossl_gcm_gmult_avx512,@function
+.align 32
+ossl_gcm_gmult_avx512:
+.cfi_startproc
+.byte 243,15,30,250
+ vmovdqu64 (%rdi),%xmm1
+ vmovdqu64 336(%rsi),%xmm2
+
+ vpclmulqdq $0x11,%xmm2,%xmm1,%xmm3
+ vpclmulqdq $0x00,%xmm2,%xmm1,%xmm4
+ vpclmulqdq $0x01,%xmm2,%xmm1,%xmm5
+ vpclmulqdq $0x10,%xmm2,%xmm1,%xmm1
+ vpxorq %xmm5,%xmm1,%xmm1
+
+ vpsrldq $8,%xmm1,%xmm5
+ vpslldq $8,%xmm1,%xmm1
+ vpxorq %xmm5,%xmm3,%xmm3
+ vpxorq %xmm4,%xmm1,%xmm1
+
+
+
+ vmovdqu64 POLY2(%rip),%xmm5
+
+ vpclmulqdq $0x01,%xmm1,%xmm5,%xmm4
+ vpslldq $8,%xmm4,%xmm4
+ vpxorq %xmm4,%xmm1,%xmm1
+
+
+
+ vpclmulqdq $0x00,%xmm1,%xmm5,%xmm4
+ vpsrldq $4,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm1,%xmm5,%xmm1
+ vpslldq $4,%xmm1,%xmm1
+
+ vpternlogq $0x96,%xmm4,%xmm3,%xmm1
+
+ vmovdqu64 %xmm1,(%rdi)
+ vzeroupper
+.Labort_gmult:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ossl_gcm_gmult_avx512, .-ossl_gcm_gmult_avx512
+.data
+.align 16
+POLY:.quad 0x0000000000000001, 0xC200000000000000
+
+.align 64
+POLY2:
+.quad 0x00000001C2000000, 0xC200000000000000
+.quad 0x00000001C2000000, 0xC200000000000000
+.quad 0x00000001C2000000, 0xC200000000000000
+.quad 0x00000001C2000000, 0xC200000000000000
+
+.align 16
+TWOONE:.quad 0x0000000000000001, 0x0000000100000000
+
+
+
+.align 64
+SHUF_MASK:
+.quad 0x08090A0B0C0D0E0F, 0x0001020304050607
+.quad 0x08090A0B0C0D0E0F, 0x0001020304050607
+.quad 0x08090A0B0C0D0E0F, 0x0001020304050607
+.quad 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+.align 16
+SHIFT_MASK:
+.quad 0x0706050403020100, 0x0f0e0d0c0b0a0908
+
+ALL_F:
+.quad 0xffffffffffffffff, 0xffffffffffffffff
+
+ZERO:
+.quad 0x0000000000000000, 0x0000000000000000
+
+.align 16
+ONE:
+.quad 0x0000000000000001, 0x0000000000000000
+
+.align 16
+ONEf:
+.quad 0x0000000000000000, 0x0100000000000000
+
+.align 64
+ddq_add_1234:
+.quad 0x0000000000000001, 0x0000000000000000
+.quad 0x0000000000000002, 0x0000000000000000
+.quad 0x0000000000000003, 0x0000000000000000
+.quad 0x0000000000000004, 0x0000000000000000
+
+.align 64
+ddq_add_5678:
+.quad 0x0000000000000005, 0x0000000000000000
+.quad 0x0000000000000006, 0x0000000000000000
+.quad 0x0000000000000007, 0x0000000000000000
+.quad 0x0000000000000008, 0x0000000000000000
+
+.align 64
+ddq_add_4444:
+.quad 0x0000000000000004, 0x0000000000000000
+.quad 0x0000000000000004, 0x0000000000000000
+.quad 0x0000000000000004, 0x0000000000000000
+.quad 0x0000000000000004, 0x0000000000000000
+
+.align 64
+ddq_add_8888:
+.quad 0x0000000000000008, 0x0000000000000000
+.quad 0x0000000000000008, 0x0000000000000000
+.quad 0x0000000000000008, 0x0000000000000000
+.quad 0x0000000000000008, 0x0000000000000000
+
+.align 64
+ddq_addbe_1234:
+.quad 0x0000000000000000, 0x0100000000000000
+.quad 0x0000000000000000, 0x0200000000000000
+.quad 0x0000000000000000, 0x0300000000000000
+.quad 0x0000000000000000, 0x0400000000000000
+
+.align 64
+ddq_addbe_4444:
+.quad 0x0000000000000000, 0x0400000000000000
+.quad 0x0000000000000000, 0x0400000000000000
+.quad 0x0000000000000000, 0x0400000000000000
+.quad 0x0000000000000000, 0x0400000000000000
+
+.align 64
+byte_len_to_mask_table:
+.value 0x0000, 0x0001, 0x0003, 0x0007
+.value 0x000f, 0x001f, 0x003f, 0x007f
+.value 0x00ff, 0x01ff, 0x03ff, 0x07ff
+.value 0x0fff, 0x1fff, 0x3fff, 0x7fff
+.value 0xffff
+
+.align 64
+byte64_len_to_mask_table:
+.quad 0x0000000000000000, 0x0000000000000001
+.quad 0x0000000000000003, 0x0000000000000007
+.quad 0x000000000000000f, 0x000000000000001f
+.quad 0x000000000000003f, 0x000000000000007f
+.quad 0x00000000000000ff, 0x00000000000001ff
+.quad 0x00000000000003ff, 0x00000000000007ff
+.quad 0x0000000000000fff, 0x0000000000001fff
+.quad 0x0000000000003fff, 0x0000000000007fff
+.quad 0x000000000000ffff, 0x000000000001ffff
+.quad 0x000000000003ffff, 0x000000000007ffff
+.quad 0x00000000000fffff, 0x00000000001fffff
+.quad 0x00000000003fffff, 0x00000000007fffff
+.quad 0x0000000000ffffff, 0x0000000001ffffff
+.quad 0x0000000003ffffff, 0x0000000007ffffff
+.quad 0x000000000fffffff, 0x000000001fffffff
+.quad 0x000000003fffffff, 0x000000007fffffff
+.quad 0x00000000ffffffff, 0x00000001ffffffff
+.quad 0x00000003ffffffff, 0x00000007ffffffff
+.quad 0x0000000fffffffff, 0x0000001fffffffff
+.quad 0x0000003fffffffff, 0x0000007fffffffff
+.quad 0x000000ffffffffff, 0x000001ffffffffff
+.quad 0x000003ffffffffff, 0x000007ffffffffff
+.quad 0x00000fffffffffff, 0x00001fffffffffff
+.quad 0x00003fffffffffff, 0x00007fffffffffff
+.quad 0x0000ffffffffffff, 0x0001ffffffffffff
+.quad 0x0003ffffffffffff, 0x0007ffffffffffff
+.quad 0x000fffffffffffff, 0x001fffffffffffff
+.quad 0x003fffffffffffff, 0x007fffffffffffff
+.quad 0x00ffffffffffffff, 0x01ffffffffffffff
+.quad 0x03ffffffffffffff, 0x07ffffffffffffff
+.quad 0x0fffffffffffffff, 0x1fffffffffffffff
+.quad 0x3fffffffffffffff, 0x7fffffffffffffff
+.quad 0xffffffffffffffff
+ .section ".note.gnu.property", "a"
+ .p2align 3
+ .long 1f - 0f
+ .long 4f - 1f
+ .long 5
+0:
+ # "GNU" encoded with .byte, since .asciz isn't supported
+ # on Solaris.
+ .byte 0x47
+ .byte 0x4e
+ .byte 0x55
+ .byte 0
+1:
+ .p2align 3
+ .long 0xc0000002
+ .long 3f - 2f
+2:
+ .long 3
+3:
+ .p2align 3
+4:
diff --git a/sys/crypto/openssl/amd64/ossl_aes_gcm.c b/sys/crypto/openssl/amd64/ossl_aes_gcm.c
new file mode 100644
index 000000000000..3381d35557f2
--- /dev/null
+++ b/sys/crypto/openssl/amd64/ossl_aes_gcm.c
@@ -0,0 +1,233 @@
+/*
+ * Copyright 2010-2022 The OpenSSL Project Authors. All Rights Reserved.
+ * Copyright (c) 2021, Intel Corporation. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * This file contains an AES-GCM wrapper implementation from OpenSSL 3.1,
+ * targeting amd64 VAES extensions. This was ported from
+ * cipher_aes_gcm_hw_vaes_avx512.inc.
+ */
+
+#include <sys/endian.h>
+#include <sys/systm.h>
+
+#include <crypto/openssl/ossl.h>
+#include <crypto/openssl/ossl_aes_gcm.h>
+#include <crypto/openssl/ossl_cipher.h>
+
+#include <opencrypto/cryptodev.h>
+
+_Static_assert(
+ sizeof(struct ossl_gcm_context) <= sizeof(struct ossl_cipher_context),
+ "ossl_gcm_context too large");
+
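+/* AES key schedule setup, provided by the OpenSSL-derived AES-NI assembly. */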
+void aesni_set_encrypt_key(const void *key, int bits, void *ctx);
+
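+/* Expand the AES key schedule and let the backend derive its GHASH state. */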
+static void
+gcm_init(struct ossl_gcm_context *ctx, const void *key, size_t keylen)
+{
+ KASSERT(keylen == 128 || keylen == 192 || keylen == 256,
+ ("%s: invalid key length %zu", __func__, keylen));
+
+ memset(&ctx->gcm, 0, sizeof(ctx->gcm));
+ memset(&ctx->aes_ks, 0, sizeof(ctx->aes_ks));
+ aesni_set_encrypt_key(key, keylen, &ctx->aes_ks);
+ ctx->ops->init(ctx, key, keylen);
+}
+
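+/* Finish any pending GHASH computation and copy out the computed tag. */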
+static void
+gcm_tag(struct ossl_gcm_context *ctx, unsigned char *tag, size_t len)
+{
+ (void)ctx->ops->finish(ctx, NULL, 0);
+ memcpy(tag, ctx->gcm.Xi.c, len);
+}
+
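+/* Entry points implemented in amd64/aes-gcm-avx512.S. */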
+void ossl_gcm_gmult_avx512(uint64_t Xi[2], void *gcm128ctx);
+void ossl_aes_gcm_init_avx512(const void *ks, void *gcm128ctx);
+void ossl_aes_gcm_setiv_avx512(const void *ks, void *gcm128ctx,
+ const unsigned char *iv, size_t ivlen);
+void ossl_aes_gcm_update_aad_avx512(void *gcm128ctx, const unsigned char *aad,
+ size_t len);
+void ossl_aes_gcm_encrypt_avx512(const void *ks, void *gcm128ctx,
+ unsigned int *pblocklen, const unsigned char *in, size_t len,
+ unsigned char *out);
+void ossl_aes_gcm_decrypt_avx512(const void *ks, void *gcm128ctx,
+ unsigned int *pblocklen, const unsigned char *in, size_t len,
+ unsigned char *out);
+void ossl_aes_gcm_finalize_avx512(void *gcm128ctx, unsigned int pblocklen);
+
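+/*
+ * Derive the GHASH hash key H = E_K(0^128) from the AES key schedule and
+ * precompute the powers of H consumed by the AVX512 GHASH routines.
+ */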
+static void
+gcm_init_avx512(struct ossl_gcm_context *ctx, const void *key, size_t keylen)
+{
+ ossl_aes_gcm_init_avx512(&ctx->aes_ks, &ctx->gcm);
+}
+
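+/*
+ * Reset the per-message GCM state and load the 96-bit IV into the initial
+ * counter block.
+ */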
+static void
+gcm_setiv_avx512(struct ossl_gcm_context *ctx, const unsigned char *iv,
+ size_t len)
+{
+ KASSERT(len == AES_GCM_IV_LEN,
+ ("%s: invalid IV length %zu", __func__, len));
+
+ ctx->gcm.Yi.u[0] = 0; /* Current counter */
+ ctx->gcm.Yi.u[1] = 0;
+ ctx->gcm.Xi.u[0] = 0; /* AAD hash */
+ ctx->gcm.Xi.u[1] = 0;
+ ctx->gcm.len.u[0] = 0; /* AAD length */
+ ctx->gcm.len.u[1] = 0; /* Message length */
+ ctx->gcm.ares = 0;
+ ctx->gcm.mres = 0;
+
+ ossl_aes_gcm_setiv_avx512(&ctx->aes_ks, ctx, iv, len);
+}
+
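+/*
+ * Absorb AAD into the GHASH state: complete any buffered partial block,
+ * hash full blocks with the AVX512 routine and buffer the remainder.
+ */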
+static int
+gcm_aad_avx512(struct ossl_gcm_context *ctx, const unsigned char *aad,
+ size_t len)
+{
+ uint64_t alen = ctx->gcm.len.u[0];
+ size_t lenblks;
+ unsigned int ares;
+
+ /* Bad sequence: AAD update called after message processing has begun */
+ if (ctx->gcm.len.u[1])
+ return -2;
+
+ alen += len;
+ /* AAD is limited to 2^64 bits, thus 2^61 bytes */
+ if (alen > (1ull << 61) || (sizeof(len) == 8 && alen < len))
+ return -1;
+ ctx->gcm.len.u[0] = alen;
+
+ ares = ctx->gcm.ares;
+ /* Partial AAD block left from previous AAD update calls */
+ if (ares > 0) {
+ /*
+ * Fill the partial block buffer until a full block is gathered
+ * (note, the hash is stored reflected)
+ */
+ while (ares > 0 && len > 0) {
+ ctx->gcm.Xi.c[15 - ares] ^= *(aad++);
+ --len;
+ ares = (ares + 1) % AES_BLOCK_LEN;
+ }
+ /* Full block gathered */
+ if (ares == 0) {
+ ossl_gcm_gmult_avx512(ctx->gcm.Xi.u, ctx);
+ } else { /* no more AAD */
+ ctx->gcm.ares = ares;
+ return 0;
+ }
+ }
+
+ /* Bulk AAD processing */
+ lenblks = len & ((size_t)(-AES_BLOCK_LEN));
+ if (lenblks > 0) {
+ ossl_aes_gcm_update_aad_avx512(ctx, aad, lenblks);
+ aad += lenblks;
+ len -= lenblks;
+ }
+
+ /* Add remaining AAD to the hash (note, the hash is stored reflected) */
+ if (len > 0) {
+ ares = (unsigned int)len;
+ for (size_t i = 0; i < len; ++i)
+ ctx->gcm.Xi.c[15 - i] ^= aad[i];
+ }
+
+ ctx->gcm.ares = ares;
+
+ return 0;
+}
+
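+/* Common bulk payload path; "encrypt" selects the transform direction. */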
+static int
+_gcm_encrypt_avx512(struct ossl_gcm_context *ctx, const unsigned char *in,
+ unsigned char *out, size_t len, bool encrypt)
+{
+ uint64_t mlen = ctx->gcm.len.u[1];
+
+ mlen += len;
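+ /* The payload is limited to 2^39 - 256 bits, thus 2^36 - 32 bytes */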
+ if (mlen > ((1ull << 36) - 32) || (sizeof(len) == 8 && mlen < len))
+ return -1;
+
+ ctx->gcm.len.u[1] = mlen;
+
+ /* Finalize GHASH(AAD) if a partial AAD block was left unprocessed */
+ if (ctx->gcm.ares > 0) {
+ ossl_gcm_gmult_avx512(ctx->gcm.Xi.u, ctx);
+ ctx->gcm.ares = 0;
+ }
+
+ if (encrypt) {
+ ossl_aes_gcm_encrypt_avx512(&ctx->aes_ks, ctx, &ctx->gcm.mres,
+ in, len, out);
+ } else {
+ ossl_aes_gcm_decrypt_avx512(&ctx->aes_ks, ctx, &ctx->gcm.mres,
+ in, len, out);
+ }
+
+ return 0;
+}
+
+static int
+gcm_encrypt_avx512(struct ossl_gcm_context *ctx, const unsigned char *in,
+ unsigned char *out, size_t len)
+{
+ return _gcm_encrypt_avx512(ctx, in, out, len, true);
+}
+
+static int
+gcm_decrypt_avx512(struct ossl_gcm_context *ctx, const unsigned char *in,
+ unsigned char *out, size_t len)
+{
+ return _gcm_encrypt_avx512(ctx, in, out, len, false);
+}
+
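+/*
+ * Complete GHASH over the AAD and payload lengths to produce the final tag;
+ * when a tag is supplied (i.e., on decryption), compare it in constant time.
+ */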
+static int
+gcm_finish_avx512(struct ossl_gcm_context *ctx, const unsigned char *tag,
+ size_t len)
+{
+ unsigned int *res = &ctx->gcm.mres;
+
+ /* Finalize AAD processing */
+ if (ctx->gcm.ares > 0)
+ res = &ctx->gcm.ares;
+
+ ossl_aes_gcm_finalize_avx512(ctx, *res);
+
+ ctx->gcm.ares = ctx->gcm.mres = 0;
+
+ if (tag != NULL)
+ return timingsafe_bcmp(ctx->gcm.Xi.c, tag, len);
+ return 0;
+}
+
+static const struct ossl_aes_gcm_ops gcm_ops_avx512 = {
+ .init = gcm_init_avx512,
+ .setiv = gcm_setiv_avx512,
+ .aad = gcm_aad_avx512,
+ .encrypt = gcm_encrypt_avx512,
+ .decrypt = gcm_decrypt_avx512,
+ .finish = gcm_finish_avx512,
+ .tag = gcm_tag,
+};
+
+int ossl_aes_gcm_setkey_avx512(const unsigned char *key, int klen, void *_ctx);
+
+int
+ossl_aes_gcm_setkey_avx512(const unsigned char *key, int klen,
+ void *_ctx)
+{
+ struct ossl_gcm_context *ctx;
+
+ ctx = _ctx;
+ ctx->ops = &gcm_ops_avx512;
+ gcm_init(ctx, key, klen);
+ return (0);
+}
diff --git a/sys/crypto/openssl/ossl.c b/sys/crypto/openssl/ossl.c
index 9c3465b264b7..723d1a80543d 100644
--- a/sys/crypto/openssl/ossl.c
+++ b/sys/crypto/openssl/ossl.c
@@ -78,6 +78,8 @@ ossl_attach(device_t dev)
sc = device_get_softc(dev);
+ sc->has_aes = sc->has_aes_gcm = false;
+
ossl_cpuid(sc);
sc->sc_cid = crypto_get_driverid(dev, sizeof(struct ossl_session),
CRYPTOCAP_F_SOFTWARE | CRYPTOCAP_F_SYNC |
@@ -144,6 +146,16 @@ ossl_lookup_cipher(const struct crypto_session_params *csp)
return (NULL);
}
return (&ossl_cipher_aes_cbc);
+ case CRYPTO_AES_NIST_GCM_16:
+ switch (csp->csp_cipher_klen * 8) {
+ case 128:
+ case 192:
+ case 256:
+ break;
+ default:
+ return (NULL);
+ }
+ return (&ossl_cipher_aes_gcm);
case CRYPTO_CHACHA20:
if (csp->csp_cipher_klen != CHACHA_KEY_SIZE)
return (NULL);
@@ -183,6 +195,15 @@ ossl_probesession(device_t dev, const struct crypto_session_params *csp)
switch (csp->csp_cipher_alg) {
case CRYPTO_CHACHA20_POLY1305:
break;
+ case CRYPTO_AES_NIST_GCM_16:
+ if (!sc->has_aes_gcm || ossl_lookup_cipher(csp) == NULL)
+ return (EINVAL);
+ if (csp->csp_ivlen != AES_GCM_IV_LEN)
+ return (EINVAL);
+ if (csp->csp_auth_mlen != 0 &&
+ csp->csp_auth_mlen != GMAC_DIGEST_LEN)
+ return (EINVAL);
+ break;
default:
return (EINVAL);
}
@@ -279,6 +300,11 @@ ossl_newsession(device_t dev, crypto_session_t cses,
ossl_newsession_hash(s, csp);
error = ossl_newsession_cipher(s, csp);
break;
+ case CSP_MODE_AEAD:
+ error = ossl_newsession_cipher(s, csp);
+ break;
+ default:
+ __assert_unreachable();
}
return (error);
@@ -353,6 +379,13 @@ out:
}
static int
+ossl_process_cipher(struct ossl_session *s, struct cryptop *crp,
+ const struct crypto_session_params *csp)
+{
+ return (s->cipher.cipher->process(&s->cipher, crp, csp));
+}
+
+static int
ossl_process_eta(struct ossl_session *s, struct cryptop *crp,
const struct crypto_session_params *csp)
{
@@ -372,6 +405,20 @@ ossl_process_eta(struct ossl_session *s, struct cryptop *crp,
}
static int
+ossl_process_aead(struct ossl_session *s, struct cryptop *crp,
+ const struct crypto_session_params *csp)
+{
+ if (csp->csp_cipher_alg == CRYPTO_CHACHA20_POLY1305) {
+ if (CRYPTO_OP_IS_ENCRYPT(crp->crp_op))
+ return (ossl_chacha20_poly1305_encrypt(crp, csp));
+ else
+ return (ossl_chacha20_poly1305_decrypt(crp, csp));
+ } else {
+ return (s->cipher.cipher->process(&s->cipher, crp, csp));
+ }
+}
+
+static int
ossl_process(device_t dev, struct cryptop *crp, int hint)
{
const struct crypto_session_params *csp;
@@ -394,16 +441,13 @@ ossl_process(device_t dev, struct cryptop *crp, int hint)
error = ossl_process_hash(s, crp, csp);
break;
case CSP_MODE_CIPHER:
- error = s->cipher.cipher->process(&s->cipher, crp, csp);
+ error = ossl_process_cipher(s, crp, csp);
break;
case CSP_MODE_ETA:
error = ossl_process_eta(s, crp, csp);
break;
case CSP_MODE_AEAD:
- if (CRYPTO_OP_IS_ENCRYPT(crp->crp_op))
- error = ossl_chacha20_poly1305_encrypt(crp, csp);
- else
- error = ossl_chacha20_poly1305_decrypt(crp, csp);
+ error = ossl_process_aead(s, crp, csp);
break;
default:
__assert_unreachable();
diff --git a/sys/crypto/openssl/ossl.h b/sys/crypto/openssl/ossl.h
index 4f5353818add..3b9313251cff 100644
--- a/sys/crypto/openssl/ossl.h
+++ b/sys/crypto/openssl/ossl.h
@@ -48,15 +48,16 @@ void ossl_cpuid(struct ossl_softc *sc);
struct ossl_softc {
int32_t sc_cid;
bool has_aes;
+ bool has_aes_gcm;
};
/* Needs to be big enough to hold any hash context. */
struct ossl_hash_context {
- uint32_t dummy[61];
+ uint32_t dummy[196];
} __aligned(32);
struct ossl_cipher_context {
- uint32_t dummy[61];
+ uint32_t dummy[196];
} __aligned(32);
struct ossl_session_hash {
@@ -85,6 +86,7 @@ extern struct auth_hash ossl_hash_sha384;
extern struct auth_hash ossl_hash_sha512;
extern struct ossl_cipher ossl_cipher_aes_cbc;
+extern struct ossl_cipher ossl_cipher_aes_gcm;
extern struct ossl_cipher ossl_cipher_chacha20;
#endif /* !__OSSL_H__ */
diff --git a/sys/crypto/openssl/ossl_aes.c b/sys/crypto/openssl/ossl_aes.c
index 382fa80cc56b..93d3ac3f2a99 100644
--- a/sys/crypto/openssl/ossl_aes.c
+++ b/sys/crypto/openssl/ossl_aes.c
@@ -32,8 +32,10 @@ __FBSDID("$FreeBSD$");
#include <sys/malloc.h>
#include <opencrypto/cryptodev.h>
+#include <opencrypto/gmac.h>
#include <crypto/openssl/ossl.h>
+#include <crypto/openssl/ossl_aes_gcm.h>
#include <crypto/openssl/ossl_cipher.h>
#if defined(__amd64__) || defined(__i386__)
@@ -43,6 +45,7 @@ __FBSDID("$FreeBSD$");
#endif
static ossl_cipher_process_t ossl_aes_cbc;
+static ossl_cipher_process_t ossl_aes_gcm;
struct ossl_cipher ossl_cipher_aes_cbc = {
.type = CRYPTO_AES_CBC,
@@ -55,6 +58,17 @@ struct ossl_cipher ossl_cipher_aes_cbc = {
.process = ossl_aes_cbc
};
+struct ossl_cipher ossl_cipher_aes_gcm = {
+ .type = CRYPTO_AES_NIST_GCM_16,
+ .blocksize = 1,
+ .ivsize = AES_GCM_IV_LEN,
+
+ /* Filled during initialization based on CPU caps. */
+ .set_encrypt_key = NULL,
+ .set_decrypt_key = NULL,
+ .process = ossl_aes_gcm,
+};
+
static int
ossl_aes_cbc(struct ossl_session_cipher *s, struct cryptop *crp,
const struct crypto_session_params *csp)
@@ -151,3 +165,92 @@ ossl_aes_cbc(struct ossl_session_cipher *s, struct cryptop *crp,
explicit_bzero(&key, sizeof(key));
return (0);
}
+
+static int
+ossl_aes_gcm(struct ossl_session_cipher *s, struct cryptop *crp,
+ const struct crypto_session_params *csp)
+{
+ struct ossl_cipher_context key;
+ struct crypto_buffer_cursor cc_in, cc_out;
+ unsigned char iv[AES_BLOCK_LEN], tag[AES_BLOCK_LEN];
+ struct ossl_gcm_context *ctx;
+ const unsigned char *inseg;
+ unsigned char *outseg;
+ size_t inlen, outlen, seglen;
+ int error;
+ bool encrypt;
+
+ encrypt = CRYPTO_OP_IS_ENCRYPT(crp->crp_op);
+
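+	/*
+	 * A request-supplied key overrides the session key; expand it into a
+	 * temporary context.  Otherwise use the key schedule prepared when
+	 * the session was created.
+	 */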
+ if (crp->crp_cipher_key != NULL) {
+ if (encrypt)
+ error = s->cipher->set_encrypt_key(crp->crp_cipher_key,
+ 8 * csp->csp_cipher_klen, &key);
+ else
+ error = s->cipher->set_decrypt_key(crp->crp_cipher_key,
+ 8 * csp->csp_cipher_klen, &key);
+ if (error)
+ return (error);
+ ctx = (struct ossl_gcm_context *)&key;
+ } else if (encrypt) {
+ ctx = (struct ossl_gcm_context *)&s->enc_ctx;
+ } else {
+ ctx = (struct ossl_gcm_context *)&s->dec_ctx;
+ }
+
+ crypto_read_iv(crp, iv);
+ ctx->ops->setiv(ctx, iv, csp->csp_ivlen);
+
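+	/* Feed the AAD to the GCM state one contiguous segment at a time. */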
+ crypto_cursor_init(&cc_in, &crp->crp_buf);
+ crypto_cursor_advance(&cc_in, crp->crp_aad_start);
+ for (size_t alen = crp->crp_aad_length; alen > 0; alen -= seglen) {
+ inseg = crypto_cursor_segment(&cc_in, &inlen);
+ seglen = MIN(alen, inlen);
+ if (ctx->ops->aad(ctx, inseg, seglen) != 0)
+ return (EINVAL);
+ crypto_cursor_advance(&cc_in, seglen);
+ }
+
+ crypto_cursor_init(&cc_in, &crp->crp_buf);
+ crypto_cursor_advance(&cc_in, crp->crp_payload_start);
+ if (CRYPTO_HAS_OUTPUT_BUFFER(crp)) {
+ crypto_cursor_init(&cc_out, &crp->crp_obuf);
+ crypto_cursor_advance(&cc_out, crp->crp_payload_output_start);
+ } else {
+ cc_out = cc_in;
+ }
+
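+	/*
+	 * Encrypt or decrypt the payload, processing at most one contiguous
+	 * input and output segment per iteration.
+	 */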
+ for (size_t plen = crp->crp_payload_length; plen > 0; plen -= seglen) {
+ inseg = crypto_cursor_segment(&cc_in, &inlen);
+ outseg = crypto_cursor_segment(&cc_out, &outlen);
+ seglen = MIN(plen, MIN(inlen, outlen));
+
+ if (encrypt) {
+ if (ctx->ops->encrypt(ctx, inseg, outseg, seglen) != 0)
+ return (EINVAL);
+ } else {
+ if (ctx->ops->decrypt(ctx, inseg, outseg, seglen) != 0)
+ return (EINVAL);
+ }
+
+ crypto_cursor_advance(&cc_in, seglen);
+ crypto_cursor_advance(&cc_out, seglen);
+ }
+
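+	/*
+	 * When encrypting, emit the computed tag; when decrypting, verify it
+	 * against the tag supplied with the request.
+	 */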
+ error = 0;
+ if (encrypt) {
+ ctx->ops->tag(ctx, tag, GMAC_DIGEST_LEN);
+ crypto_copyback(crp, crp->crp_digest_start, GMAC_DIGEST_LEN,
+ tag);
+ } else {
+ crypto_copydata(crp, crp->crp_digest_start, GMAC_DIGEST_LEN,
+ tag);
+ if (ctx->ops->finish(ctx, tag, GMAC_DIGEST_LEN) != 0)
+ error = EBADMSG;
+ }
+
+ explicit_bzero(iv, sizeof(iv));
+ explicit_bzero(tag, sizeof(tag));
+
+ return (error);
+}
diff --git a/sys/crypto/openssl/ossl_aes_gcm.h b/sys/crypto/openssl/ossl_aes_gcm.h
new file mode 100644
index 000000000000..9ce8ee193483
--- /dev/null
+++ b/sys/crypto/openssl/ossl_aes_gcm.h
@@ -0,0 +1,71 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2023 Stormshield
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _OSSL_AES_GCM_H_
+#define _OSSL_AES_GCM_H_
+
+#include <crypto/openssl/ossl_cipher.h>
+#include <crypto/rijndael/rijndael.h>
+
+struct ossl_gcm_context;
+
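+/*
+ * Machine-dependent AES-GCM entry points, invoked by the generic OCF glue
+ * in ossl_aes.c.
+ */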
+struct ossl_aes_gcm_ops {
+ void (*init)(struct ossl_gcm_context *ctx, const void *key,
+ size_t keylen);
+ void (*setiv)(struct ossl_gcm_context *ctx, const unsigned char *iv,
+ size_t ivlen);
+ int (*aad)(struct ossl_gcm_context *ctx, const unsigned char *aad,
+ size_t len);
+ int (*encrypt)(struct ossl_gcm_context *ctx, const unsigned char *in,
+ unsigned char *out, size_t len);
+ int (*decrypt)(struct ossl_gcm_context *ctx, const unsigned char *in,
+ unsigned char *out, size_t len);
+ int (*finish)(struct ossl_gcm_context *ctx, const unsigned char *tag,
+ size_t len);
+ void (*tag)(struct ossl_gcm_context *ctx, unsigned char *tag,
+ size_t len);
+};
+
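+/*
+ * The layout of the "gcm" member follows OpenSSL's gcm128_context so that
+ * the generated assembly routines can operate on it directly.
+ */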
+struct ossl_gcm_context {
+ struct {
+ union {
+ uint64_t u[2];
+ uint32_t d[4];
+ uint8_t c[16];
+ } Yi, EKi, EK0, len, Xi, H;
+ __uint128_t Htable[16];
+ unsigned int mres, ares;
+ } gcm;
+
+ struct {
+ uint32_t ks[4 * (RIJNDAEL_MAXNR + 1)];
+ int rounds;
+ } aes_ks;
+
+ const struct ossl_aes_gcm_ops *ops;
+};
+
+#endif /* !_OSSL_AES_GCM_H_ */
diff --git a/sys/crypto/openssl/ossl_x86.c b/sys/crypto/openssl/ossl_x86.c
index 75598d821506..594aee2ab97f 100644
--- a/sys/crypto/openssl/ossl_x86.c
+++ b/sys/crypto/openssl/ossl_x86.c
@@ -39,6 +39,7 @@
#include <x86/specialreg.h>
#include <crypto/openssl/ossl.h>
+#include <crypto/openssl/ossl_aes_gcm.h>
#include <crypto/openssl/ossl_cipher.h>
/*
@@ -55,6 +56,11 @@ unsigned int OPENSSL_ia32cap_P[4];
ossl_cipher_setkey_t aesni_set_encrypt_key;
ossl_cipher_setkey_t aesni_set_decrypt_key;
+#ifdef __amd64__
+int ossl_vaes_vpclmulqdq_capable(void);
+ossl_cipher_setkey_t ossl_aes_gcm_setkey_avx512;
+#endif
+
void
ossl_cpuid(struct ossl_softc *sc)
{
@@ -119,11 +125,24 @@ ossl_cpuid(struct ossl_softc *sc)
}
OPENSSL_ia32cap_P[3] = cpu_stdext_feature2;
- if (!AESNI_CAPABLE) {
- sc->has_aes = false;
+ if (!AESNI_CAPABLE)
return;
- }
+
sc->has_aes = true;
ossl_cipher_aes_cbc.set_encrypt_key = aesni_set_encrypt_key;
ossl_cipher_aes_cbc.set_decrypt_key = aesni_set_decrypt_key;
+
+#ifdef __amd64__
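+	/*
+	 * The AVX512 AES-GCM implementation additionally requires the VAES
+	 * and VPCLMULQDQ extensions.
+	 */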
+ if (ossl_vaes_vpclmulqdq_capable()) {
+ ossl_cipher_aes_gcm.set_encrypt_key =
+ ossl_aes_gcm_setkey_avx512;
+ ossl_cipher_aes_gcm.set_decrypt_key =
+ ossl_aes_gcm_setkey_avx512;
+ sc->has_aes_gcm = true;
+ } else {
+ sc->has_aes_gcm = false;
+ }
+#else
+ sc->has_aes_gcm = false;
+#endif
}
diff --git a/sys/modules/ossl/Makefile b/sys/modules/ossl/Makefile
index 765e70a03edd..d56fef428494 100644
--- a/sys/modules/ossl/Makefile
+++ b/sys/modules/ossl/Makefile
@@ -27,12 +27,14 @@ SRCS.aarch64= \
ossl_aarch64.c
SRCS.amd64= \
+ aes-gcm-avx512.S \
aesni-x86_64.S \
chacha-x86_64.S \
poly1305-x86_64.S \
sha1-x86_64.S \
sha256-x86_64.S \
sha512-x86_64.S \
+ ossl_aes_gcm.c \
ossl_x86.c
SRCS.i386= \