author     Jung-uk Kim <jkim@FreeBSD.org>    2020-08-26 16:56:44 +0000
committer  Jung-uk Kim <jkim@FreeBSD.org>    2020-08-26 16:56:44 +0000
commit     3971092e119dd117e9e40f6b5955f54a2762dcf3 (patch)
tree       0bba9eb1e9bd17761c4e9bec210a13af4cbcca35
parent     63c1bb51629b1bdb150885c72bd297ff7d7f228a (diff)
Regen X86 assembly files after r364822.
Notes: svn path=/head/; revision=364823
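
The files below are not edited by hand: each carries the header comment "Do not modify. This file is auto-generated from <name>.pl", i.e. they are the output of OpenSSL's perlasm scripts, re-run after the change in r364822. As a rough illustration of how such a regeneration pass can be scripted, here is a minimal Python sketch; the source and output paths, the "elf" output flavour, and the plain `perl <script>.pl elf <out>.S` invocation are assumptions for illustration, not taken from this commit or from FreeBSD's actual make targets.

#!/usr/bin/env python3
"""Hypothetical regeneration helper (illustrative only).

Assumptions not taken from this commit: the perlasm sources live in an
OpenSSL tree under SRC_DIR, the generated files land in OUT_DIR, the ELF
flavour ("elf") matches the .type/.size directives seen in these .S files,
and `perl <script>.pl elf <out>.S` is enough to drive the scripts.
"""
import subprocess
from pathlib import Path

SRC_DIR = Path("crypto")                        # assumed OpenSSL perlasm location
OUT_DIR = Path("secure/lib/libcrypto/amd64")    # where the amd64 .S files live
FLAVOUR = "elf"                                 # assumed perlasm output flavour

# (perlasm script, generated file) pairs; only one shown as an example.
TARGETS = [
    (SRC_DIR / "modes/asm/aesni-gcm-x86_64.pl", OUT_DIR / "aesni-gcm-x86_64.S"),
]

def regen(script: Path, out_file: Path) -> None:
    """Run one perlasm script and write the generated assembly file."""
    out_file.parent.mkdir(parents=True, exist_ok=True)
    # e.g.: perl aesni-gcm-x86_64.pl elf secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S
    subprocess.run(["perl", str(script), FLAVOUR, str(out_file)], check=True)

if __name__ == "__main__":
    for script, out_file in TARGETS:
        regen(script, out_file)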
-rw-r--r--  secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S        784
-rw-r--r--  secure/lib/libcrypto/amd64/aesni-mb-x86_64.S         965
-rw-r--r--  secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S      1350
-rw-r--r--  secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S    4376
-rw-r--r--  secure/lib/libcrypto/amd64/chacha-x86_64.S          1026
-rw-r--r--  secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S    2055
-rw-r--r--  secure/lib/libcrypto/amd64/ghash-x86_64.S            475
-rw-r--r--  secure/lib/libcrypto/amd64/poly1305-x86_64.S        1785
-rw-r--r--  secure/lib/libcrypto/amd64/rsaz-avx2.S              1749
-rw-r--r--  secure/lib/libcrypto/amd64/rsaz-x86_64.S             664
-rw-r--r--  secure/lib/libcrypto/amd64/sha1-mb-x86_64.S         4315
-rw-r--r--  secure/lib/libcrypto/amd64/sha1-x86_64.S            2829
-rw-r--r--  secure/lib/libcrypto/amd64/sha256-mb-x86_64.S       4672
-rw-r--r--  secure/lib/libcrypto/amd64/sha256-x86_64.S          2369
-rw-r--r--  secure/lib/libcrypto/amd64/sha512-x86_64.S          3660
-rw-r--r--  secure/lib/libcrypto/amd64/x25519-x86_64.S           390
-rw-r--r--  secure/lib/libcrypto/amd64/x86_64-mont.S             380
-rw-r--r--  secure/lib/libcrypto/amd64/x86_64-mont5.S           1365
-rw-r--r--  secure/lib/libcrypto/i386/chacha-x86.S               960
-rw-r--r--  secure/lib/libcrypto/i386/poly1305-x86.S            1110
-rw-r--r--  secure/lib/libcrypto/i386/sha1-586.S                2350
-rw-r--r--  secure/lib/libcrypto/i386/sha256-586.S              4496
22 files changed, 44039 insertions, 86 deletions
diff --git a/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S b/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S
index 1cdcc86043b2..26e49f9b2979 100644
--- a/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S
+++ b/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S
@@ -2,20 +2,790 @@
/* Do not modify. This file is auto-generated from aesni-gcm-x86_64.pl. */
.text
-.globl aesni_gcm_encrypt
-.type aesni_gcm_encrypt,@function
-aesni_gcm_encrypt:
+.type _aesni_ctr32_ghash_6x,@function
+.align 32
+_aesni_ctr32_ghash_6x:
.cfi_startproc
- xorl %eax,%eax
+ vmovdqu 32(%r11),%xmm2
+ subq $6,%rdx
+ vpxor %xmm4,%xmm4,%xmm4
+ vmovdqu 0-128(%rcx),%xmm15
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovdqu %xmm4,16+8(%rsp)
+ jmp .Loop6x
+
+.align 32
+.Loop6x:
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm15,%xmm10,%xmm10
+ vpxor %xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32:
+ vmovdqu %xmm1,(%r8)
+ vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
+ vpxor %xmm15,%xmm12,%xmm12
+ vmovups 16-128(%rcx),%xmm2
+ vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
+ xorq %r12,%r12
+ cmpq %r14,%r15
+
+ vaesenc %xmm2,%xmm9,%xmm9
+ vmovdqu 48+8(%rsp),%xmm0
+ vpxor %xmm15,%xmm13,%xmm13
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
+ vaesenc %xmm2,%xmm10,%xmm10
+ vpxor %xmm15,%xmm14,%xmm14
+ setnc %r12b
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vmovdqu 16-32(%r9),%xmm3
+ negq %r12
+ vaesenc %xmm2,%xmm12,%xmm12
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
+ vpxor %xmm4,%xmm8,%xmm8
+ vaesenc %xmm2,%xmm13,%xmm13
+ vpxor %xmm5,%xmm1,%xmm4
+ andq $0x60,%r12
+ vmovups 32-128(%rcx),%xmm15
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
+ vaesenc %xmm2,%xmm14,%xmm14
+
+ vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
+ leaq (%r14,%r12,1),%r14
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
+ vmovdqu 64+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 88(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 80(%r14),%r12
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,32+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,40+8(%rsp)
+ vmovdqu 48-32(%r9),%xmm5
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 48-128(%rcx),%xmm15
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
+ vmovdqu 80+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqu 64-32(%r9),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 64-128(%rcx),%xmm15
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 72(%r14),%r13
+ vpxor %xmm5,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 64(%r14),%r12
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
+ vmovdqu 96+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,48+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,56+8(%rsp)
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 96-32(%r9),%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 80-128(%rcx),%xmm15
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 56(%r14),%r13
+ vpxor %xmm1,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
+ vpxor 112+8(%rsp),%xmm8,%xmm8
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 48(%r14),%r12
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,64+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,72+8(%rsp)
+ vpxor %xmm3,%xmm4,%xmm4
+ vmovdqu 112-32(%r9),%xmm3
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 96-128(%rcx),%xmm15
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 40(%r14),%r13
+ vpxor %xmm2,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 32(%r14),%r12
+ vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,80+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,88+8(%rsp)
+ vpxor %xmm5,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor %xmm1,%xmm6,%xmm6
+
+ vmovups 112-128(%rcx),%xmm15
+ vpslldq $8,%xmm6,%xmm5
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 16(%r11),%xmm3
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm8,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm5,%xmm4,%xmm4
+ movbeq 24(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 16(%r14),%r12
+ vpalignr $8,%xmm4,%xmm4,%xmm0
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ movq %r13,96+8(%rsp)
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r12,104+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ vmovups 128-128(%rcx),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 144-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm10,%xmm10
+ vpsrldq $8,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm11,%xmm11
+ vpxor %xmm6,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm12,%xmm12
+ vpxor %xmm0,%xmm4,%xmm4
+ movbeq 8(%r14),%r13
+ vaesenc %xmm1,%xmm13,%xmm13
+ movbeq 0(%r14),%r12
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 160-128(%rcx),%xmm1
+ cmpl $11,%ebp
+ jb .Lenc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 176-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 192-128(%rcx),%xmm1
+ je .Lenc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 208-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 224-128(%rcx),%xmm1
+ jmp .Lenc_tail
+
+.align 32
+.Lhandle_ctr32:
+ vmovdqu (%r11),%xmm0
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm15,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm15,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpshufb %xmm0,%xmm1,%xmm1
+ jmp .Lresume_ctr32
+
+.align 32
+.Lenc_tail:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vmovdqu %xmm7,16+8(%rsp)
+ vpalignr $8,%xmm4,%xmm4,%xmm8
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ vpxor 0(%rdi),%xmm1,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 16(%rdi),%xmm1,%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 32(%rdi),%xmm1,%xmm5
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 48(%rdi),%xmm1,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 64(%rdi),%xmm1,%xmm7
+ vpxor 80(%rdi),%xmm1,%xmm3
+ vmovdqu (%r8),%xmm1
+
+ vaesenclast %xmm2,%xmm9,%xmm9
+ vmovdqu 32(%r11),%xmm2
+ vaesenclast %xmm0,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm1,%xmm0
+ movq %r13,112+8(%rsp)
+ leaq 96(%rdi),%rdi
+ vaesenclast %xmm5,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm0,%xmm5
+ movq %r12,120+8(%rsp)
+ leaq 96(%rsi),%rsi
+ vmovdqu 0-128(%rcx),%xmm15
+ vaesenclast %xmm6,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm5,%xmm6
+ vaesenclast %xmm7,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm6,%xmm7
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vpaddb %xmm2,%xmm7,%xmm3
+
+ addq $0x60,%r10
+ subq $0x6,%rdx
+ jc .L6x_done
+
+ vmovups %xmm9,-96(%rsi)
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovups %xmm10,-80(%rsi)
+ vmovdqa %xmm0,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vmovdqa %xmm5,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vmovdqa %xmm6,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vmovdqa %xmm7,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vmovdqa %xmm3,%xmm14
+ vmovdqu 32+8(%rsp),%xmm7
+ jmp .Loop6x
+
+.L6x_done:
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpxor %xmm4,%xmm8,%xmm8
+
.byte 0xf3,0xc3
.cfi_endproc
-.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
-
+.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
.globl aesni_gcm_decrypt
.type aesni_gcm_decrypt,@function
+.align 32
aesni_gcm_decrypt:
.cfi_startproc
- xorl %eax,%eax
+ xorq %r10,%r10
+ cmpq $0x60,%rdx
+ jb .Lgcm_dec_abort
+
+ leaq (%rsp),%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq .Lbswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ vmovdqu (%r9),%xmm8
+ andq $-128,%rsp
+ vmovdqu (%r11),%xmm0
+ leaq 128(%rcx),%rcx
+ leaq 32+32(%r9),%r9
+ movl 240-128(%rcx),%ebp
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Ldec_no_key_aliasing
+ cmpq $768,%r15
+ jnc .Ldec_no_key_aliasing
+ subq %r15,%rsp
+.Ldec_no_key_aliasing:
+
+ vmovdqu 80(%rdi),%xmm7
+ leaq (%rdi),%r14
+ vmovdqu 64(%rdi),%xmm4
+ leaq -192(%rdi,%rdx,1),%r15
+ vmovdqu 48(%rdi),%xmm5
+ shrq $4,%rdx
+ xorq %r10,%r10
+ vmovdqu 32(%rdi),%xmm6
+ vpshufb %xmm0,%xmm7,%xmm7
+ vmovdqu 16(%rdi),%xmm2
+ vpshufb %xmm0,%xmm4,%xmm4
+ vmovdqu (%rdi),%xmm3
+ vpshufb %xmm0,%xmm5,%xmm5
+ vmovdqu %xmm4,48(%rsp)
+ vpshufb %xmm0,%xmm6,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm2,%xmm2
+ vmovdqu %xmm6,80(%rsp)
+ vpshufb %xmm0,%xmm3,%xmm3
+ vmovdqu %xmm2,96(%rsp)
+ vmovdqu %xmm3,112(%rsp)
+
+ call _aesni_ctr32_ghash_6x
+
+ vmovups %xmm9,-96(%rsi)
+ vmovups %xmm10,-80(%rsi)
+ vmovups %xmm11,-64(%rsi)
+ vmovups %xmm12,-48(%rsi)
+ vmovups %xmm13,-32(%rsi)
+ vmovups %xmm14,-16(%rsi)
+
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,-64(%r9)
+
+ vzeroupper
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lgcm_dec_abort:
+ movq %r10,%rax
.byte 0xf3,0xc3
.cfi_endproc
.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
+.type _aesni_ctr32_6x,@function
+.align 32
+_aesni_ctr32_6x:
+.cfi_startproc
+ vmovdqu 0-128(%rcx),%xmm4
+ vmovdqu 32(%r11),%xmm2
+ leaq -1(%rbp),%r13
+ vmovups 16-128(%rcx),%xmm15
+ leaq 32-128(%rcx),%r12
+ vpxor %xmm4,%xmm1,%xmm9
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32_2
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+
+.align 16
+.Loop_ctr32:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+ vmovups (%r12),%xmm15
+ leaq 16(%r12),%r12
+ decl %r13d
+ jnz .Loop_ctr32
+
+ vmovdqu (%r12),%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 0(%rdi),%xmm3,%xmm4
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor 16(%rdi),%xmm3,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 32(%rdi),%xmm3,%xmm6
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 48(%rdi),%xmm3,%xmm8
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 64(%rdi),%xmm3,%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 80(%rdi),%xmm3,%xmm3
+ leaq 96(%rdi),%rdi
+
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vaesenclast %xmm5,%xmm10,%xmm10
+ vaesenclast %xmm6,%xmm11,%xmm11
+ vaesenclast %xmm8,%xmm12,%xmm12
+ vaesenclast %xmm2,%xmm13,%xmm13
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vmovups %xmm9,0(%rsi)
+ vmovups %xmm10,16(%rsi)
+ vmovups %xmm11,32(%rsi)
+ vmovups %xmm12,48(%rsi)
+ vmovups %xmm13,64(%rsi)
+ vmovups %xmm14,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ .byte 0xf3,0xc3
+.align 32
+.Lhandle_ctr32_2:
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+.cfi_endproc
+.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl aesni_gcm_encrypt
+.type aesni_gcm_encrypt,@function
+.align 32
+aesni_gcm_encrypt:
+.cfi_startproc
+ xorq %r10,%r10
+ cmpq $288,%rdx
+ jb .Lgcm_enc_abort
+
+ leaq (%rsp),%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq .Lbswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ leaq 128(%rcx),%rcx
+ vmovdqu (%r11),%xmm0
+ andq $-128,%rsp
+ movl 240-128(%rcx),%ebp
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Lenc_no_key_aliasing
+ cmpq $768,%r15
+ jnc .Lenc_no_key_aliasing
+ subq %r15,%rsp
+.Lenc_no_key_aliasing:
+
+ leaq (%rsi),%r14
+ leaq -192(%rsi,%rdx,1),%r15
+ shrq $4,%rdx
+
+ call _aesni_ctr32_6x
+ vpshufb %xmm0,%xmm9,%xmm8
+ vpshufb %xmm0,%xmm10,%xmm2
+ vmovdqu %xmm8,112(%rsp)
+ vpshufb %xmm0,%xmm11,%xmm4
+ vmovdqu %xmm2,96(%rsp)
+ vpshufb %xmm0,%xmm12,%xmm5
+ vmovdqu %xmm4,80(%rsp)
+ vpshufb %xmm0,%xmm13,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm14,%xmm7
+ vmovdqu %xmm6,48(%rsp)
+
+ call _aesni_ctr32_6x
+
+ vmovdqu (%r9),%xmm8
+ leaq 32+32(%r9),%r9
+ subq $12,%rdx
+ movq $192,%r10
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ call _aesni_ctr32_ghash_6x
+ vmovdqu 32(%rsp),%xmm7
+ vmovdqu (%r11),%xmm0
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm7,%xmm7,%xmm1
+ vmovdqu 32-32(%r9),%xmm15
+ vmovups %xmm9,-96(%rsi)
+ vpshufb %xmm0,%xmm9,%xmm9
+ vpxor %xmm7,%xmm1,%xmm1
+ vmovups %xmm10,-80(%rsi)
+ vpshufb %xmm0,%xmm10,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vpshufb %xmm0,%xmm11,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vpshufb %xmm0,%xmm12,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vpshufb %xmm0,%xmm13,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vpshufb %xmm0,%xmm14,%xmm14
+ vmovdqu %xmm9,16(%rsp)
+ vmovdqu 48(%rsp),%xmm6
+ vmovdqu 16-32(%r9),%xmm0
+ vpunpckhqdq %xmm6,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
+ vpxor %xmm6,%xmm2,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+
+ vmovdqu 64(%rsp),%xmm9
+ vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm9,%xmm9,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
+ vpxor %xmm9,%xmm5,%xmm5
+ vpxor %xmm7,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vmovdqu 80(%rsp),%xmm1
+ vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm4,%xmm7,%xmm7
+ vpunpckhqdq %xmm1,%xmm1,%xmm4
+ vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpxor %xmm6,%xmm9,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 96(%rsp),%xmm2
+ vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm7,%xmm6,%xmm6
+ vpunpckhqdq %xmm2,%xmm2,%xmm7
+ vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpxor %xmm9,%xmm1,%xmm1
+ vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm5,%xmm4,%xmm4
+
+ vpxor 112(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
+ vmovdqu 112-32(%r9),%xmm0
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm1,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
+ vpxor %xmm4,%xmm7,%xmm4
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm1
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
+ vpxor %xmm14,%xmm1,%xmm1
+ vpxor %xmm5,%xmm6,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
+ vmovdqu 32-32(%r9),%xmm15
+ vpxor %xmm2,%xmm8,%xmm7
+ vpxor %xmm4,%xmm9,%xmm6
+
+ vmovdqu 16-32(%r9),%xmm0
+ vpxor %xmm5,%xmm7,%xmm9
+ vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
+ vpxor %xmm9,%xmm6,%xmm6
+ vpunpckhqdq %xmm13,%xmm13,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
+ vpxor %xmm13,%xmm2,%xmm2
+ vpslldq $8,%xmm6,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+ vpxor %xmm9,%xmm5,%xmm8
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm12,%xmm12,%xmm9
+ vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
+ vpxor %xmm12,%xmm9,%xmm9
+ vpxor %xmm14,%xmm13,%xmm13
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm11,%xmm11,%xmm1
+ vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
+ vpxor %xmm11,%xmm1,%xmm1
+ vpxor %xmm13,%xmm12,%xmm12
+ vxorps 16(%rsp),%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm9,%xmm9
+
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm10,%xmm10,%xmm2
+ vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
+ vpxor %xmm10,%xmm2,%xmm2
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpxor %xmm12,%xmm11,%xmm11
+ vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm9,%xmm1,%xmm1
+
+ vxorps %xmm7,%xmm14,%xmm14
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
+ vmovdqu 112-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm11,%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
+ vpxor %xmm4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
+ vpxor %xmm10,%xmm7,%xmm7
+ vpxor %xmm2,%xmm6,%xmm6
+
+ vpxor %xmm5,%xmm7,%xmm4
+ vpxor %xmm4,%xmm6,%xmm6
+ vpslldq $8,%xmm6,%xmm1
+ vmovdqu 16(%r11),%xmm3
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm1,%xmm5,%xmm8
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm2,%xmm8,%xmm8
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm7,%xmm2,%xmm2
+ vpxor %xmm2,%xmm8,%xmm8
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,-64(%r9)
+
+ vzeroupper
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lgcm_enc_abort:
+ movq %r10,%rax
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+.align 64
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
diff --git a/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S b/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S
index de4bac9488f7..706c5c59d38d 100644
--- a/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S
+++ b/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S
@@ -9,6 +9,14 @@
.align 32
aesni_multi_cbc_encrypt:
.cfi_startproc
+ cmpl $2,%edx
+ jb .Lenc_non_avx
+ movl OPENSSL_ia32cap_P+4(%rip),%ecx
+ testl $268435456,%ecx
+ jnz _avx_cbc_enc_shortcut
+ jmp .Lenc_non_avx
+.align 16
+.Lenc_non_avx:
movq %rsp,%rax
.cfi_def_cfa_register %rax
pushq %rbx
@@ -283,6 +291,14 @@ aesni_multi_cbc_encrypt:
.align 32
aesni_multi_cbc_decrypt:
.cfi_startproc
+ cmpl $2,%edx
+ jb .Ldec_non_avx
+ movl OPENSSL_ia32cap_P+4(%rip),%ecx
+ testl $268435456,%ecx
+ jnz _avx_cbc_dec_shortcut
+ jmp .Ldec_non_avx
+.align 16
+.Ldec_non_avx:
movq %rsp,%rax
.cfi_def_cfa_register %rax
pushq %rbx
@@ -542,3 +558,952 @@ aesni_multi_cbc_decrypt:
.byte 0xf3,0xc3
.cfi_endproc
.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
+.type aesni_multi_cbc_encrypt_avx,@function
+.align 32
+aesni_multi_cbc_encrypt_avx:
+.cfi_startproc
+_avx_cbc_enc_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+
+
+
+
+
+
+
+
+ subq $192,%rsp
+ andq $-128,%rsp
+ movq %rax,16(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08
+
+.Lenc8x_body:
+ vzeroupper
+ vmovdqu (%rsi),%xmm15
+ leaq 120(%rsi),%rsi
+ leaq 160(%rdi),%rdi
+ shrl $1,%edx
+
+.Lenc8x_loop_grande:
+
+ xorl %edx,%edx
+ movl -144(%rdi),%ecx
+ movq -160(%rdi),%r8
+ cmpl %edx,%ecx
+ movq -152(%rdi),%rbx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -136(%rdi),%xmm2
+ movl %ecx,32(%rsp)
+ cmovleq %rsp,%r8
+ subq %r8,%rbx
+ movq %rbx,64(%rsp)
+ movl -104(%rdi),%ecx
+ movq -120(%rdi),%r9
+ cmpl %edx,%ecx
+ movq -112(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -96(%rdi),%xmm3
+ movl %ecx,36(%rsp)
+ cmovleq %rsp,%r9
+ subq %r9,%rbp
+ movq %rbp,72(%rsp)
+ movl -64(%rdi),%ecx
+ movq -80(%rdi),%r10
+ cmpl %edx,%ecx
+ movq -72(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -56(%rdi),%xmm4
+ movl %ecx,40(%rsp)
+ cmovleq %rsp,%r10
+ subq %r10,%rbp
+ movq %rbp,80(%rsp)
+ movl -24(%rdi),%ecx
+ movq -40(%rdi),%r11
+ cmpl %edx,%ecx
+ movq -32(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -16(%rdi),%xmm5
+ movl %ecx,44(%rsp)
+ cmovleq %rsp,%r11
+ subq %r11,%rbp
+ movq %rbp,88(%rsp)
+ movl 16(%rdi),%ecx
+ movq 0(%rdi),%r12
+ cmpl %edx,%ecx
+ movq 8(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 24(%rdi),%xmm6
+ movl %ecx,48(%rsp)
+ cmovleq %rsp,%r12
+ subq %r12,%rbp
+ movq %rbp,96(%rsp)
+ movl 56(%rdi),%ecx
+ movq 40(%rdi),%r13
+ cmpl %edx,%ecx
+ movq 48(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 64(%rdi),%xmm7
+ movl %ecx,52(%rsp)
+ cmovleq %rsp,%r13
+ subq %r13,%rbp
+ movq %rbp,104(%rsp)
+ movl 96(%rdi),%ecx
+ movq 80(%rdi),%r14
+ cmpl %edx,%ecx
+ movq 88(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 104(%rdi),%xmm8
+ movl %ecx,56(%rsp)
+ cmovleq %rsp,%r14
+ subq %r14,%rbp
+ movq %rbp,112(%rsp)
+ movl 136(%rdi),%ecx
+ movq 120(%rdi),%r15
+ cmpl %edx,%ecx
+ movq 128(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 144(%rdi),%xmm9
+ movl %ecx,60(%rsp)
+ cmovleq %rsp,%r15
+ subq %r15,%rbp
+ movq %rbp,120(%rsp)
+ testl %edx,%edx
+ jz .Lenc8x_done
+
+ vmovups 16-120(%rsi),%xmm1
+ vmovups 32-120(%rsi),%xmm0
+ movl 240-120(%rsi),%eax
+
+ vpxor (%r8),%xmm15,%xmm10
+ leaq 128(%rsp),%rbp
+ vpxor (%r9),%xmm15,%xmm11
+ vpxor (%r10),%xmm15,%xmm12
+ vpxor (%r11),%xmm15,%xmm13
+ vpxor %xmm10,%xmm2,%xmm2
+ vpxor (%r12),%xmm15,%xmm10
+ vpxor %xmm11,%xmm3,%xmm3
+ vpxor (%r13),%xmm15,%xmm11
+ vpxor %xmm12,%xmm4,%xmm4
+ vpxor (%r14),%xmm15,%xmm12
+ vpxor %xmm13,%xmm5,%xmm5
+ vpxor (%r15),%xmm15,%xmm13
+ vpxor %xmm10,%xmm6,%xmm6
+ movl $1,%ecx
+ vpxor %xmm11,%xmm7,%xmm7
+ vpxor %xmm12,%xmm8,%xmm8
+ vpxor %xmm13,%xmm9,%xmm9
+ jmp .Loop_enc8x
+
+.align 32
+.Loop_enc8x:
+ vaesenc %xmm1,%xmm2,%xmm2
+ cmpl 32+0(%rsp),%ecx
+ vaesenc %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r8)
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm5,%xmm5
+ leaq (%r8,%rbx,1),%rbx
+ cmovgeq %rsp,%r8
+ vaesenc %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm1,%xmm7,%xmm7
+ subq %r8,%rbx
+ vaesenc %xmm1,%xmm8,%xmm8
+ vpxor 16(%r8),%xmm15,%xmm10
+ movq %rbx,64+0(%rsp)
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups -72(%rsi),%xmm1
+ leaq 16(%r8,%rbx,1),%r8
+ vmovdqu %xmm10,0(%rbp)
+ vaesenc %xmm0,%xmm2,%xmm2
+ cmpl 32+4(%rsp),%ecx
+ movq 64+8(%rsp),%rbx
+ vaesenc %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r9)
+ vaesenc %xmm0,%xmm4,%xmm4
+ vaesenc %xmm0,%xmm5,%xmm5
+ leaq (%r9,%rbx,1),%rbx
+ cmovgeq %rsp,%r9
+ vaesenc %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm0,%xmm7,%xmm7
+ subq %r9,%rbx
+ vaesenc %xmm0,%xmm8,%xmm8
+ vpxor 16(%r9),%xmm15,%xmm11
+ movq %rbx,64+8(%rsp)
+ vaesenc %xmm0,%xmm9,%xmm9
+ vmovups -56(%rsi),%xmm0
+ leaq 16(%r9,%rbx,1),%r9
+ vmovdqu %xmm11,16(%rbp)
+ vaesenc %xmm1,%xmm2,%xmm2
+ cmpl 32+8(%rsp),%ecx
+ movq 64+16(%rsp),%rbx
+ vaesenc %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r10)
+ vaesenc %xmm1,%xmm4,%xmm4
+ prefetcht0 15(%r8)
+ vaesenc %xmm1,%xmm5,%xmm5
+ leaq (%r10,%rbx,1),%rbx
+ cmovgeq %rsp,%r10
+ vaesenc %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm1,%xmm7,%xmm7
+ subq %r10,%rbx
+ vaesenc %xmm1,%xmm8,%xmm8
+ vpxor 16(%r10),%xmm15,%xmm12
+ movq %rbx,64+16(%rsp)
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups -40(%rsi),%xmm1
+ leaq 16(%r10,%rbx,1),%r10
+ vmovdqu %xmm12,32(%rbp)
+ vaesenc %xmm0,%xmm2,%xmm2
+ cmpl 32+12(%rsp),%ecx
+ movq 64+24(%rsp),%rbx
+ vaesenc %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r11)
+ vaesenc %xmm0,%xmm4,%xmm4
+ prefetcht0 15(%r9)
+ vaesenc %xmm0,%xmm5,%xmm5
+ leaq (%r11,%rbx,1),%rbx
+ cmovgeq %rsp,%r11
+ vaesenc %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm0,%xmm7,%xmm7
+ subq %r11,%rbx
+ vaesenc %xmm0,%xmm8,%xmm8
+ vpxor 16(%r11),%xmm15,%xmm13
+ movq %rbx,64+24(%rsp)
+ vaesenc %xmm0,%xmm9,%xmm9
+ vmovups -24(%rsi),%xmm0
+ leaq 16(%r11,%rbx,1),%r11
+ vmovdqu %xmm13,48(%rbp)
+ vaesenc %xmm1,%xmm2,%xmm2
+ cmpl 32+16(%rsp),%ecx
+ movq 64+32(%rsp),%rbx
+ vaesenc %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r12)
+ vaesenc %xmm1,%xmm4,%xmm4
+ prefetcht0 15(%r10)
+ vaesenc %xmm1,%xmm5,%xmm5
+ leaq (%r12,%rbx,1),%rbx
+ cmovgeq %rsp,%r12
+ vaesenc %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm1,%xmm7,%xmm7
+ subq %r12,%rbx
+ vaesenc %xmm1,%xmm8,%xmm8
+ vpxor 16(%r12),%xmm15,%xmm10
+ movq %rbx,64+32(%rsp)
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups -8(%rsi),%xmm1
+ leaq 16(%r12,%rbx,1),%r12
+ vaesenc %xmm0,%xmm2,%xmm2
+ cmpl 32+20(%rsp),%ecx
+ movq 64+40(%rsp),%rbx
+ vaesenc %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r13)
+ vaesenc %xmm0,%xmm4,%xmm4
+ prefetcht0 15(%r11)
+ vaesenc %xmm0,%xmm5,%xmm5
+ leaq (%rbx,%r13,1),%rbx
+ cmovgeq %rsp,%r13
+ vaesenc %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm0,%xmm7,%xmm7
+ subq %r13,%rbx
+ vaesenc %xmm0,%xmm8,%xmm8
+ vpxor 16(%r13),%xmm15,%xmm11
+ movq %rbx,64+40(%rsp)
+ vaesenc %xmm0,%xmm9,%xmm9
+ vmovups 8(%rsi),%xmm0
+ leaq 16(%r13,%rbx,1),%r13
+ vaesenc %xmm1,%xmm2,%xmm2
+ cmpl 32+24(%rsp),%ecx
+ movq 64+48(%rsp),%rbx
+ vaesenc %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r14)
+ vaesenc %xmm1,%xmm4,%xmm4
+ prefetcht0 15(%r12)
+ vaesenc %xmm1,%xmm5,%xmm5
+ leaq (%r14,%rbx,1),%rbx
+ cmovgeq %rsp,%r14
+ vaesenc %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm1,%xmm7,%xmm7
+ subq %r14,%rbx
+ vaesenc %xmm1,%xmm8,%xmm8
+ vpxor 16(%r14),%xmm15,%xmm12
+ movq %rbx,64+48(%rsp)
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 24(%rsi),%xmm1
+ leaq 16(%r14,%rbx,1),%r14
+ vaesenc %xmm0,%xmm2,%xmm2
+ cmpl 32+28(%rsp),%ecx
+ movq 64+56(%rsp),%rbx
+ vaesenc %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r15)
+ vaesenc %xmm0,%xmm4,%xmm4
+ prefetcht0 15(%r13)
+ vaesenc %xmm0,%xmm5,%xmm5
+ leaq (%r15,%rbx,1),%rbx
+ cmovgeq %rsp,%r15
+ vaesenc %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm0,%xmm7,%xmm7
+ subq %r15,%rbx
+ vaesenc %xmm0,%xmm8,%xmm8
+ vpxor 16(%r15),%xmm15,%xmm13
+ movq %rbx,64+56(%rsp)
+ vaesenc %xmm0,%xmm9,%xmm9
+ vmovups 40(%rsi),%xmm0
+ leaq 16(%r15,%rbx,1),%r15
+ vmovdqu 32(%rsp),%xmm14
+ prefetcht0 15(%r14)
+ prefetcht0 15(%r15)
+ cmpl $11,%eax
+ jb .Lenc8x_tail
+
+ vaesenc %xmm1,%xmm2,%xmm2
+ vaesenc %xmm1,%xmm3,%xmm3
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm5,%xmm5
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm8,%xmm8
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 176-120(%rsi),%xmm1
+
+ vaesenc %xmm0,%xmm2,%xmm2
+ vaesenc %xmm0,%xmm3,%xmm3
+ vaesenc %xmm0,%xmm4,%xmm4
+ vaesenc %xmm0,%xmm5,%xmm5
+ vaesenc %xmm0,%xmm6,%xmm6
+ vaesenc %xmm0,%xmm7,%xmm7
+ vaesenc %xmm0,%xmm8,%xmm8
+ vaesenc %xmm0,%xmm9,%xmm9
+ vmovups 192-120(%rsi),%xmm0
+ je .Lenc8x_tail
+
+ vaesenc %xmm1,%xmm2,%xmm2
+ vaesenc %xmm1,%xmm3,%xmm3
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm5,%xmm5
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm8,%xmm8
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 208-120(%rsi),%xmm1
+
+ vaesenc %xmm0,%xmm2,%xmm2
+ vaesenc %xmm0,%xmm3,%xmm3
+ vaesenc %xmm0,%xmm4,%xmm4
+ vaesenc %xmm0,%xmm5,%xmm5
+ vaesenc %xmm0,%xmm6,%xmm6
+ vaesenc %xmm0,%xmm7,%xmm7
+ vaesenc %xmm0,%xmm8,%xmm8
+ vaesenc %xmm0,%xmm9,%xmm9
+ vmovups 224-120(%rsi),%xmm0
+
+.Lenc8x_tail:
+ vaesenc %xmm1,%xmm2,%xmm2
+ vpxor %xmm15,%xmm15,%xmm15
+ vaesenc %xmm1,%xmm3,%xmm3
+ vaesenc %xmm1,%xmm4,%xmm4
+ vpcmpgtd %xmm15,%xmm14,%xmm15
+ vaesenc %xmm1,%xmm5,%xmm5
+ vaesenc %xmm1,%xmm6,%xmm6
+ vpaddd %xmm14,%xmm15,%xmm15
+ vmovdqu 48(%rsp),%xmm14
+ vaesenc %xmm1,%xmm7,%xmm7
+ movq 64(%rsp),%rbx
+ vaesenc %xmm1,%xmm8,%xmm8
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 16-120(%rsi),%xmm1
+
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vmovdqa %xmm15,32(%rsp)
+ vpxor %xmm15,%xmm15,%xmm15
+ vaesenclast %xmm0,%xmm3,%xmm3
+ vaesenclast %xmm0,%xmm4,%xmm4
+ vpcmpgtd %xmm15,%xmm14,%xmm15
+ vaesenclast %xmm0,%xmm5,%xmm5
+ vaesenclast %xmm0,%xmm6,%xmm6
+ vpaddd %xmm15,%xmm14,%xmm14
+ vmovdqu -120(%rsi),%xmm15
+ vaesenclast %xmm0,%xmm7,%xmm7
+ vaesenclast %xmm0,%xmm8,%xmm8
+ vmovdqa %xmm14,48(%rsp)
+ vaesenclast %xmm0,%xmm9,%xmm9
+ vmovups 32-120(%rsi),%xmm0
+
+ vmovups %xmm2,-16(%r8)
+ subq %rbx,%r8
+ vpxor 0(%rbp),%xmm2,%xmm2
+ vmovups %xmm3,-16(%r9)
+ subq 72(%rsp),%r9
+ vpxor 16(%rbp),%xmm3,%xmm3
+ vmovups %xmm4,-16(%r10)
+ subq 80(%rsp),%r10
+ vpxor 32(%rbp),%xmm4,%xmm4
+ vmovups %xmm5,-16(%r11)
+ subq 88(%rsp),%r11
+ vpxor 48(%rbp),%xmm5,%xmm5
+ vmovups %xmm6,-16(%r12)
+ subq 96(%rsp),%r12
+ vpxor %xmm10,%xmm6,%xmm6
+ vmovups %xmm7,-16(%r13)
+ subq 104(%rsp),%r13
+ vpxor %xmm11,%xmm7,%xmm7
+ vmovups %xmm8,-16(%r14)
+ subq 112(%rsp),%r14
+ vpxor %xmm12,%xmm8,%xmm8
+ vmovups %xmm9,-16(%r15)
+ subq 120(%rsp),%r15
+ vpxor %xmm13,%xmm9,%xmm9
+
+ decl %edx
+ jnz .Loop_enc8x
+
+ movq 16(%rsp),%rax
+.cfi_def_cfa %rax,8
+
+
+
+
+
+.Lenc8x_done:
+ vzeroupper
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lenc8x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
+
+.type aesni_multi_cbc_decrypt_avx,@function
+.align 32
+aesni_multi_cbc_decrypt_avx:
+.cfi_startproc
+_avx_cbc_dec_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+
+
+
+
+
+
+
+
+
+ subq $256,%rsp
+ andq $-256,%rsp
+ subq $192,%rsp
+ movq %rax,16(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08
+
+.Ldec8x_body:
+ vzeroupper
+ vmovdqu (%rsi),%xmm15
+ leaq 120(%rsi),%rsi
+ leaq 160(%rdi),%rdi
+ shrl $1,%edx
+
+.Ldec8x_loop_grande:
+
+ xorl %edx,%edx
+ movl -144(%rdi),%ecx
+ movq -160(%rdi),%r8
+ cmpl %edx,%ecx
+ movq -152(%rdi),%rbx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -136(%rdi),%xmm2
+ movl %ecx,32(%rsp)
+ cmovleq %rsp,%r8
+ subq %r8,%rbx
+ movq %rbx,64(%rsp)
+ vmovdqu %xmm2,192(%rsp)
+ movl -104(%rdi),%ecx
+ movq -120(%rdi),%r9
+ cmpl %edx,%ecx
+ movq -112(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -96(%rdi),%xmm3
+ movl %ecx,36(%rsp)
+ cmovleq %rsp,%r9
+ subq %r9,%rbp
+ movq %rbp,72(%rsp)
+ vmovdqu %xmm3,208(%rsp)
+ movl -64(%rdi),%ecx
+ movq -80(%rdi),%r10
+ cmpl %edx,%ecx
+ movq -72(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -56(%rdi),%xmm4
+ movl %ecx,40(%rsp)
+ cmovleq %rsp,%r10
+ subq %r10,%rbp
+ movq %rbp,80(%rsp)
+ vmovdqu %xmm4,224(%rsp)
+ movl -24(%rdi),%ecx
+ movq -40(%rdi),%r11
+ cmpl %edx,%ecx
+ movq -32(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -16(%rdi),%xmm5
+ movl %ecx,44(%rsp)
+ cmovleq %rsp,%r11
+ subq %r11,%rbp
+ movq %rbp,88(%rsp)
+ vmovdqu %xmm5,240(%rsp)
+ movl 16(%rdi),%ecx
+ movq 0(%rdi),%r12
+ cmpl %edx,%ecx
+ movq 8(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 24(%rdi),%xmm6
+ movl %ecx,48(%rsp)
+ cmovleq %rsp,%r12
+ subq %r12,%rbp
+ movq %rbp,96(%rsp)
+ vmovdqu %xmm6,256(%rsp)
+ movl 56(%rdi),%ecx
+ movq 40(%rdi),%r13
+ cmpl %edx,%ecx
+ movq 48(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 64(%rdi),%xmm7
+ movl %ecx,52(%rsp)
+ cmovleq %rsp,%r13
+ subq %r13,%rbp
+ movq %rbp,104(%rsp)
+ vmovdqu %xmm7,272(%rsp)
+ movl 96(%rdi),%ecx
+ movq 80(%rdi),%r14
+ cmpl %edx,%ecx
+ movq 88(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 104(%rdi),%xmm8
+ movl %ecx,56(%rsp)
+ cmovleq %rsp,%r14
+ subq %r14,%rbp
+ movq %rbp,112(%rsp)
+ vmovdqu %xmm8,288(%rsp)
+ movl 136(%rdi),%ecx
+ movq 120(%rdi),%r15
+ cmpl %edx,%ecx
+ movq 128(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 144(%rdi),%xmm9
+ movl %ecx,60(%rsp)
+ cmovleq %rsp,%r15
+ subq %r15,%rbp
+ movq %rbp,120(%rsp)
+ vmovdqu %xmm9,304(%rsp)
+ testl %edx,%edx
+ jz .Ldec8x_done
+
+ vmovups 16-120(%rsi),%xmm1
+ vmovups 32-120(%rsi),%xmm0
+ movl 240-120(%rsi),%eax
+ leaq 192+128(%rsp),%rbp
+
+ vmovdqu (%r8),%xmm2
+ vmovdqu (%r9),%xmm3
+ vmovdqu (%r10),%xmm4
+ vmovdqu (%r11),%xmm5
+ vmovdqu (%r12),%xmm6
+ vmovdqu (%r13),%xmm7
+ vmovdqu (%r14),%xmm8
+ vmovdqu (%r15),%xmm9
+ vmovdqu %xmm2,0(%rbp)
+ vpxor %xmm15,%xmm2,%xmm2
+ vmovdqu %xmm3,16(%rbp)
+ vpxor %xmm15,%xmm3,%xmm3
+ vmovdqu %xmm4,32(%rbp)
+ vpxor %xmm15,%xmm4,%xmm4
+ vmovdqu %xmm5,48(%rbp)
+ vpxor %xmm15,%xmm5,%xmm5
+ vmovdqu %xmm6,64(%rbp)
+ vpxor %xmm15,%xmm6,%xmm6
+ vmovdqu %xmm7,80(%rbp)
+ vpxor %xmm15,%xmm7,%xmm7
+ vmovdqu %xmm8,96(%rbp)
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu %xmm9,112(%rbp)
+ vpxor %xmm15,%xmm9,%xmm9
+ xorq $0x80,%rbp
+ movl $1,%ecx
+ jmp .Loop_dec8x
+
+.align 32
+.Loop_dec8x:
+ vaesdec %xmm1,%xmm2,%xmm2
+ cmpl 32+0(%rsp),%ecx
+ vaesdec %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r8)
+ vaesdec %xmm1,%xmm4,%xmm4
+ vaesdec %xmm1,%xmm5,%xmm5
+ leaq (%r8,%rbx,1),%rbx
+ cmovgeq %rsp,%r8
+ vaesdec %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm1,%xmm7,%xmm7
+ subq %r8,%rbx
+ vaesdec %xmm1,%xmm8,%xmm8
+ vmovdqu 16(%r8),%xmm10
+ movq %rbx,64+0(%rsp)
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups -72(%rsi),%xmm1
+ leaq 16(%r8,%rbx,1),%r8
+ vmovdqu %xmm10,128(%rsp)
+ vaesdec %xmm0,%xmm2,%xmm2
+ cmpl 32+4(%rsp),%ecx
+ movq 64+8(%rsp),%rbx
+ vaesdec %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r9)
+ vaesdec %xmm0,%xmm4,%xmm4
+ vaesdec %xmm0,%xmm5,%xmm5
+ leaq (%r9,%rbx,1),%rbx
+ cmovgeq %rsp,%r9
+ vaesdec %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm0,%xmm7,%xmm7
+ subq %r9,%rbx
+ vaesdec %xmm0,%xmm8,%xmm8
+ vmovdqu 16(%r9),%xmm11
+ movq %rbx,64+8(%rsp)
+ vaesdec %xmm0,%xmm9,%xmm9
+ vmovups -56(%rsi),%xmm0
+ leaq 16(%r9,%rbx,1),%r9
+ vmovdqu %xmm11,144(%rsp)
+ vaesdec %xmm1,%xmm2,%xmm2
+ cmpl 32+8(%rsp),%ecx
+ movq 64+16(%rsp),%rbx
+ vaesdec %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r10)
+ vaesdec %xmm1,%xmm4,%xmm4
+ prefetcht0 15(%r8)
+ vaesdec %xmm1,%xmm5,%xmm5
+ leaq (%r10,%rbx,1),%rbx
+ cmovgeq %rsp,%r10
+ vaesdec %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm1,%xmm7,%xmm7
+ subq %r10,%rbx
+ vaesdec %xmm1,%xmm8,%xmm8
+ vmovdqu 16(%r10),%xmm12
+ movq %rbx,64+16(%rsp)
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups -40(%rsi),%xmm1
+ leaq 16(%r10,%rbx,1),%r10
+ vmovdqu %xmm12,160(%rsp)
+ vaesdec %xmm0,%xmm2,%xmm2
+ cmpl 32+12(%rsp),%ecx
+ movq 64+24(%rsp),%rbx
+ vaesdec %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r11)
+ vaesdec %xmm0,%xmm4,%xmm4
+ prefetcht0 15(%r9)
+ vaesdec %xmm0,%xmm5,%xmm5
+ leaq (%r11,%rbx,1),%rbx
+ cmovgeq %rsp,%r11
+ vaesdec %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm0,%xmm7,%xmm7
+ subq %r11,%rbx
+ vaesdec %xmm0,%xmm8,%xmm8
+ vmovdqu 16(%r11),%xmm13
+ movq %rbx,64+24(%rsp)
+ vaesdec %xmm0,%xmm9,%xmm9
+ vmovups -24(%rsi),%xmm0
+ leaq 16(%r11,%rbx,1),%r11
+ vmovdqu %xmm13,176(%rsp)
+ vaesdec %xmm1,%xmm2,%xmm2
+ cmpl 32+16(%rsp),%ecx
+ movq 64+32(%rsp),%rbx
+ vaesdec %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r12)
+ vaesdec %xmm1,%xmm4,%xmm4
+ prefetcht0 15(%r10)
+ vaesdec %xmm1,%xmm5,%xmm5
+ leaq (%r12,%rbx,1),%rbx
+ cmovgeq %rsp,%r12
+ vaesdec %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm1,%xmm7,%xmm7
+ subq %r12,%rbx
+ vaesdec %xmm1,%xmm8,%xmm8
+ vmovdqu 16(%r12),%xmm10
+ movq %rbx,64+32(%rsp)
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups -8(%rsi),%xmm1
+ leaq 16(%r12,%rbx,1),%r12
+ vaesdec %xmm0,%xmm2,%xmm2
+ cmpl 32+20(%rsp),%ecx
+ movq 64+40(%rsp),%rbx
+ vaesdec %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r13)
+ vaesdec %xmm0,%xmm4,%xmm4
+ prefetcht0 15(%r11)
+ vaesdec %xmm0,%xmm5,%xmm5
+ leaq (%rbx,%r13,1),%rbx
+ cmovgeq %rsp,%r13
+ vaesdec %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm0,%xmm7,%xmm7
+ subq %r13,%rbx
+ vaesdec %xmm0,%xmm8,%xmm8
+ vmovdqu 16(%r13),%xmm11
+ movq %rbx,64+40(%rsp)
+ vaesdec %xmm0,%xmm9,%xmm9
+ vmovups 8(%rsi),%xmm0
+ leaq 16(%r13,%rbx,1),%r13
+ vaesdec %xmm1,%xmm2,%xmm2
+ cmpl 32+24(%rsp),%ecx
+ movq 64+48(%rsp),%rbx
+ vaesdec %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r14)
+ vaesdec %xmm1,%xmm4,%xmm4
+ prefetcht0 15(%r12)
+ vaesdec %xmm1,%xmm5,%xmm5
+ leaq (%r14,%rbx,1),%rbx
+ cmovgeq %rsp,%r14
+ vaesdec %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm1,%xmm7,%xmm7
+ subq %r14,%rbx
+ vaesdec %xmm1,%xmm8,%xmm8
+ vmovdqu 16(%r14),%xmm12
+ movq %rbx,64+48(%rsp)
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups 24(%rsi),%xmm1
+ leaq 16(%r14,%rbx,1),%r14
+ vaesdec %xmm0,%xmm2,%xmm2
+ cmpl 32+28(%rsp),%ecx
+ movq 64+56(%rsp),%rbx
+ vaesdec %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r15)
+ vaesdec %xmm0,%xmm4,%xmm4
+ prefetcht0 15(%r13)
+ vaesdec %xmm0,%xmm5,%xmm5
+ leaq (%r15,%rbx,1),%rbx
+ cmovgeq %rsp,%r15
+ vaesdec %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm0,%xmm7,%xmm7
+ subq %r15,%rbx
+ vaesdec %xmm0,%xmm8,%xmm8
+ vmovdqu 16(%r15),%xmm13
+ movq %rbx,64+56(%rsp)
+ vaesdec %xmm0,%xmm9,%xmm9
+ vmovups 40(%rsi),%xmm0
+ leaq 16(%r15,%rbx,1),%r15
+ vmovdqu 32(%rsp),%xmm14
+ prefetcht0 15(%r14)
+ prefetcht0 15(%r15)
+ cmpl $11,%eax
+ jb .Ldec8x_tail
+
+ vaesdec %xmm1,%xmm2,%xmm2
+ vaesdec %xmm1,%xmm3,%xmm3
+ vaesdec %xmm1,%xmm4,%xmm4
+ vaesdec %xmm1,%xmm5,%xmm5
+ vaesdec %xmm1,%xmm6,%xmm6
+ vaesdec %xmm1,%xmm7,%xmm7
+ vaesdec %xmm1,%xmm8,%xmm8
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups 176-120(%rsi),%xmm1
+
+ vaesdec %xmm0,%xmm2,%xmm2
+ vaesdec %xmm0,%xmm3,%xmm3
+ vaesdec %xmm0,%xmm4,%xmm4
+ vaesdec %xmm0,%xmm5,%xmm5
+ vaesdec %xmm0,%xmm6,%xmm6
+ vaesdec %xmm0,%xmm7,%xmm7
+ vaesdec %xmm0,%xmm8,%xmm8
+ vaesdec %xmm0,%xmm9,%xmm9
+ vmovups 192-120(%rsi),%xmm0
+ je .Ldec8x_tail
+
+ vaesdec %xmm1,%xmm2,%xmm2
+ vaesdec %xmm1,%xmm3,%xmm3
+ vaesdec %xmm1,%xmm4,%xmm4
+ vaesdec %xmm1,%xmm5,%xmm5
+ vaesdec %xmm1,%xmm6,%xmm6
+ vaesdec %xmm1,%xmm7,%xmm7
+ vaesdec %xmm1,%xmm8,%xmm8
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups 208-120(%rsi),%xmm1
+
+ vaesdec %xmm0,%xmm2,%xmm2
+ vaesdec %xmm0,%xmm3,%xmm3
+ vaesdec %xmm0,%xmm4,%xmm4
+ vaesdec %xmm0,%xmm5,%xmm5
+ vaesdec %xmm0,%xmm6,%xmm6
+ vaesdec %xmm0,%xmm7,%xmm7
+ vaesdec %xmm0,%xmm8,%xmm8
+ vaesdec %xmm0,%xmm9,%xmm9
+ vmovups 224-120(%rsi),%xmm0
+
+.Ldec8x_tail:
+ vaesdec %xmm1,%xmm2,%xmm2
+ vpxor %xmm15,%xmm15,%xmm15
+ vaesdec %xmm1,%xmm3,%xmm3
+ vaesdec %xmm1,%xmm4,%xmm4
+ vpcmpgtd %xmm15,%xmm14,%xmm15
+ vaesdec %xmm1,%xmm5,%xmm5
+ vaesdec %xmm1,%xmm6,%xmm6
+ vpaddd %xmm14,%xmm15,%xmm15
+ vmovdqu 48(%rsp),%xmm14
+ vaesdec %xmm1,%xmm7,%xmm7
+ movq 64(%rsp),%rbx
+ vaesdec %xmm1,%xmm8,%xmm8
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups 16-120(%rsi),%xmm1
+
+ vaesdeclast %xmm0,%xmm2,%xmm2
+ vmovdqa %xmm15,32(%rsp)
+ vpxor %xmm15,%xmm15,%xmm15
+ vaesdeclast %xmm0,%xmm3,%xmm3
+ vpxor 0(%rbp),%xmm2,%xmm2
+ vaesdeclast %xmm0,%xmm4,%xmm4
+ vpxor 16(%rbp),%xmm3,%xmm3
+ vpcmpgtd %xmm15,%xmm14,%xmm15
+ vaesdeclast %xmm0,%xmm5,%xmm5
+ vpxor 32(%rbp),%xmm4,%xmm4
+ vaesdeclast %xmm0,%xmm6,%xmm6
+ vpxor 48(%rbp),%xmm5,%xmm5
+ vpaddd %xmm15,%xmm14,%xmm14
+ vmovdqu -120(%rsi),%xmm15
+ vaesdeclast %xmm0,%xmm7,%xmm7
+ vpxor 64(%rbp),%xmm6,%xmm6
+ vaesdeclast %xmm0,%xmm8,%xmm8
+ vpxor 80(%rbp),%xmm7,%xmm7
+ vmovdqa %xmm14,48(%rsp)
+ vaesdeclast %xmm0,%xmm9,%xmm9
+ vpxor 96(%rbp),%xmm8,%xmm8
+ vmovups 32-120(%rsi),%xmm0
+
+ vmovups %xmm2,-16(%r8)
+ subq %rbx,%r8
+ vmovdqu 128+0(%rsp),%xmm2
+ vpxor 112(%rbp),%xmm9,%xmm9
+ vmovups %xmm3,-16(%r9)
+ subq 72(%rsp),%r9
+ vmovdqu %xmm2,0(%rbp)
+ vpxor %xmm15,%xmm2,%xmm2
+ vmovdqu 128+16(%rsp),%xmm3
+ vmovups %xmm4,-16(%r10)
+ subq 80(%rsp),%r10
+ vmovdqu %xmm3,16(%rbp)
+ vpxor %xmm15,%xmm3,%xmm3
+ vmovdqu 128+32(%rsp),%xmm4
+ vmovups %xmm5,-16(%r11)
+ subq 88(%rsp),%r11
+ vmovdqu %xmm4,32(%rbp)
+ vpxor %xmm15,%xmm4,%xmm4
+ vmovdqu 128+48(%rsp),%xmm5
+ vmovups %xmm6,-16(%r12)
+ subq 96(%rsp),%r12
+ vmovdqu %xmm5,48(%rbp)
+ vpxor %xmm15,%xmm5,%xmm5
+ vmovdqu %xmm10,64(%rbp)
+ vpxor %xmm10,%xmm15,%xmm6
+ vmovups %xmm7,-16(%r13)
+ subq 104(%rsp),%r13
+ vmovdqu %xmm11,80(%rbp)
+ vpxor %xmm11,%xmm15,%xmm7
+ vmovups %xmm8,-16(%r14)
+ subq 112(%rsp),%r14
+ vmovdqu %xmm12,96(%rbp)
+ vpxor %xmm12,%xmm15,%xmm8
+ vmovups %xmm9,-16(%r15)
+ subq 120(%rsp),%r15
+ vmovdqu %xmm13,112(%rbp)
+ vpxor %xmm13,%xmm15,%xmm9
+
+ xorq $128,%rbp
+ decl %edx
+ jnz .Loop_dec8x
+
+ movq 16(%rsp),%rax
+.cfi_def_cfa %rax,8
+
+
+
+
+
+.Ldec8x_done:
+ vzeroupper
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Ldec8x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
diff --git a/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S b/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S
index 294db310a06a..38f306142c82 100644
--- a/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S
+++ b/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S
@@ -13,6 +13,11 @@ aesni_cbc_sha1_enc:
movq OPENSSL_ia32cap_P+4(%rip),%r11
btq $61,%r11
jc aesni_cbc_sha1_enc_shaext
+ andl $268435456,%r11d
+ andl $1073741824,%r10d
+ orl %r11d,%r10d
+ cmpl $1342177280,%r10d
+ je aesni_cbc_sha1_enc_avx
jmp aesni_cbc_sha1_enc_ssse3
.byte 0xf3,0xc3
.cfi_endproc
@@ -1394,6 +1399,1327 @@ aesni_cbc_sha1_enc_ssse3:
.byte 0xf3,0xc3
.cfi_endproc
.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+.type aesni_cbc_sha1_enc_avx,@function
+.align 32
+aesni_cbc_sha1_enc_avx:
+.cfi_startproc
+ movq 8(%rsp),%r10
+
+
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ leaq -104(%rsp),%rsp
+.cfi_adjust_cfa_offset 104
+
+
+ vzeroall
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ leaq 112(%rcx),%r15
+ vmovdqu (%r8),%xmm12
+ movq %r8,88(%rsp)
+ shlq $6,%r14
+ subq %r12,%r13
+ movl 240-112(%r15),%r8d
+ addq %r10,%r14
+
+ leaq K_XX_XX(%rip),%r11
+ movl 0(%r9),%eax
+ movl 4(%r9),%ebx
+ movl 8(%r9),%ecx
+ movl 12(%r9),%edx
+ movl %ebx,%esi
+ movl 16(%r9),%ebp
+ movl %ecx,%edi
+ xorl %edx,%edi
+ andl %edi,%esi
+
+ vmovdqa 64(%r11),%xmm6
+ vmovdqa 0(%r11),%xmm10
+ vmovdqu 0(%r10),%xmm0
+ vmovdqu 16(%r10),%xmm1
+ vmovdqu 32(%r10),%xmm2
+ vmovdqu 48(%r10),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r10
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm10,%xmm0,%xmm4
+ vpaddd %xmm10,%xmm1,%xmm5
+ vpaddd %xmm10,%xmm2,%xmm6
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ vmovups -112(%r15),%xmm15
+ vmovups 16-112(%r15),%xmm14
+ jmp .Loop_avx
+.align 32
+.Loop_avx:
+ shrdl $2,%ebx,%ebx
+ vmovdqu 0(%r12),%xmm13
+ vpxor %xmm15,%xmm13,%xmm13
+ vpxor %xmm13,%xmm12,%xmm12
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -80(%r15),%xmm15
+ xorl %edx,%esi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ movl %eax,%edi
+ addl 0(%rsp),%ebp
+ vpaddd %xmm3,%xmm10,%xmm9
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrldq $4,%xmm3,%xmm8
+ addl %esi,%ebp
+ andl %ebx,%edi
+ vpxor %xmm0,%xmm4,%xmm4
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm2,%xmm8,%xmm8
+ shrdl $7,%eax,%eax
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 4(%rsp),%edx
+ vpxor %xmm8,%xmm4,%xmm4
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vmovdqa %xmm9,48(%rsp)
+ addl %edi,%edx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups -64(%r15),%xmm14
+ andl %eax,%esi
+ vpsrld $31,%xmm4,%xmm8
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%esi
+ vpslldq $12,%xmm4,%xmm9
+ vpaddd %xmm4,%xmm4,%xmm4
+ movl %edx,%edi
+ addl 8(%rsp),%ecx
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpor %xmm8,%xmm4,%xmm4
+ vpsrld $30,%xmm9,%xmm8
+ addl %esi,%ecx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpslld $2,%xmm9,%xmm9
+ vpxor %xmm8,%xmm4,%xmm4
+ shrdl $7,%edx,%edx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 12(%rsp),%ebx
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -48(%r15),%xmm15
+ vpxor %xmm9,%xmm4,%xmm4
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ andl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%esi
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ movl %ebx,%edi
+ addl 16(%rsp),%eax
+ vpaddd %xmm4,%xmm10,%xmm9
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrldq $4,%xmm4,%xmm8
+ addl %esi,%eax
+ andl %ecx,%edi
+ vpxor %xmm1,%xmm5,%xmm5
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm3,%xmm8,%xmm8
+ shrdl $7,%ebx,%ebx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups -32(%r15),%xmm14
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 20(%rsp),%ebp
+ vpxor %xmm8,%xmm5,%xmm5
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%ebp
+ andl %ebx,%esi
+ vpsrld $31,%xmm5,%xmm8
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ vpslldq $12,%xmm5,%xmm9
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %ebp,%edi
+ addl 24(%rsp),%edx
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vpor %xmm8,%xmm5,%xmm5
+ vpsrld $30,%xmm9,%xmm8
+ addl %esi,%edx
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -16(%r15),%xmm15
+ andl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm9,%xmm9
+ vpxor %xmm8,%xmm5,%xmm5
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%edi
+ movl %edx,%esi
+ addl 28(%rsp),%ecx
+ vpxor %xmm9,%xmm5,%xmm5
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vmovdqa 16(%r11),%xmm10
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ movl %ecx,%edi
+ addl 32(%rsp),%ebx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 0(%r15),%xmm14
+ vpaddd %xmm5,%xmm10,%xmm9
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ vpsrldq $4,%xmm5,%xmm8
+ addl %esi,%ebx
+ andl %edx,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ vpxor %xmm4,%xmm8,%xmm8
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ addl 36(%rsp),%eax
+ vpxor %xmm8,%xmm6,%xmm6
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vmovdqa %xmm9,16(%rsp)
+ addl %edi,%eax
+ andl %ecx,%esi
+ vpsrld $31,%xmm6,%xmm8
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ shrdl $7,%ebx,%ebx
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 16(%r15),%xmm15
+ xorl %edx,%esi
+ vpslldq $12,%xmm6,%xmm9
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%edi
+ addl 40(%rsp),%ebp
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpor %xmm8,%xmm6,%xmm6
+ vpsrld $30,%xmm9,%xmm8
+ addl %esi,%ebp
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpslld $2,%xmm9,%xmm9
+ vpxor %xmm8,%xmm6,%xmm6
+ shrdl $7,%eax,%eax
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 44(%rsp),%edx
+ vpxor %xmm9,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 32(%r15),%xmm14
+ andl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%esi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ movl %edx,%edi
+ addl 48(%rsp),%ecx
+ vpaddd %xmm6,%xmm10,%xmm9
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpsrldq $4,%xmm6,%xmm8
+ addl %esi,%ecx
+ andl %ebp,%edi
+ vpxor %xmm3,%xmm7,%xmm7
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpxor %xmm5,%xmm8,%xmm8
+ shrdl $7,%edx,%edx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 52(%rsp),%ebx
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 48(%r15),%xmm15
+ vpxor %xmm8,%xmm7,%xmm7
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%ebx
+ andl %edx,%esi
+ vpsrld $31,%xmm7,%xmm8
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%esi
+ vpslldq $12,%xmm7,%xmm9
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%edi
+ addl 56(%rsp),%eax
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpor %xmm8,%xmm7,%xmm7
+ vpsrld $30,%xmm9,%xmm8
+ addl %esi,%eax
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm9,%xmm9
+ vpxor %xmm8,%xmm7,%xmm7
+ shrdl $7,%ebx,%ebx
+ cmpl $11,%r8d
+ jb .Lvaesenclast6
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 64(%r15),%xmm14
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 80(%r15),%xmm15
+ je .Lvaesenclast6
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 96(%r15),%xmm14
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 112(%r15),%xmm15
+.Lvaesenclast6:
+ vaesenclast %xmm15,%xmm12,%xmm12
+ vmovups -112(%r15),%xmm15
+ vmovups 16-112(%r15),%xmm14
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 60(%rsp),%ebp
+ vpxor %xmm9,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ addl 0(%rsp),%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm7,%xmm10,%xmm9
+ addl %esi,%edx
+ vmovdqu 16(%r12),%xmm13
+ vpxor %xmm15,%xmm13,%xmm13
+ vmovups %xmm12,0(%r12,%r13,1)
+ vpxor %xmm13,%xmm12,%xmm12
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -80(%r15),%xmm15
+ andl %eax,%edi
+ vpxor %xmm8,%xmm0,%xmm0
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%edi
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ movl %edx,%esi
+ addl 4(%rsp),%ecx
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpslld $2,%xmm0,%xmm0
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ addl 8(%rsp),%ebx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups -64(%r15),%xmm14
+ vpor %xmm8,%xmm0,%xmm0
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -48(%r15),%xmm15
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ vpaddd %xmm0,%xmm10,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm8,%xmm1,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm1,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups -32(%r15),%xmm14
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpor %xmm8,%xmm1,%xmm1
+ addl 28(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ addl %esi,%eax
+ xorl %edx,%edi
+ vpaddd %xmm1,%xmm10,%xmm9
+ vmovdqa 32(%r11),%xmm10
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm8,%xmm2,%xmm2
+ addl 36(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -16(%r15),%xmm15
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpor %xmm8,%xmm2,%xmm2
+ addl 44(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 0(%r15),%xmm14
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ vpaddd %xmm2,%xmm10,%xmm9
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 16(%r15),%xmm15
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ vpxor %xmm0,%xmm4,%xmm4
+ addl 0(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %esi,%ecx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 32(%r15),%xmm14
+ xorl %eax,%edi
+ vpaddd %xmm3,%xmm10,%xmm9
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpxor %xmm8,%xmm4,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpor %xmm8,%xmm4,%xmm4
+ addl 12(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 48(%r15),%xmm15
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpxor %xmm6,%xmm5,%xmm5
+ addl %esi,%edx
+ xorl %ebx,%edi
+ vpaddd %xmm4,%xmm10,%xmm9
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpxor %xmm8,%xmm5,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%ecx
+ cmpl $11,%r8d
+ jb .Lvaesenclast7
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 64(%r15),%xmm14
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 80(%r15),%xmm15
+ je .Lvaesenclast7
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 96(%r15),%xmm14
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 112(%r15),%xmm15
+.Lvaesenclast7:
+ vaesenclast %xmm15,%xmm12,%xmm12
+ vmovups -112(%r15),%xmm15
+ vmovups 16-112(%r15),%xmm14
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpor %xmm8,%xmm5,%xmm5
+ addl 28(%rsp),%eax
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ vpxor %xmm2,%xmm6,%xmm6
+ addl 32(%rsp),%ebp
+ vmovdqu 32(%r12),%xmm13
+ vpxor %xmm15,%xmm13,%xmm13
+ vmovups %xmm12,16(%r13,%r12,1)
+ vpxor %xmm13,%xmm12,%xmm12
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -80(%r15),%xmm15
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %eax,%edi
+ xorl %ecx,%esi
+ vpaddd %xmm5,%xmm10,%xmm9
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ vpxor %xmm8,%xmm6,%xmm6
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 36(%rsp),%edx
+ vpsrld $30,%xmm6,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%esi
+ vpslld $2,%xmm6,%xmm6
+ xorl %ebx,%edi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups -64(%r15),%xmm14
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 40(%rsp),%ecx
+ andl %eax,%esi
+ vpor %xmm8,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%edi
+ xorl %eax,%esi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 44(%rsp),%ebx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -48(%r15),%xmm15
+ movl %ecx,%esi
+ xorl %ebp,%edi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ vpxor %xmm3,%xmm7,%xmm7
+ addl 48(%rsp),%eax
+ andl %edx,%esi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ vpxor %xmm0,%xmm7,%xmm7
+ movl %ebx,%edi
+ xorl %edx,%esi
+ vpaddd %xmm6,%xmm10,%xmm9
+ vmovdqa 48(%r11),%xmm10
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vpxor %xmm8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%rsp),%ebp
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups -32(%r15),%xmm14
+ vpsrld $30,%xmm7,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ vpslld $2,%xmm7,%xmm7
+ xorl %ecx,%edi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 56(%rsp),%edx
+ andl %ebx,%esi
+ vpor %xmm8,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -16(%r15),%xmm15
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 60(%rsp),%ecx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%esi
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ addl 0(%rsp),%ebx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 0(%r15),%xmm14
+ vpxor %xmm1,%xmm0,%xmm0
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ vpaddd %xmm7,%xmm10,%xmm9
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ vpxor %xmm8,%xmm0,%xmm0
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 4(%rsp),%eax
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ andl %edx,%edi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ vpslld $2,%xmm0,%xmm0
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 16(%r15),%xmm15
+ andl %ecx,%esi
+ vpor %xmm8,%xmm0,%xmm0
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%edi
+ xorl %ecx,%esi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 12(%rsp),%edx
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%esi
+ xorl %ebx,%edi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 32(%r15),%xmm14
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ vpxor %xmm2,%xmm1,%xmm1
+ movl %edx,%edi
+ xorl %eax,%esi
+ vpaddd %xmm0,%xmm10,%xmm9
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vpxor %xmm8,%xmm1,%xmm1
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 20(%rsp),%ebx
+ vpsrld $30,%xmm1,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 48(%r15),%xmm15
+ movl %ecx,%esi
+ vpslld $2,%xmm1,%xmm1
+ xorl %ebp,%edi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 24(%rsp),%eax
+ andl %edx,%esi
+ vpor %xmm8,%xmm1,%xmm1
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%edi
+ xorl %edx,%esi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%rsp),%ebp
+ cmpl $11,%r8d
+ jb .Lvaesenclast8
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 64(%r15),%xmm14
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 80(%r15),%xmm15
+ je .Lvaesenclast8
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 96(%r15),%xmm14
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 112(%r15),%xmm15
+.Lvaesenclast8:
+ vaesenclast %xmm15,%xmm12,%xmm12
+ vmovups -112(%r15),%xmm15
+ vmovups 16-112(%r15),%xmm14
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ xorl %ecx,%edi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vpxor %xmm3,%xmm2,%xmm2
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ vpaddd %xmm1,%xmm10,%xmm9
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ vmovdqu 48(%r12),%xmm13
+ vpxor %xmm15,%xmm13,%xmm13
+ vmovups %xmm12,32(%r13,%r12,1)
+ vpxor %xmm13,%xmm12,%xmm12
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -80(%r15),%xmm15
+ vpxor %xmm8,%xmm2,%xmm2
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 36(%rsp),%ecx
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ andl %eax,%edi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%esi
+ vpslld $2,%xmm2,%xmm2
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 40(%rsp),%ebx
+ andl %ebp,%esi
+ vpor %xmm8,%xmm2,%xmm2
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups -64(%r15),%xmm14
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 44(%rsp),%eax
+ andl %edx,%edi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -48(%r15),%xmm15
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ vpaddd %xmm2,%xmm10,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups -32(%r15),%xmm14
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 0(%rsp),%eax
+ vpaddd %xmm3,%xmm10,%xmm9
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vmovdqa %xmm9,48(%rsp)
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 4(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -16(%r15),%xmm15
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 8(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 12(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 0(%r15),%xmm14
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ cmpq %r14,%r10
+ je .Ldone_avx
+ vmovdqa 64(%r11),%xmm9
+ vmovdqa 0(%r11),%xmm10
+ vmovdqu 0(%r10),%xmm0
+ vmovdqu 16(%r10),%xmm1
+ vmovdqu 32(%r10),%xmm2
+ vmovdqu 48(%r10),%xmm3
+ vpshufb %xmm9,%xmm0,%xmm0
+ addq $64,%r10
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ vpshufb %xmm9,%xmm1,%xmm1
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm10,%xmm0,%xmm8
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vmovdqa %xmm8,0(%rsp)
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 16(%r15),%xmm15
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ vpshufb %xmm9,%xmm2,%xmm2
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpaddd %xmm10,%xmm1,%xmm8
+ addl %esi,%ecx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 32(%r15),%xmm14
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vmovdqa %xmm8,16(%rsp)
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 48(%r15),%xmm15
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ vpshufb %xmm9,%xmm3,%xmm3
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm10,%xmm2,%xmm8
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vmovdqa %xmm8,32(%rsp)
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ cmpl $11,%r8d
+ jb .Lvaesenclast9
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 64(%r15),%xmm14
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 80(%r15),%xmm15
+ je .Lvaesenclast9
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 96(%r15),%xmm14
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 112(%r15),%xmm15
+.Lvaesenclast9:
+ vaesenclast %xmm15,%xmm12,%xmm12
+ vmovups -112(%r15),%xmm15
+ vmovups 16-112(%r15),%xmm14
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vmovups %xmm12,48(%r13,%r12,1)
+ leaq 64(%r12),%r12
+
+ addl 0(%r9),%eax
+ addl 4(%r9),%esi
+ addl 8(%r9),%ecx
+ addl 12(%r9),%edx
+ movl %eax,0(%r9)
+ addl 16(%r9),%ebp
+ movl %esi,4(%r9)
+ movl %esi,%ebx
+ movl %ecx,8(%r9)
+ movl %ecx,%edi
+ movl %edx,12(%r9)
+ xorl %edx,%edi
+ movl %ebp,16(%r9)
+ andl %edi,%esi
+ jmp .Loop_avx
+
+.Ldone_avx:
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 16(%r15),%xmm15
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 32(%r15),%xmm14
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 48(%r15),%xmm15
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ cmpl $11,%r8d
+ jb .Lvaesenclast10
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 64(%r15),%xmm14
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 80(%r15),%xmm15
+ je .Lvaesenclast10
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 96(%r15),%xmm14
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 112(%r15),%xmm15
+.Lvaesenclast10:
+ vaesenclast %xmm15,%xmm12,%xmm12
+ vmovups -112(%r15),%xmm15
+ vmovups 16-112(%r15),%xmm14
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vmovups %xmm12,48(%r13,%r12,1)
+ movq 88(%rsp),%r8
+
+ addl 0(%r9),%eax
+ addl 4(%r9),%esi
+ addl 8(%r9),%ecx
+ movl %eax,0(%r9)
+ addl 12(%r9),%edx
+ movl %esi,4(%r9)
+ addl 16(%r9),%ebp
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+ movl %ebp,16(%r9)
+ vmovups %xmm12,(%r8)
+ vzeroall
+ leaq 104(%rsp),%rsi
+.cfi_def_cfa %rsi,56
+ movq 0(%rsi),%r15
+.cfi_restore %r15
+ movq 8(%rsi),%r14
+.cfi_restore %r14
+ movq 16(%rsi),%r13
+.cfi_restore %r13
+ movq 24(%rsi),%r12
+.cfi_restore %r12
+ movq 32(%rsi),%rbp
+.cfi_restore %rbp
+ movq 40(%rsi),%rbx
+.cfi_restore %rbx
+ leaq 48(%rsi),%rsp
+.cfi_def_cfa %rsp,8
+.Lepilogue_avx:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
.align 64
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
@@ -1485,17 +2811,17 @@ aesni_cbc_sha1_enc_shaext:
pxor %xmm3,%xmm5
.byte 15,56,201,243
cmpl $11,%r11d
- jb .Laesenclast6
+ jb .Laesenclast11
movups 64(%rcx),%xmm0
.byte 102,15,56,220,209
movups 80(%rcx),%xmm1
.byte 102,15,56,220,208
- je .Laesenclast6
+ je .Laesenclast11
movups 96(%rcx),%xmm0
.byte 102,15,56,220,209
movups 112(%rcx),%xmm1
.byte 102,15,56,220,208
-.Laesenclast6:
+.Laesenclast11:
.byte 102,15,56,221,209
movups 16-112(%rcx),%xmm0
movdqa %xmm8,%xmm10
@@ -1551,17 +2877,17 @@ aesni_cbc_sha1_enc_shaext:
pxor %xmm4,%xmm6
.byte 15,56,201,220
cmpl $11,%r11d
- jb .Laesenclast7
+ jb .Laesenclast12
movups 64(%rcx),%xmm0
.byte 102,15,56,220,209
movups 80(%rcx),%xmm1
.byte 102,15,56,220,208
- je .Laesenclast7
+ je .Laesenclast12
movups 96(%rcx),%xmm0
.byte 102,15,56,220,209
movups 112(%rcx),%xmm1
.byte 102,15,56,220,208
-.Laesenclast7:
+.Laesenclast12:
.byte 102,15,56,221,209
movups 16-112(%rcx),%xmm0
movdqa %xmm8,%xmm9
@@ -1617,17 +2943,17 @@ aesni_cbc_sha1_enc_shaext:
pxor %xmm5,%xmm3
.byte 15,56,201,229
cmpl $11,%r11d
- jb .Laesenclast8
+ jb .Laesenclast13
movups 64(%rcx),%xmm0
.byte 102,15,56,220,209
movups 80(%rcx),%xmm1
.byte 102,15,56,220,208
- je .Laesenclast8
+ je .Laesenclast13
movups 96(%rcx),%xmm0
.byte 102,15,56,220,209
movups 112(%rcx),%xmm1
.byte 102,15,56,220,208
-.Laesenclast8:
+.Laesenclast13:
.byte 102,15,56,221,209
movups 16-112(%rcx),%xmm0
movdqa %xmm8,%xmm10
@@ -1681,17 +3007,17 @@ aesni_cbc_sha1_enc_shaext:
movups 48(%rcx),%xmm1
.byte 102,15,56,220,208
cmpl $11,%r11d
- jb .Laesenclast9
+ jb .Laesenclast14
movups 64(%rcx),%xmm0
.byte 102,15,56,220,209
movups 80(%rcx),%xmm1
.byte 102,15,56,220,208
- je .Laesenclast9
+ je .Laesenclast14
movups 96(%rcx),%xmm0
.byte 102,15,56,220,209
movups 112(%rcx),%xmm1
.byte 102,15,56,220,208
-.Laesenclast9:
+.Laesenclast14:
.byte 102,15,56,221,209
movups 16-112(%rcx),%xmm0
decq %rdx
diff --git a/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S b/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S
index e42a02ebe647..cb9e150db553 100644
--- a/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S
+++ b/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S
@@ -8,6 +8,25 @@
.align 16
aesni_cbc_sha256_enc:
.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%r11
+ movl $1,%eax
+ cmpq $0,%rdi
+ je .Lprobe
+ movl 0(%r11),%eax
+ movq 4(%r11),%r10
+ btq $61,%r10
+ jc aesni_cbc_sha256_enc_shaext
+ movq %r10,%r11
+ shrq $32,%r11
+
+ testl $2048,%r10d
+ jnz aesni_cbc_sha256_enc_xop
+ andl $296,%r11d
+ cmpl $296,%r11d
+ je aesni_cbc_sha256_enc_avx2
+ andl $268435456,%r10d
+ jnz aesni_cbc_sha256_enc_avx
+ ud2
xorl %eax,%eax
cmpq $0,%rdi
je .Lprobe
@@ -59,3 +78,4360 @@ K256:
.long 0,0,0,0, 0,0,0,0
.byte 65,69,83,78,73,45,67,66,67,43,83,72,65,50,53,54,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
+.type aesni_cbc_sha256_enc_xop,@function
+.align 64
+aesni_cbc_sha256_enc_xop:
+.cfi_startproc
+.Lxop_shortcut:
+ movq 8(%rsp),%r10
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ subq $128,%rsp
+ andq $-64,%rsp
+
+ shlq $6,%rdx
+ subq %rdi,%rsi
+ subq %rdi,%r10
+ addq %rdi,%rdx
+
+
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+
+ movq %r8,64+32(%rsp)
+ movq %r9,64+40(%rsp)
+ movq %r10,64+48(%rsp)
+ movq %rax,120(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
+.Lprologue_xop:
+ vzeroall
+
+ movq %rdi,%r12
+ leaq 128(%rcx),%rdi
+ leaq K256+544(%rip),%r13
+ movl 240-128(%rdi),%r14d
+ movq %r9,%r15
+ movq %r10,%rsi
+ vmovdqu (%r8),%xmm8
+ subq $9,%r14
+
+ movl 0(%r15),%eax
+ movl 4(%r15),%ebx
+ movl 8(%r15),%ecx
+ movl 12(%r15),%edx
+ movl 16(%r15),%r8d
+ movl 20(%r15),%r9d
+ movl 24(%r15),%r10d
+ movl 28(%r15),%r11d
+
+ vmovdqa 0(%r13,%r14,8),%xmm14
+ vmovdqa 16(%r13,%r14,8),%xmm13
+ vmovdqa 32(%r13,%r14,8),%xmm12
+ vmovdqu 0-128(%rdi),%xmm10
+ jmp .Lloop_xop
+.align 16
+.Lloop_xop:
+ vmovdqa K256+512(%rip),%xmm7
+ vmovdqu 0(%rsi,%r12,1),%xmm0
+ vmovdqu 16(%rsi,%r12,1),%xmm1
+ vmovdqu 32(%rsi,%r12,1),%xmm2
+ vmovdqu 48(%rsi,%r12,1),%xmm3
+ vpshufb %xmm7,%xmm0,%xmm0
+ leaq K256(%rip),%rbp
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd 0(%rbp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 32(%rbp),%xmm1,%xmm5
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ vpaddd 96(%rbp),%xmm3,%xmm7
+ vmovdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ vmovdqa %xmm5,16(%rsp)
+ movl %ebx,%esi
+ vmovdqa %xmm6,32(%rsp)
+ xorl %ecx,%esi
+ vmovdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp .Lxop_00_47
+
+.align 16
+.Lxop_00_47:
+ subq $-32*4,%rbp
+ vmovdqu (%r12),%xmm9
+ movq %r12,64+0(%rsp)
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ rorl $14,%r13d
+ movl %r14d,%eax
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ movl %r9d,%r12d
+ xorl %r8d,%r13d
+.byte 143,232,120,194,236,14
+ rorl $9,%r14d
+ xorl %r10d,%r12d
+ vpsrld $3,%xmm4,%xmm4
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ vpaddd %xmm7,%xmm0,%xmm0
+ andl %r8d,%r12d
+ vpxor %xmm10,%xmm9,%xmm9
+ vmovdqu 16-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+.byte 143,232,120,194,245,11
+ rorl $11,%r14d
+ xorl %r10d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+.byte 143,232,120,194,251,13
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ebx,%esi
+ addl %r11d,%edx
+ vpsrld $10,%xmm3,%xmm6
+ rorl $2,%r14d
+ addl %esi,%r11d
+ vpaddd %xmm4,%xmm0,%xmm0
+ movl %edx,%r13d
+ addl %r11d,%r14d
+.byte 143,232,120,194,239,2
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vpxor %xmm8,%xmm9,%xmm9
+ xorl %edx,%r13d
+ vpsrldq $8,%xmm7,%xmm7
+ addl 4(%rsp),%r10d
+ movl %r11d,%esi
+ rorl $11,%r14d
+ xorl %r9d,%r12d
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %eax,%esi
+ rorl $6,%r13d
+ addl %r12d,%r10d
+ andl %esi,%r15d
+.byte 143,232,120,194,248,13
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ vpsrld $10,%xmm0,%xmm6
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+.byte 143,232,120,194,239,2
+ rorl $2,%r14d
+ addl %r15d,%r10d
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm5,%xmm7,%xmm7
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r12d
+ vpslldq $8,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ vpaddd %xmm7,%xmm0,%xmm0
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ rorl $11,%r14d
+ xorl %r8d,%r12d
+ vpaddd 0(%rbp),%xmm0,%xmm6
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r9d
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ rorl $2,%r14d
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%esi
+ rorl $11,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ rorl $6,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ rorl $2,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,0(%rsp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ movl %ebx,%r12d
+ xorl %eax,%r13d
+.byte 143,232,120,194,236,14
+ rorl $9,%r14d
+ xorl %ecx,%r12d
+ vpsrld $3,%xmm4,%xmm4
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ vpaddd %xmm7,%xmm1,%xmm1
+ andl %eax,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+.byte 143,232,120,194,245,11
+ rorl $11,%r14d
+ xorl %ecx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+.byte 143,232,120,194,248,13
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r9d,%esi
+ addl %edx,%r11d
+ vpsrld $10,%xmm0,%xmm6
+ rorl $2,%r14d
+ addl %esi,%edx
+ vpaddd %xmm4,%xmm1,%xmm1
+ movl %r11d,%r13d
+ addl %edx,%r14d
+.byte 143,232,120,194,239,2
+ rorl $14,%r13d
+ movl %r14d,%edx
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ vpsrldq $8,%xmm7,%xmm7
+ addl 20(%rsp),%ecx
+ movl %edx,%esi
+ rorl $11,%r14d
+ xorl %ebx,%r12d
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %r8d,%esi
+ rorl $6,%r13d
+ addl %r12d,%ecx
+ andl %esi,%r15d
+.byte 143,232,120,194,249,13
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ vpsrld $10,%xmm1,%xmm6
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+.byte 143,232,120,194,239,2
+ rorl $2,%r14d
+ addl %r15d,%ecx
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm5,%xmm7,%xmm7
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r12d
+ vpslldq $8,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ vpaddd %xmm7,%xmm1,%xmm1
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ rorl $11,%r14d
+ xorl %eax,%r12d
+ vpaddd 32(%rbp),%xmm1,%xmm6
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%ebx
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ rorl $2,%r14d
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%esi
+ rorl $11,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ rorl $6,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ rorl $2,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,16(%rsp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ rorl $14,%r13d
+ movl %r14d,%eax
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ movl %r9d,%r12d
+ xorl %r8d,%r13d
+.byte 143,232,120,194,236,14
+ rorl $9,%r14d
+ xorl %r10d,%r12d
+ vpsrld $3,%xmm4,%xmm4
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ vpaddd %xmm7,%xmm2,%xmm2
+ andl %r8d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+.byte 143,232,120,194,245,11
+ rorl $11,%r14d
+ xorl %r10d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+.byte 143,232,120,194,249,13
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ebx,%esi
+ addl %r11d,%edx
+ vpsrld $10,%xmm1,%xmm6
+ rorl $2,%r14d
+ addl %esi,%r11d
+ vpaddd %xmm4,%xmm2,%xmm2
+ movl %edx,%r13d
+ addl %r11d,%r14d
+.byte 143,232,120,194,239,2
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %edx,%r13d
+ vpsrldq $8,%xmm7,%xmm7
+ addl 36(%rsp),%r10d
+ movl %r11d,%esi
+ rorl $11,%r14d
+ xorl %r9d,%r12d
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %eax,%esi
+ rorl $6,%r13d
+ addl %r12d,%r10d
+ andl %esi,%r15d
+.byte 143,232,120,194,250,13
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ vpsrld $10,%xmm2,%xmm6
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+.byte 143,232,120,194,239,2
+ rorl $2,%r14d
+ addl %r15d,%r10d
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm5,%xmm7,%xmm7
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r12d
+ vpslldq $8,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ vpaddd %xmm7,%xmm2,%xmm2
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ rorl $11,%r14d
+ xorl %r8d,%r12d
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r9d
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ rorl $2,%r14d
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%esi
+ rorl $11,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ rorl $6,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ rorl $2,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,32(%rsp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ movl %ebx,%r12d
+ xorl %eax,%r13d
+.byte 143,232,120,194,236,14
+ rorl $9,%r14d
+ xorl %ecx,%r12d
+ vpsrld $3,%xmm4,%xmm4
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ vpaddd %xmm7,%xmm3,%xmm3
+ andl %eax,%r12d
+ vpand %xmm12,%xmm11,%xmm8
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+.byte 143,232,120,194,245,11
+ rorl $11,%r14d
+ xorl %ecx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+.byte 143,232,120,194,250,13
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r9d,%esi
+ addl %edx,%r11d
+ vpsrld $10,%xmm2,%xmm6
+ rorl $2,%r14d
+ addl %esi,%edx
+ vpaddd %xmm4,%xmm3,%xmm3
+ movl %r11d,%r13d
+ addl %edx,%r14d
+.byte 143,232,120,194,239,2
+ rorl $14,%r13d
+ movl %r14d,%edx
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ vpsrldq $8,%xmm7,%xmm7
+ addl 52(%rsp),%ecx
+ movl %edx,%esi
+ rorl $11,%r14d
+ xorl %ebx,%r12d
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %r8d,%esi
+ rorl $6,%r13d
+ addl %r12d,%ecx
+ andl %esi,%r15d
+.byte 143,232,120,194,251,13
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ vpsrld $10,%xmm3,%xmm6
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+.byte 143,232,120,194,239,2
+ rorl $2,%r14d
+ addl %r15d,%ecx
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm5,%xmm7,%xmm7
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r12d
+ vpslldq $8,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ vpand %xmm13,%xmm11,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ vpaddd %xmm7,%xmm3,%xmm3
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ rorl $11,%r14d
+ xorl %eax,%r12d
+ vpaddd 96(%rbp),%xmm3,%xmm6
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%ebx
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ rorl $2,%r14d
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpor %xmm11,%xmm8,%xmm8
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vmovdqu 0-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%esi
+ rorl $11,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ rorl $6,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ rorl $2,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,48(%rsp)
+ movq 64+0(%rsp),%r12
+ vpand %xmm14,%xmm11,%xmm11
+ movq 64+8(%rsp),%r15
+ vpor %xmm11,%xmm8,%xmm8
+ vmovdqu %xmm8,(%r15,%r12,1)
+ leaq 16(%r12),%r12
+ cmpb $0,131(%rbp)
+ jne .Lxop_00_47
+ vmovdqu (%r12),%xmm9
+ movq %r12,64+0(%rsp)
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpxor %xmm10,%xmm9,%xmm9
+ vmovdqu 16-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ rorl $11,%r14d
+ xorl %r10d,%r12d
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%esi
+ addl %r11d,%edx
+ rorl $2,%r14d
+ addl %esi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vpxor %xmm8,%xmm9,%xmm9
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%esi
+ rorl $11,%r14d
+ xorl %r9d,%r12d
+ xorl %eax,%esi
+ rorl $6,%r13d
+ addl %r12d,%r10d
+ andl %esi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+ rorl $2,%r14d
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ rorl $11,%r14d
+ xorl %r8d,%r12d
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r9d
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ rorl $2,%r14d
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%esi
+ rorl $11,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ rorl $6,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ rorl $2,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ rorl $11,%r14d
+ xorl %ecx,%r12d
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%esi
+ addl %edx,%r11d
+ rorl $2,%r14d
+ addl %esi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%esi
+ rorl $11,%r14d
+ xorl %ebx,%r12d
+ xorl %r8d,%esi
+ rorl $6,%r13d
+ addl %r12d,%ecx
+ andl %esi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+ rorl $2,%r14d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ rorl $11,%r14d
+ xorl %eax,%r12d
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%ebx
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ rorl $2,%r14d
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%esi
+ rorl $11,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ rorl $6,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ rorl $2,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ rorl $11,%r14d
+ xorl %r10d,%r12d
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%esi
+ addl %r11d,%edx
+ rorl $2,%r14d
+ addl %esi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%esi
+ rorl $11,%r14d
+ xorl %r9d,%r12d
+ xorl %eax,%esi
+ rorl $6,%r13d
+ addl %r12d,%r10d
+ andl %esi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+ rorl $2,%r14d
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ rorl $11,%r14d
+ xorl %r8d,%r12d
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r9d
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ rorl $2,%r14d
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%esi
+ rorl $11,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ rorl $6,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ rorl $2,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpand %xmm12,%xmm11,%xmm8
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ rorl $11,%r14d
+ xorl %ecx,%r12d
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%esi
+ addl %edx,%r11d
+ rorl $2,%r14d
+ addl %esi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%esi
+ rorl $11,%r14d
+ xorl %ebx,%r12d
+ xorl %r8d,%esi
+ rorl $6,%r13d
+ addl %r12d,%ecx
+ andl %esi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+ rorl $2,%r14d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ vpand %xmm13,%xmm11,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ rorl $11,%r14d
+ xorl %eax,%r12d
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%ebx
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ rorl $2,%r14d
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpor %xmm11,%xmm8,%xmm8
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vmovdqu 0-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%esi
+ rorl $11,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ rorl $6,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ rorl $2,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%r12
+ movq 64+8(%rsp),%r13
+ movq 64+40(%rsp),%r15
+ movq 64+48(%rsp),%rsi
+
+ vpand %xmm14,%xmm11,%xmm11
+ movl %r14d,%eax
+ vpor %xmm11,%xmm8,%xmm8
+ vmovdqu %xmm8,(%r12,%r13,1)
+ leaq 16(%r12),%r12
+
+ addl 0(%r15),%eax
+ addl 4(%r15),%ebx
+ addl 8(%r15),%ecx
+ addl 12(%r15),%edx
+ addl 16(%r15),%r8d
+ addl 20(%r15),%r9d
+ addl 24(%r15),%r10d
+ addl 28(%r15),%r11d
+
+ cmpq 64+16(%rsp),%r12
+
+ movl %eax,0(%r15)
+ movl %ebx,4(%r15)
+ movl %ecx,8(%r15)
+ movl %edx,12(%r15)
+ movl %r8d,16(%r15)
+ movl %r9d,20(%r15)
+ movl %r10d,24(%r15)
+ movl %r11d,28(%r15)
+
+ jb .Lloop_xop
+
+ movq 64+32(%rsp),%r8
+ movq 120(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ vmovdqu %xmm8,(%r8)
+ vzeroall
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_xop:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_cbc_sha256_enc_xop,.-aesni_cbc_sha256_enc_xop
+.type aesni_cbc_sha256_enc_avx,@function
+.align 64
+aesni_cbc_sha256_enc_avx:
+.cfi_startproc
+.Lavx_shortcut:
+ movq 8(%rsp),%r10
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ subq $128,%rsp
+ andq $-64,%rsp
+
+ shlq $6,%rdx
+ subq %rdi,%rsi
+ subq %rdi,%r10
+ addq %rdi,%rdx
+
+
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+
+ movq %r8,64+32(%rsp)
+ movq %r9,64+40(%rsp)
+ movq %r10,64+48(%rsp)
+ movq %rax,120(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
+.Lprologue_avx:
+ vzeroall
+
+ movq %rdi,%r12
+ leaq 128(%rcx),%rdi
+ leaq K256+544(%rip),%r13
+ movl 240-128(%rdi),%r14d
+ movq %r9,%r15
+ movq %r10,%rsi
+ vmovdqu (%r8),%xmm8
+ subq $9,%r14
+
+ movl 0(%r15),%eax
+ movl 4(%r15),%ebx
+ movl 8(%r15),%ecx
+ movl 12(%r15),%edx
+ movl 16(%r15),%r8d
+ movl 20(%r15),%r9d
+ movl 24(%r15),%r10d
+ movl 28(%r15),%r11d
+
+ vmovdqa 0(%r13,%r14,8),%xmm14
+ vmovdqa 16(%r13,%r14,8),%xmm13
+ vmovdqa 32(%r13,%r14,8),%xmm12
+ vmovdqu 0-128(%rdi),%xmm10
+ jmp .Lloop_avx
+.align 16
+.Lloop_avx:
+ vmovdqa K256+512(%rip),%xmm7
+ vmovdqu 0(%rsi,%r12,1),%xmm0
+ vmovdqu 16(%rsi,%r12,1),%xmm1
+ vmovdqu 32(%rsi,%r12,1),%xmm2
+ vmovdqu 48(%rsi,%r12,1),%xmm3
+ vpshufb %xmm7,%xmm0,%xmm0
+ leaq K256(%rip),%rbp
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd 0(%rbp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 32(%rbp),%xmm1,%xmm5
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ vpaddd 96(%rbp),%xmm3,%xmm7
+ vmovdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ vmovdqa %xmm5,16(%rsp)
+ movl %ebx,%esi
+ vmovdqa %xmm6,32(%rsp)
+ xorl %ecx,%esi
+ vmovdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp .Lavx_00_47
+
+.align 16
+.Lavx_00_47:
+ subq $-32*4,%rbp
+ vmovdqu (%r12),%xmm9
+ movq %r12,64+0(%rsp)
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ xorl %r8d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm0,%xmm0
+ vpxor %xmm10,%xmm9,%xmm9
+ vmovdqu 16-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%r12d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%esi
+ vpshufd $250,%xmm3,%xmm7
+ addl %r11d,%edx
+ shrdl $2,%r14d,%r14d
+ addl %esi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ vpslld $11,%xmm5,%xmm5
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vpxor %xmm8,%xmm9,%xmm9
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 4(%rsp),%r10d
+ movl %r11d,%esi
+ shrdl $11,%r14d,%r14d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %r9d,%r12d
+ xorl %eax,%esi
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ addl %r12d,%r10d
+ andl %esi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm0,%xmm0
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ shrdl $9,%r14d,%r14d
+ vpshufd $132,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpsrldq $8,%xmm6,%xmm6
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ vpaddd %xmm6,%xmm0,%xmm0
+ movl %r10d,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%r12d
+ vpshufd $80,%xmm0,%xmm7
+ xorl %r11d,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r9d
+ vpsrld $10,%xmm7,%xmm6
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpsrlq $17,%xmm7,%xmm7
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpsrlq $2,%xmm7,%xmm7
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %ebx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r12d
+ vpshufd $232,%xmm6,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpslldq $8,%xmm6,%xmm6
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%esi
+ vpaddd %xmm6,%xmm0,%xmm0
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ vpaddd 0(%rbp),%xmm0,%xmm6
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,0(%rsp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ xorl %eax,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm1,%xmm1
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%r12d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%esi
+ vpshufd $250,%xmm0,%xmm7
+ addl %edx,%r11d
+ shrdl $2,%r14d,%r14d
+ addl %esi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ vpslld $11,%xmm5,%xmm5
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 20(%rsp),%ecx
+ movl %edx,%esi
+ shrdl $11,%r14d,%r14d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ebx,%r12d
+ xorl %r8d,%esi
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ addl %r12d,%ecx
+ andl %esi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm1,%xmm1
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ shrdl $9,%r14d,%r14d
+ vpshufd $132,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpsrldq $8,%xmm6,%xmm6
+ andl %r10d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ vpaddd %xmm6,%xmm1,%xmm1
+ movl %ecx,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%r12d
+ vpshufd $80,%xmm1,%xmm7
+ xorl %edx,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%ebx
+ vpsrld $10,%xmm7,%xmm6
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpsrlq $17,%xmm7,%xmm7
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpsrlq $2,%xmm7,%xmm7
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r9d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r12d
+ vpshufd $232,%xmm6,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpslldq $8,%xmm6,%xmm6
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%esi
+ vpaddd %xmm6,%xmm1,%xmm1
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ vpaddd 32(%rbp),%xmm1,%xmm6
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,16(%rsp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ xorl %r8d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm2,%xmm2
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%r12d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%esi
+ vpshufd $250,%xmm1,%xmm7
+ addl %r11d,%edx
+ shrdl $2,%r14d,%r14d
+ addl %esi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ vpslld $11,%xmm5,%xmm5
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 36(%rsp),%r10d
+ movl %r11d,%esi
+ shrdl $11,%r14d,%r14d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %r9d,%r12d
+ xorl %eax,%esi
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ addl %r12d,%r10d
+ andl %esi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm2,%xmm2
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ shrdl $9,%r14d,%r14d
+ vpshufd $132,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpsrldq $8,%xmm6,%xmm6
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ vpaddd %xmm6,%xmm2,%xmm2
+ movl %r10d,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%r12d
+ vpshufd $80,%xmm2,%xmm7
+ xorl %r11d,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r9d
+ vpsrld $10,%xmm7,%xmm6
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpsrlq $17,%xmm7,%xmm7
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpsrlq $2,%xmm7,%xmm7
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %ebx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r12d
+ vpshufd $232,%xmm6,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpslldq $8,%xmm6,%xmm6
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%esi
+ vpaddd %xmm6,%xmm2,%xmm2
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,32(%rsp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ xorl %eax,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm3,%xmm3
+ vpand %xmm12,%xmm11,%xmm8
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%r12d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%esi
+ vpshufd $250,%xmm2,%xmm7
+ addl %edx,%r11d
+ shrdl $2,%r14d,%r14d
+ addl %esi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ vpslld $11,%xmm5,%xmm5
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 52(%rsp),%ecx
+ movl %edx,%esi
+ shrdl $11,%r14d,%r14d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ebx,%r12d
+ xorl %r8d,%esi
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ addl %r12d,%ecx
+ andl %esi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm3,%xmm3
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ shrdl $9,%r14d,%r14d
+ vpshufd $132,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpsrldq $8,%xmm6,%xmm6
+ andl %r10d,%r12d
+ vpand %xmm13,%xmm11,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ vpaddd %xmm6,%xmm3,%xmm3
+ movl %ecx,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%r12d
+ vpshufd $80,%xmm3,%xmm7
+ xorl %edx,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%ebx
+ vpsrld $10,%xmm7,%xmm6
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpsrlq $17,%xmm7,%xmm7
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpsrlq $2,%xmm7,%xmm7
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r9d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r12d
+ vpshufd $232,%xmm6,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpslldq $8,%xmm6,%xmm6
+ vpor %xmm11,%xmm8,%xmm8
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vmovdqu 0-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%esi
+ vpaddd %xmm6,%xmm3,%xmm3
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ vpaddd 96(%rbp),%xmm3,%xmm6
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,48(%rsp)
+ movq 64+0(%rsp),%r12
+ vpand %xmm14,%xmm11,%xmm11
+ movq 64+8(%rsp),%r15
+ vpor %xmm11,%xmm8,%xmm8
+ vmovdqu %xmm8,(%r15,%r12,1)
+ leaq 16(%r12),%r12
+ cmpb $0,131(%rbp)
+ jne .Lavx_00_47
+ vmovdqu (%r12),%xmm9
+ movq %r12,64+0(%rsp)
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ xorl %r8d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpxor %xmm10,%xmm9,%xmm9
+ vmovdqu 16-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%r12d
+ xorl %ebx,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%esi
+ addl %r11d,%edx
+ shrdl $2,%r14d,%r14d
+ addl %esi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vpxor %xmm8,%xmm9,%xmm9
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r12d
+ xorl %eax,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r10d
+ andl %esi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%r12d
+ xorl %r11d,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r9d
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ shrdl $2,%r14d,%r14d
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ xorl %ebx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ xorl %eax,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%r12d
+ xorl %r9d,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%esi
+ addl %edx,%r11d
+ shrdl $2,%r14d,%r14d
+ addl %esi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r12d
+ xorl %r8d,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%ecx
+ andl %esi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%r12d
+ xorl %edx,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%ebx
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ shrdl $2,%r14d,%r14d
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ xorl %r9d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ xorl %r8d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%r12d
+ xorl %ebx,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%esi
+ addl %r11d,%edx
+ shrdl $2,%r14d,%r14d
+ addl %esi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r12d
+ xorl %eax,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r10d
+ andl %esi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%r12d
+ xorl %r11d,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r9d
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ shrdl $2,%r14d,%r14d
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ xorl %ebx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ xorl %eax,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpand %xmm12,%xmm11,%xmm8
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%r12d
+ xorl %r9d,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%esi
+ addl %edx,%r11d
+ shrdl $2,%r14d,%r14d
+ addl %esi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r12d
+ xorl %r8d,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%ecx
+ andl %esi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ vpand %xmm13,%xmm11,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%r12d
+ xorl %edx,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%ebx
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ shrdl $2,%r14d,%r14d
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ xorl %r9d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpor %xmm11,%xmm8,%xmm8
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vmovdqu 0-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%r12
+ movq 64+8(%rsp),%r13
+ movq 64+40(%rsp),%r15
+ movq 64+48(%rsp),%rsi
+
+ vpand %xmm14,%xmm11,%xmm11
+ movl %r14d,%eax
+ vpor %xmm11,%xmm8,%xmm8
+ vmovdqu %xmm8,(%r12,%r13,1)
+ leaq 16(%r12),%r12
+
+ addl 0(%r15),%eax
+ addl 4(%r15),%ebx
+ addl 8(%r15),%ecx
+ addl 12(%r15),%edx
+ addl 16(%r15),%r8d
+ addl 20(%r15),%r9d
+ addl 24(%r15),%r10d
+ addl 28(%r15),%r11d
+
+ cmpq 64+16(%rsp),%r12
+
+ movl %eax,0(%r15)
+ movl %ebx,4(%r15)
+ movl %ecx,8(%r15)
+ movl %edx,12(%r15)
+ movl %r8d,16(%r15)
+ movl %r9d,20(%r15)
+ movl %r10d,24(%r15)
+ movl %r11d,28(%r15)
+ jb .Lloop_avx
+
+ movq 64+32(%rsp),%r8
+ movq 120(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ vmovdqu %xmm8,(%r8)
+ vzeroall
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_cbc_sha256_enc_avx,.-aesni_cbc_sha256_enc_avx
+.type aesni_cbc_sha256_enc_avx2,@function
+.align 64
+aesni_cbc_sha256_enc_avx2:
+.cfi_startproc
+.Lavx2_shortcut:
+ movq 8(%rsp),%r10
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ subq $576,%rsp
+ andq $-1024,%rsp
+ addq $448,%rsp
+
+ shlq $6,%rdx
+ subq %rdi,%rsi
+ subq %rdi,%r10
+ addq %rdi,%rdx
+
+
+
+ movq %rdx,64+16(%rsp)
+
+ movq %r8,64+32(%rsp)
+ movq %r9,64+40(%rsp)
+ movq %r10,64+48(%rsp)
+ movq %rax,120(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
+.Lprologue_avx2:
+ vzeroall
+
+ movq %rdi,%r13
+ vpinsrq $1,%rsi,%xmm15,%xmm15
+ leaq 128(%rcx),%rdi
+ leaq K256+544(%rip),%r12
+ movl 240-128(%rdi),%r14d
+ movq %r9,%r15
+ movq %r10,%rsi
+ vmovdqu (%r8),%xmm8
+ leaq -9(%r14),%r14
+
+ vmovdqa 0(%r12,%r14,8),%xmm14
+ vmovdqa 16(%r12,%r14,8),%xmm13
+ vmovdqa 32(%r12,%r14,8),%xmm12
+
+ subq $-64,%r13
+ movl 0(%r15),%eax
+ leaq (%rsi,%r13,1),%r12
+ movl 4(%r15),%ebx
+ cmpq %rdx,%r13
+ movl 8(%r15),%ecx
+ cmoveq %rsp,%r12
+ movl 12(%r15),%edx
+ movl 16(%r15),%r8d
+ movl 20(%r15),%r9d
+ movl 24(%r15),%r10d
+ movl 28(%r15),%r11d
+ vmovdqu 0-128(%rdi),%xmm10
+ jmp .Loop_avx2
+.align 16
+.Loop_avx2:
+ vmovdqa K256+512(%rip),%ymm7
+ vmovdqu -64+0(%rsi,%r13,1),%xmm0
+ vmovdqu -64+16(%rsi,%r13,1),%xmm1
+ vmovdqu -64+32(%rsi,%r13,1),%xmm2
+ vmovdqu -64+48(%rsi,%r13,1),%xmm3
+
+ vinserti128 $1,(%r12),%ymm0,%ymm0
+ vinserti128 $1,16(%r12),%ymm1,%ymm1
+ vpshufb %ymm7,%ymm0,%ymm0
+ vinserti128 $1,32(%r12),%ymm2,%ymm2
+ vpshufb %ymm7,%ymm1,%ymm1
+ vinserti128 $1,48(%r12),%ymm3,%ymm3
+
+ leaq K256(%rip),%rbp
+ vpshufb %ymm7,%ymm2,%ymm2
+ leaq -64(%r13),%r13
+ vpaddd 0(%rbp),%ymm0,%ymm4
+ vpshufb %ymm7,%ymm3,%ymm3
+ vpaddd 32(%rbp),%ymm1,%ymm5
+ vpaddd 64(%rbp),%ymm2,%ymm6
+ vpaddd 96(%rbp),%ymm3,%ymm7
+ vmovdqa %ymm4,0(%rsp)
+ xorl %r14d,%r14d
+ vmovdqa %ymm5,32(%rsp)
+
+ movq 120(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ leaq -64(%rsp),%rsp
+
+
+
+ movq %rsi,-8(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ movl %ebx,%esi
+ vmovdqa %ymm6,0(%rsp)
+ xorl %ecx,%esi
+ vmovdqa %ymm7,32(%rsp)
+ movl %r9d,%r12d
+ subq $-32*4,%rbp
+ jmp .Lavx2_00_47
+
+.align 16
+.Lavx2_00_47:
+ vmovdqu (%r13),%xmm9
+ vpinsrq $0,%r13,%xmm15,%xmm15
+ leaq -64(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08
+
+ pushq 64-8(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08
+ leaq 8(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ vpalignr $4,%ymm0,%ymm1,%ymm4
+ addl 0+128(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ vpalignr $4,%ymm2,%ymm3,%ymm7
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ vpsrld $7,%ymm4,%ymm6
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ vpaddd %ymm7,%ymm0,%ymm0
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%esi
+ vpxor %xmm10,%xmm9,%xmm9
+ vmovdqu 16-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ebx,%esi
+ vpshufd $250,%ymm3,%ymm7
+ xorl %r13d,%r14d
+ leal (%r11,%rsi,1),%r11d
+ movl %r8d,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 4+128(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%edx,%esi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ vpslld $11,%ymm5,%ymm5
+ andnl %r9d,%edx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%edx,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%esi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%esi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ vpsrlq $17,%ymm7,%ymm7
+ andl %esi,%r15d
+ vpxor %xmm8,%xmm9,%xmm9
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ vpaddd %ymm4,%ymm0,%ymm0
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 8+128(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ vpshufd $132,%ymm6,%ymm6
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ vpsrldq $8,%ymm6,%ymm6
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ vpaddd %ymm6,%ymm0,%ymm0
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ vpshufd $80,%ymm0,%ymm7
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r11d,%esi
+ vpsrld $10,%ymm7,%ymm6
+ xorl %r13d,%r14d
+ leal (%r9,%rsi,1),%r9d
+ movl %ecx,%r12d
+ vpsrlq $17,%ymm7,%ymm7
+ addl 12+128(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ vpxor %ymm7,%ymm6,%ymm6
+ rorxl $11,%ebx,%esi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ vpsrlq $2,%ymm7,%ymm7
+ andnl %edx,%ebx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%ebx,%r14d
+ vpxor %ymm7,%ymm6,%ymm6
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%esi
+ vpshufd $232,%ymm6,%ymm6
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%esi
+ vpslldq $8,%ymm6,%ymm6
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ vpaddd %ymm6,%ymm0,%ymm0
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ vpaddd 0(%rbp),%ymm0,%ymm6
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ vmovdqa %ymm6,0(%rsp)
+ vpalignr $4,%ymm1,%ymm2,%ymm4
+ addl 32+128(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ vpalignr $4,%ymm3,%ymm0,%ymm7
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ vpsrld $7,%ymm4,%ymm6
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ vpaddd %ymm7,%ymm1,%ymm1
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r9d,%esi
+ vpshufd $250,%ymm0,%ymm7
+ xorl %r13d,%r14d
+ leal (%rdx,%rsi,1),%edx
+ movl %eax,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 36+128(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%r11d,%esi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ vpslld $11,%ymm5,%ymm5
+ andnl %ebx,%r11d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r11d,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%esi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%esi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ vpsrlq $17,%ymm7,%ymm7
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ vpaddd %ymm4,%ymm1,%ymm1
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 40+128(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ vpshufd $132,%ymm6,%ymm6
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ vpsrldq $8,%ymm6,%ymm6
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ vpaddd %ymm6,%ymm1,%ymm1
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ vpshufd $80,%ymm1,%ymm7
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %edx,%esi
+ vpsrld $10,%ymm7,%ymm6
+ xorl %r13d,%r14d
+ leal (%rbx,%rsi,1),%ebx
+ movl %r10d,%r12d
+ vpsrlq $17,%ymm7,%ymm7
+ addl 44+128(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ vpxor %ymm7,%ymm6,%ymm6
+ rorxl $11,%r9d,%esi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ vpsrlq $2,%ymm7,%ymm7
+ andnl %r11d,%r9d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r9d,%r14d
+ vpxor %ymm7,%ymm6,%ymm6
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%esi
+ vpshufd $232,%ymm6,%ymm6
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%esi
+ vpslldq $8,%ymm6,%ymm6
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ vpaddd %ymm6,%ymm1,%ymm1
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ vpaddd 32(%rbp),%ymm1,%ymm6
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ vmovdqa %ymm6,32(%rsp)
+ leaq -64(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08
+
+ pushq 64-8(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08
+ leaq 8(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ vpalignr $4,%ymm2,%ymm3,%ymm4
+ addl 0+128(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ vpalignr $4,%ymm0,%ymm1,%ymm7
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ vpsrld $7,%ymm4,%ymm6
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ vpaddd %ymm7,%ymm2,%ymm2
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ebx,%esi
+ vpshufd $250,%ymm1,%ymm7
+ xorl %r13d,%r14d
+ leal (%r11,%rsi,1),%r11d
+ movl %r8d,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 4+128(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%edx,%esi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ vpslld $11,%ymm5,%ymm5
+ andnl %r9d,%edx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%edx,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%esi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%esi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ vpsrlq $17,%ymm7,%ymm7
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ vpaddd %ymm4,%ymm2,%ymm2
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 8+128(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ vpshufd $132,%ymm6,%ymm6
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ vpsrldq $8,%ymm6,%ymm6
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ vpaddd %ymm6,%ymm2,%ymm2
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ vpshufd $80,%ymm2,%ymm7
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r11d,%esi
+ vpsrld $10,%ymm7,%ymm6
+ xorl %r13d,%r14d
+ leal (%r9,%rsi,1),%r9d
+ movl %ecx,%r12d
+ vpsrlq $17,%ymm7,%ymm7
+ addl 12+128(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ vpxor %ymm7,%ymm6,%ymm6
+ rorxl $11,%ebx,%esi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ vpsrlq $2,%ymm7,%ymm7
+ andnl %edx,%ebx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%ebx,%r14d
+ vpxor %ymm7,%ymm6,%ymm6
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%esi
+ vpshufd $232,%ymm6,%ymm6
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%esi
+ vpslldq $8,%ymm6,%ymm6
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ vpaddd %ymm6,%ymm2,%ymm2
+ andl %esi,%r15d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ vpaddd 64(%rbp),%ymm2,%ymm6
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ vmovdqa %ymm6,0(%rsp)
+ vpalignr $4,%ymm3,%ymm0,%ymm4
+ addl 32+128(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ vpalignr $4,%ymm1,%ymm2,%ymm7
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ vpsrld $7,%ymm4,%ymm6
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ vpaddd %ymm7,%ymm3,%ymm3
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%esi
+ vpand %xmm12,%xmm11,%xmm8
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r9d,%esi
+ vpshufd $250,%ymm2,%ymm7
+ xorl %r13d,%r14d
+ leal (%rdx,%rsi,1),%edx
+ movl %eax,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 36+128(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%r11d,%esi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ vpslld $11,%ymm5,%ymm5
+ andnl %ebx,%r11d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r11d,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%esi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%esi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ vpsrlq $17,%ymm7,%ymm7
+ andl %esi,%r15d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ vpaddd %ymm4,%ymm3,%ymm3
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 40+128(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ vpshufd $132,%ymm6,%ymm6
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ vpsrldq $8,%ymm6,%ymm6
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ vpaddd %ymm6,%ymm3,%ymm3
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ vpshufd $80,%ymm3,%ymm7
+ andl %r15d,%esi
+ vpand %xmm13,%xmm11,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %edx,%esi
+ vpsrld $10,%ymm7,%ymm6
+ xorl %r13d,%r14d
+ leal (%rbx,%rsi,1),%ebx
+ movl %r10d,%r12d
+ vpsrlq $17,%ymm7,%ymm7
+ addl 44+128(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ vpxor %ymm7,%ymm6,%ymm6
+ rorxl $11,%r9d,%esi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ vpsrlq $2,%ymm7,%ymm7
+ andnl %r11d,%r9d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r9d,%r14d
+ vpxor %ymm7,%ymm6,%ymm6
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%esi
+ vpshufd $232,%ymm6,%ymm6
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%esi
+ vpslldq $8,%ymm6,%ymm6
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ vpaddd %ymm6,%ymm3,%ymm3
+ andl %esi,%r15d
+ vpor %xmm11,%xmm8,%xmm8
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vmovdqu 0-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ vpaddd 96(%rbp),%ymm3,%ymm6
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ vmovdqa %ymm6,32(%rsp)
+ vmovq %xmm15,%r13
+ vpextrq $1,%xmm15,%r15
+ vpand %xmm14,%xmm11,%xmm11
+ vpor %xmm11,%xmm8,%xmm8
+ vmovdqu %xmm8,(%r15,%r13,1)
+ leaq 16(%r13),%r13
+ leaq 128(%rbp),%rbp
+ cmpb $0,3(%rbp)
+ jne .Lavx2_00_47
+ vmovdqu (%r13),%xmm9
+ vpinsrq $0,%r13,%xmm15,%xmm15
+ addl 0+64(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%esi
+ vpxor %xmm10,%xmm9,%xmm9
+ vmovdqu 16-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ebx,%esi
+ xorl %r13d,%r14d
+ leal (%r11,%rsi,1),%r11d
+ movl %r8d,%r12d
+ addl 4+64(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%esi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%esi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%esi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %esi,%r15d
+ vpxor %xmm8,%xmm9,%xmm9
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8+64(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r11d,%esi
+ xorl %r13d,%r14d
+ leal (%r9,%rsi,1),%r9d
+ movl %ecx,%r12d
+ addl 12+64(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%esi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%esi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%esi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32+64(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r9d,%esi
+ xorl %r13d,%r14d
+ leal (%rdx,%rsi,1),%edx
+ movl %eax,%r12d
+ addl 36+64(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%esi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%esi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%esi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40+64(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %edx,%esi
+ xorl %r13d,%r14d
+ leal (%rbx,%rsi,1),%ebx
+ movl %r10d,%r12d
+ addl 44+64(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%esi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%esi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%esi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ addl 0(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ebx,%esi
+ xorl %r13d,%r14d
+ leal (%r11,%rsi,1),%r11d
+ movl %r8d,%r12d
+ addl 4(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%esi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%esi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%esi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r11d,%esi
+ xorl %r13d,%r14d
+ leal (%r9,%rsi,1),%r9d
+ movl %ecx,%r12d
+ addl 12(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%esi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%esi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%esi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %esi,%r15d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%esi
+ vpand %xmm12,%xmm11,%xmm8
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r9d,%esi
+ xorl %r13d,%r14d
+ leal (%rdx,%rsi,1),%edx
+ movl %eax,%r12d
+ addl 36(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%esi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%esi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%esi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %esi,%r15d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%esi
+ vpand %xmm13,%xmm11,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %edx,%esi
+ xorl %r13d,%r14d
+ leal (%rbx,%rsi,1),%ebx
+ movl %r10d,%r12d
+ addl 44(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%esi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%esi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%esi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %esi,%r15d
+ vpor %xmm11,%xmm8,%xmm8
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vmovdqu 0-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ vpextrq $1,%xmm15,%r12
+ vmovq %xmm15,%r13
+ movq 552(%rsp),%r15
+ addl %r14d,%eax
+ leaq 448(%rsp),%rbp
+
+ vpand %xmm14,%xmm11,%xmm11
+ vpor %xmm11,%xmm8,%xmm8
+ vmovdqu %xmm8,(%r12,%r13,1)
+ leaq 16(%r13),%r13
+
+ addl 0(%r15),%eax
+ addl 4(%r15),%ebx
+ addl 8(%r15),%ecx
+ addl 12(%r15),%edx
+ addl 16(%r15),%r8d
+ addl 20(%r15),%r9d
+ addl 24(%r15),%r10d
+ addl 28(%r15),%r11d
+
+ movl %eax,0(%r15)
+ movl %ebx,4(%r15)
+ movl %ecx,8(%r15)
+ movl %edx,12(%r15)
+ movl %r8d,16(%r15)
+ movl %r9d,20(%r15)
+ movl %r10d,24(%r15)
+ movl %r11d,28(%r15)
+
+ cmpq 80(%rbp),%r13
+ je .Ldone_avx2
+
+ xorl %r14d,%r14d
+ movl %ebx,%esi
+ movl %r9d,%r12d
+ xorl %ecx,%esi
+ jmp .Lower_avx2
+.align 16
+.Lower_avx2:
+ vmovdqu (%r13),%xmm9
+ vpinsrq $0,%r13,%xmm15,%xmm15
+ addl 0+16(%rbp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%esi
+ vpxor %xmm10,%xmm9,%xmm9
+ vmovdqu 16-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ebx,%esi
+ xorl %r13d,%r14d
+ leal (%r11,%rsi,1),%r11d
+ movl %r8d,%r12d
+ addl 4+16(%rbp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%esi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%esi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%esi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %esi,%r15d
+ vpxor %xmm8,%xmm9,%xmm9
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8+16(%rbp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r11d,%esi
+ xorl %r13d,%r14d
+ leal (%r9,%rsi,1),%r9d
+ movl %ecx,%r12d
+ addl 12+16(%rbp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%esi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%esi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%esi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32+16(%rbp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r9d,%esi
+ xorl %r13d,%r14d
+ leal (%rdx,%rsi,1),%edx
+ movl %eax,%r12d
+ addl 36+16(%rbp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%esi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%esi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%esi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40+16(%rbp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %edx,%esi
+ xorl %r13d,%r14d
+ leal (%rbx,%rsi,1),%ebx
+ movl %r10d,%r12d
+ addl 44+16(%rbp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%esi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%esi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%esi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ leaq -64(%rbp),%rbp
+ addl 0+16(%rbp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ebx,%esi
+ xorl %r13d,%r14d
+ leal (%r11,%rsi,1),%r11d
+ movl %r8d,%r12d
+ addl 4+16(%rbp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%esi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%esi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%esi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8+16(%rbp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r11d,%esi
+ xorl %r13d,%r14d
+ leal (%r9,%rsi,1),%r9d
+ movl %ecx,%r12d
+ addl 12+16(%rbp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%esi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%esi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%esi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %esi,%r15d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32+16(%rbp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%esi
+ vpand %xmm12,%xmm11,%xmm8
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r9d,%esi
+ xorl %r13d,%r14d
+ leal (%rdx,%rsi,1),%edx
+ movl %eax,%r12d
+ addl 36+16(%rbp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%esi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%esi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%esi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %esi,%r15d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40+16(%rbp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%esi
+ vpand %xmm13,%xmm11,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %edx,%esi
+ xorl %r13d,%r14d
+ leal (%rbx,%rsi,1),%ebx
+ movl %r10d,%r12d
+ addl 44+16(%rbp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%esi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%esi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%esi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %esi,%r15d
+ vpor %xmm11,%xmm8,%xmm8
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vmovdqu 0-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ vmovq %xmm15,%r13
+ vpextrq $1,%xmm15,%r15
+ vpand %xmm14,%xmm11,%xmm11
+ vpor %xmm11,%xmm8,%xmm8
+ leaq -64(%rbp),%rbp
+ vmovdqu %xmm8,(%r15,%r13,1)
+ leaq 16(%r13),%r13
+ cmpq %rsp,%rbp
+ jae .Lower_avx2
+
+ movq 552(%rsp),%r15
+ leaq 64(%r13),%r13
+ movq 560(%rsp),%rsi
+ addl %r14d,%eax
+ leaq 448(%rsp),%rsp
+
+ addl 0(%r15),%eax
+ addl 4(%r15),%ebx
+ addl 8(%r15),%ecx
+ addl 12(%r15),%edx
+ addl 16(%r15),%r8d
+ addl 20(%r15),%r9d
+ addl 24(%r15),%r10d
+ leaq (%rsi,%r13,1),%r12
+ addl 28(%r15),%r11d
+
+ cmpq 64+16(%rsp),%r13
+
+ movl %eax,0(%r15)
+ cmoveq %rsp,%r12
+ movl %ebx,4(%r15)
+ movl %ecx,8(%r15)
+ movl %edx,12(%r15)
+ movl %r8d,16(%r15)
+ movl %r9d,20(%r15)
+ movl %r10d,24(%r15)
+ movl %r11d,28(%r15)
+
+ jbe .Loop_avx2
+ leaq (%rsp),%rbp
+
+
+.cfi_escape 0x0f,0x06,0x76,0xf8,0x00,0x06,0x23,0x08
+
+.Ldone_avx2:
+ movq 64+32(%rbp),%r8
+ movq 64+56(%rbp),%rsi
+.cfi_def_cfa %rsi,8
+ vmovdqu %xmm8,(%r8)
+ vzeroall
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_cbc_sha256_enc_avx2,.-aesni_cbc_sha256_enc_avx2
+.type aesni_cbc_sha256_enc_shaext,@function
+.align 32
+aesni_cbc_sha256_enc_shaext:
+.cfi_startproc
+ movq 8(%rsp),%r10
+ leaq K256+128(%rip),%rax
+ movdqu (%r9),%xmm1
+ movdqu 16(%r9),%xmm2
+ movdqa 512-128(%rax),%xmm3
+
+ movl 240(%rcx),%r11d
+ subq %rdi,%rsi
+ movups (%rcx),%xmm15
+ movups (%r8),%xmm6
+ movups 16(%rcx),%xmm4
+ leaq 112(%rcx),%rcx
+
+ pshufd $0x1b,%xmm1,%xmm0
+ pshufd $0xb1,%xmm1,%xmm1
+ pshufd $0x1b,%xmm2,%xmm2
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,202,8
+ punpcklqdq %xmm0,%xmm2
+
+ jmp .Loop_shaext
+
+.align 16
+.Loop_shaext:
+ movdqu (%r10),%xmm10
+ movdqu 16(%r10),%xmm11
+ movdqu 32(%r10),%xmm12
+.byte 102,68,15,56,0,211
+ movdqu 48(%r10),%xmm13
+
+ movdqa 0-128(%rax),%xmm0
+ paddd %xmm10,%xmm0
+.byte 102,68,15,56,0,219
+ movdqa %xmm2,%xmm9
+ movdqa %xmm1,%xmm8
+ movups 0(%rdi),%xmm14
+ xorps %xmm15,%xmm14
+ xorps %xmm14,%xmm6
+ movups -80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movups -64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,202
+
+ movdqa 32-128(%rax),%xmm0
+ paddd %xmm11,%xmm0
+.byte 102,68,15,56,0,227
+ leaq 64(%r10),%r10
+ movups -48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movups -32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,202
+
+ movdqa 64-128(%rax),%xmm0
+ paddd %xmm12,%xmm0
+.byte 102,68,15,56,0,235
+.byte 69,15,56,204,211
+ movups -16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm13,%xmm3
+.byte 102,65,15,58,15,220,4
+ paddd %xmm3,%xmm10
+ movups 0(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,202
+
+ movdqa 96-128(%rax),%xmm0
+ paddd %xmm13,%xmm0
+.byte 69,15,56,205,213
+.byte 69,15,56,204,220
+ movups 16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movups 32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movdqa %xmm10,%xmm3
+.byte 102,65,15,58,15,221,4
+ paddd %xmm3,%xmm11
+.byte 15,56,203,202
+ movdqa 128-128(%rax),%xmm0
+ paddd %xmm10,%xmm0
+.byte 69,15,56,205,218
+.byte 69,15,56,204,229
+ movups 48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm11,%xmm3
+.byte 102,65,15,58,15,218,4
+ paddd %xmm3,%xmm12
+ cmpl $11,%r11d
+ jb .Laesenclast1
+ movups 64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ je .Laesenclast1
+ movups 96(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 112(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.Laesenclast1:
+ aesenclast %xmm5,%xmm6
+ movups 16-112(%rcx),%xmm4
+ nop
+.byte 15,56,203,202
+ movups 16(%rdi),%xmm14
+ xorps %xmm15,%xmm14
+ movups %xmm6,0(%rsi,%rdi,1)
+ xorps %xmm14,%xmm6
+ movups -80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ movdqa 160-128(%rax),%xmm0
+ paddd %xmm11,%xmm0
+.byte 69,15,56,205,227
+.byte 69,15,56,204,234
+ movups -64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm12,%xmm3
+.byte 102,65,15,58,15,219,4
+ paddd %xmm3,%xmm13
+ movups -48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 192-128(%rax),%xmm0
+ paddd %xmm12,%xmm0
+.byte 69,15,56,205,236
+.byte 69,15,56,204,211
+ movups -32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm13,%xmm3
+.byte 102,65,15,58,15,220,4
+ paddd %xmm3,%xmm10
+ movups -16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 224-128(%rax),%xmm0
+ paddd %xmm13,%xmm0
+.byte 69,15,56,205,213
+.byte 69,15,56,204,220
+ movups 0(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,65,15,58,15,221,4
+ paddd %xmm3,%xmm11
+ movups 16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 256-128(%rax),%xmm0
+ paddd %xmm10,%xmm0
+.byte 69,15,56,205,218
+.byte 69,15,56,204,229
+ movups 32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm11,%xmm3
+.byte 102,65,15,58,15,218,4
+ paddd %xmm3,%xmm12
+ movups 48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ cmpl $11,%r11d
+ jb .Laesenclast2
+ movups 64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ je .Laesenclast2
+ movups 96(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 112(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.Laesenclast2:
+ aesenclast %xmm5,%xmm6
+ movups 16-112(%rcx),%xmm4
+ nop
+.byte 15,56,203,202
+ movups 32(%rdi),%xmm14
+ xorps %xmm15,%xmm14
+ movups %xmm6,16(%rsi,%rdi,1)
+ xorps %xmm14,%xmm6
+ movups -80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ movdqa 288-128(%rax),%xmm0
+ paddd %xmm11,%xmm0
+.byte 69,15,56,205,227
+.byte 69,15,56,204,234
+ movups -64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm12,%xmm3
+.byte 102,65,15,58,15,219,4
+ paddd %xmm3,%xmm13
+ movups -48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 320-128(%rax),%xmm0
+ paddd %xmm12,%xmm0
+.byte 69,15,56,205,236
+.byte 69,15,56,204,211
+ movups -32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm13,%xmm3
+.byte 102,65,15,58,15,220,4
+ paddd %xmm3,%xmm10
+ movups -16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 352-128(%rax),%xmm0
+ paddd %xmm13,%xmm0
+.byte 69,15,56,205,213
+.byte 69,15,56,204,220
+ movups 0(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,65,15,58,15,221,4
+ paddd %xmm3,%xmm11
+ movups 16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 384-128(%rax),%xmm0
+ paddd %xmm10,%xmm0
+.byte 69,15,56,205,218
+.byte 69,15,56,204,229
+ movups 32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm11,%xmm3
+.byte 102,65,15,58,15,218,4
+ paddd %xmm3,%xmm12
+ movups 48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 416-128(%rax),%xmm0
+ paddd %xmm11,%xmm0
+.byte 69,15,56,205,227
+.byte 69,15,56,204,234
+ cmpl $11,%r11d
+ jb .Laesenclast3
+ movups 64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ je .Laesenclast3
+ movups 96(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 112(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.Laesenclast3:
+ aesenclast %xmm5,%xmm6
+ movups 16-112(%rcx),%xmm4
+ nop
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm12,%xmm3
+.byte 102,65,15,58,15,219,4
+ paddd %xmm3,%xmm13
+ movups 48(%rdi),%xmm14
+ xorps %xmm15,%xmm14
+ movups %xmm6,32(%rsi,%rdi,1)
+ xorps %xmm14,%xmm6
+ movups -80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ movups -64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,202
+
+ movdqa 448-128(%rax),%xmm0
+ paddd %xmm12,%xmm0
+.byte 69,15,56,205,236
+ movdqa %xmm7,%xmm3
+ movups -48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movups -32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,202
+
+ movdqa 480-128(%rax),%xmm0
+ paddd %xmm13,%xmm0
+ movups -16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ movups 0(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movups 16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+
+ movups 32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ cmpl $11,%r11d
+ jb .Laesenclast4
+ movups 64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ je .Laesenclast4
+ movups 96(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 112(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.Laesenclast4:
+ aesenclast %xmm5,%xmm6
+ movups 16-112(%rcx),%xmm4
+ nop
+
+ paddd %xmm9,%xmm2
+ paddd %xmm8,%xmm1
+
+ decq %rdx
+ movups %xmm6,48(%rsi,%rdi,1)
+ leaq 64(%rdi),%rdi
+ jnz .Loop_shaext
+
+ pshufd $0xb1,%xmm2,%xmm2
+ pshufd $0x1b,%xmm1,%xmm3
+ pshufd $0xb1,%xmm1,%xmm1
+ punpckhqdq %xmm2,%xmm1
+.byte 102,15,58,15,211,8
+
+ movups %xmm6,(%r8)
+ movdqu %xmm1,(%r9)
+ movdqu %xmm2,16(%r9)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_cbc_sha256_enc_shaext,.-aesni_cbc_sha256_enc_shaext
diff --git a/secure/lib/libcrypto/amd64/chacha-x86_64.S b/secure/lib/libcrypto/amd64/chacha-x86_64.S
index 0b3d5b8b6db4..b01c1b87d47b 100644
--- a/secure/lib/libcrypto/amd64/chacha-x86_64.S
+++ b/secure/lib/libcrypto/amd64/chacha-x86_64.S
@@ -331,6 +331,8 @@ ChaCha20_ssse3:
.LChaCha20_ssse3:
movq %rsp,%r9
.cfi_def_cfa_register %r9
+ testl $2048,%r10d
+ jnz .LChaCha20_4xop
cmpq $128,%rdx
je .LChaCha20_128
ja .LChaCha20_4x
@@ -626,6 +628,9 @@ ChaCha20_4x:
movq %rsp,%r9
.cfi_def_cfa_register %r9
movq %r10,%r11
+ shrq $32,%r10
+ testq $32,%r10
+ jnz .LChaCha20_8x
cmpq $192,%rdx
ja .Lproceed4x
@@ -1167,3 +1172,1024 @@ ChaCha20_4x:
.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_4x,.-ChaCha20_4x
+.type ChaCha20_4xop,@function
+.align 32
+ChaCha20_4xop:
+.cfi_startproc
+.LChaCha20_4xop:
+ movq %rsp,%r9
+.cfi_def_cfa_register %r9
+ subq $0x140+8,%rsp
+ vzeroupper
+
+ vmovdqa .Lsigma(%rip),%xmm11
+ vmovdqu (%rcx),%xmm3
+ vmovdqu 16(%rcx),%xmm15
+ vmovdqu (%r8),%xmm7
+ leaq 256(%rsp),%rcx
+
+ vpshufd $0x00,%xmm11,%xmm8
+ vpshufd $0x55,%xmm11,%xmm9
+ vmovdqa %xmm8,64(%rsp)
+ vpshufd $0xaa,%xmm11,%xmm10
+ vmovdqa %xmm9,80(%rsp)
+ vpshufd $0xff,%xmm11,%xmm11
+ vmovdqa %xmm10,96(%rsp)
+ vmovdqa %xmm11,112(%rsp)
+
+ vpshufd $0x00,%xmm3,%xmm0
+ vpshufd $0x55,%xmm3,%xmm1
+ vmovdqa %xmm0,128-256(%rcx)
+ vpshufd $0xaa,%xmm3,%xmm2
+ vmovdqa %xmm1,144-256(%rcx)
+ vpshufd $0xff,%xmm3,%xmm3
+ vmovdqa %xmm2,160-256(%rcx)
+ vmovdqa %xmm3,176-256(%rcx)
+
+ vpshufd $0x00,%xmm15,%xmm12
+ vpshufd $0x55,%xmm15,%xmm13
+ vmovdqa %xmm12,192-256(%rcx)
+ vpshufd $0xaa,%xmm15,%xmm14
+ vmovdqa %xmm13,208-256(%rcx)
+ vpshufd $0xff,%xmm15,%xmm15
+ vmovdqa %xmm14,224-256(%rcx)
+ vmovdqa %xmm15,240-256(%rcx)
+
+ vpshufd $0x00,%xmm7,%xmm4
+ vpshufd $0x55,%xmm7,%xmm5
+ vpaddd .Linc(%rip),%xmm4,%xmm4
+ vpshufd $0xaa,%xmm7,%xmm6
+ vmovdqa %xmm5,272-256(%rcx)
+ vpshufd $0xff,%xmm7,%xmm7
+ vmovdqa %xmm6,288-256(%rcx)
+ vmovdqa %xmm7,304-256(%rcx)
+
+ jmp .Loop_enter4xop
+
+.align 32
+.Loop_outer4xop:
+ vmovdqa 64(%rsp),%xmm8
+ vmovdqa 80(%rsp),%xmm9
+ vmovdqa 96(%rsp),%xmm10
+ vmovdqa 112(%rsp),%xmm11
+ vmovdqa 128-256(%rcx),%xmm0
+ vmovdqa 144-256(%rcx),%xmm1
+ vmovdqa 160-256(%rcx),%xmm2
+ vmovdqa 176-256(%rcx),%xmm3
+ vmovdqa 192-256(%rcx),%xmm12
+ vmovdqa 208-256(%rcx),%xmm13
+ vmovdqa 224-256(%rcx),%xmm14
+ vmovdqa 240-256(%rcx),%xmm15
+ vmovdqa 256-256(%rcx),%xmm4
+ vmovdqa 272-256(%rcx),%xmm5
+ vmovdqa 288-256(%rcx),%xmm6
+ vmovdqa 304-256(%rcx),%xmm7
+ vpaddd .Lfour(%rip),%xmm4,%xmm4
+
+.Loop_enter4xop:
+ movl $10,%eax
+ vmovdqa %xmm4,256-256(%rcx)
+ jmp .Loop4xop
+
+.align 32
+.Loop4xop:
+ vpaddd %xmm0,%xmm8,%xmm8
+ vpaddd %xmm1,%xmm9,%xmm9
+ vpaddd %xmm2,%xmm10,%xmm10
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor %xmm4,%xmm8,%xmm4
+ vpxor %xmm5,%xmm9,%xmm5
+ vpxor %xmm6,%xmm10,%xmm6
+ vpxor %xmm7,%xmm11,%xmm7
+.byte 143,232,120,194,228,16
+.byte 143,232,120,194,237,16
+.byte 143,232,120,194,246,16
+.byte 143,232,120,194,255,16
+ vpaddd %xmm4,%xmm12,%xmm12
+ vpaddd %xmm5,%xmm13,%xmm13
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm15,%xmm15
+ vpxor %xmm0,%xmm12,%xmm0
+ vpxor %xmm1,%xmm13,%xmm1
+ vpxor %xmm14,%xmm2,%xmm2
+ vpxor %xmm15,%xmm3,%xmm3
+.byte 143,232,120,194,192,12
+.byte 143,232,120,194,201,12
+.byte 143,232,120,194,210,12
+.byte 143,232,120,194,219,12
+ vpaddd %xmm8,%xmm0,%xmm8
+ vpaddd %xmm9,%xmm1,%xmm9
+ vpaddd %xmm2,%xmm10,%xmm10
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor %xmm4,%xmm8,%xmm4
+ vpxor %xmm5,%xmm9,%xmm5
+ vpxor %xmm6,%xmm10,%xmm6
+ vpxor %xmm7,%xmm11,%xmm7
+.byte 143,232,120,194,228,8
+.byte 143,232,120,194,237,8
+.byte 143,232,120,194,246,8
+.byte 143,232,120,194,255,8
+ vpaddd %xmm4,%xmm12,%xmm12
+ vpaddd %xmm5,%xmm13,%xmm13
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm15,%xmm15
+ vpxor %xmm0,%xmm12,%xmm0
+ vpxor %xmm1,%xmm13,%xmm1
+ vpxor %xmm14,%xmm2,%xmm2
+ vpxor %xmm15,%xmm3,%xmm3
+.byte 143,232,120,194,192,7
+.byte 143,232,120,194,201,7
+.byte 143,232,120,194,210,7
+.byte 143,232,120,194,219,7
+ vpaddd %xmm1,%xmm8,%xmm8
+ vpaddd %xmm2,%xmm9,%xmm9
+ vpaddd %xmm3,%xmm10,%xmm10
+ vpaddd %xmm0,%xmm11,%xmm11
+ vpxor %xmm7,%xmm8,%xmm7
+ vpxor %xmm4,%xmm9,%xmm4
+ vpxor %xmm5,%xmm10,%xmm5
+ vpxor %xmm6,%xmm11,%xmm6
+.byte 143,232,120,194,255,16
+.byte 143,232,120,194,228,16
+.byte 143,232,120,194,237,16
+.byte 143,232,120,194,246,16
+ vpaddd %xmm7,%xmm14,%xmm14
+ vpaddd %xmm4,%xmm15,%xmm15
+ vpaddd %xmm5,%xmm12,%xmm12
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpxor %xmm1,%xmm14,%xmm1
+ vpxor %xmm2,%xmm15,%xmm2
+ vpxor %xmm12,%xmm3,%xmm3
+ vpxor %xmm13,%xmm0,%xmm0
+.byte 143,232,120,194,201,12
+.byte 143,232,120,194,210,12
+.byte 143,232,120,194,219,12
+.byte 143,232,120,194,192,12
+ vpaddd %xmm8,%xmm1,%xmm8
+ vpaddd %xmm9,%xmm2,%xmm9
+ vpaddd %xmm3,%xmm10,%xmm10
+ vpaddd %xmm0,%xmm11,%xmm11
+ vpxor %xmm7,%xmm8,%xmm7
+ vpxor %xmm4,%xmm9,%xmm4
+ vpxor %xmm5,%xmm10,%xmm5
+ vpxor %xmm6,%xmm11,%xmm6
+.byte 143,232,120,194,255,8
+.byte 143,232,120,194,228,8
+.byte 143,232,120,194,237,8
+.byte 143,232,120,194,246,8
+ vpaddd %xmm7,%xmm14,%xmm14
+ vpaddd %xmm4,%xmm15,%xmm15
+ vpaddd %xmm5,%xmm12,%xmm12
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpxor %xmm1,%xmm14,%xmm1
+ vpxor %xmm2,%xmm15,%xmm2
+ vpxor %xmm12,%xmm3,%xmm3
+ vpxor %xmm13,%xmm0,%xmm0
+.byte 143,232,120,194,201,7
+.byte 143,232,120,194,210,7
+.byte 143,232,120,194,219,7
+.byte 143,232,120,194,192,7
+ decl %eax
+ jnz .Loop4xop
+
+ vpaddd 64(%rsp),%xmm8,%xmm8
+ vpaddd 80(%rsp),%xmm9,%xmm9
+ vpaddd 96(%rsp),%xmm10,%xmm10
+ vpaddd 112(%rsp),%xmm11,%xmm11
+
+ vmovdqa %xmm14,32(%rsp)
+ vmovdqa %xmm15,48(%rsp)
+
+ vpunpckldq %xmm9,%xmm8,%xmm14
+ vpunpckldq %xmm11,%xmm10,%xmm15
+ vpunpckhdq %xmm9,%xmm8,%xmm8
+ vpunpckhdq %xmm11,%xmm10,%xmm10
+ vpunpcklqdq %xmm15,%xmm14,%xmm9
+ vpunpckhqdq %xmm15,%xmm14,%xmm14
+ vpunpcklqdq %xmm10,%xmm8,%xmm11
+ vpunpckhqdq %xmm10,%xmm8,%xmm8
+ vpaddd 128-256(%rcx),%xmm0,%xmm0
+ vpaddd 144-256(%rcx),%xmm1,%xmm1
+ vpaddd 160-256(%rcx),%xmm2,%xmm2
+ vpaddd 176-256(%rcx),%xmm3,%xmm3
+
+ vmovdqa %xmm9,0(%rsp)
+ vmovdqa %xmm14,16(%rsp)
+ vmovdqa 32(%rsp),%xmm9
+ vmovdqa 48(%rsp),%xmm14
+
+ vpunpckldq %xmm1,%xmm0,%xmm10
+ vpunpckldq %xmm3,%xmm2,%xmm15
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm15,%xmm10,%xmm1
+ vpunpckhqdq %xmm15,%xmm10,%xmm10
+ vpunpcklqdq %xmm2,%xmm0,%xmm3
+ vpunpckhqdq %xmm2,%xmm0,%xmm0
+ vpaddd 192-256(%rcx),%xmm12,%xmm12
+ vpaddd 208-256(%rcx),%xmm13,%xmm13
+ vpaddd 224-256(%rcx),%xmm9,%xmm9
+ vpaddd 240-256(%rcx),%xmm14,%xmm14
+
+ vpunpckldq %xmm13,%xmm12,%xmm2
+ vpunpckldq %xmm14,%xmm9,%xmm15
+ vpunpckhdq %xmm13,%xmm12,%xmm12
+ vpunpckhdq %xmm14,%xmm9,%xmm9
+ vpunpcklqdq %xmm15,%xmm2,%xmm13
+ vpunpckhqdq %xmm15,%xmm2,%xmm2
+ vpunpcklqdq %xmm9,%xmm12,%xmm14
+ vpunpckhqdq %xmm9,%xmm12,%xmm12
+ vpaddd 256-256(%rcx),%xmm4,%xmm4
+ vpaddd 272-256(%rcx),%xmm5,%xmm5
+ vpaddd 288-256(%rcx),%xmm6,%xmm6
+ vpaddd 304-256(%rcx),%xmm7,%xmm7
+
+ vpunpckldq %xmm5,%xmm4,%xmm9
+ vpunpckldq %xmm7,%xmm6,%xmm15
+ vpunpckhdq %xmm5,%xmm4,%xmm4
+ vpunpckhdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm15,%xmm9,%xmm5
+ vpunpckhqdq %xmm15,%xmm9,%xmm9
+ vpunpcklqdq %xmm6,%xmm4,%xmm7
+ vpunpckhqdq %xmm6,%xmm4,%xmm4
+ vmovdqa 0(%rsp),%xmm6
+ vmovdqa 16(%rsp),%xmm15
+
+ cmpq $256,%rdx
+ jb .Ltail4xop
+
+ vpxor 0(%rsi),%xmm6,%xmm6
+ vpxor 16(%rsi),%xmm1,%xmm1
+ vpxor 32(%rsi),%xmm13,%xmm13
+ vpxor 48(%rsi),%xmm5,%xmm5
+ vpxor 64(%rsi),%xmm15,%xmm15
+ vpxor 80(%rsi),%xmm10,%xmm10
+ vpxor 96(%rsi),%xmm2,%xmm2
+ vpxor 112(%rsi),%xmm9,%xmm9
+ leaq 128(%rsi),%rsi
+ vpxor 0(%rsi),%xmm11,%xmm11
+ vpxor 16(%rsi),%xmm3,%xmm3
+ vpxor 32(%rsi),%xmm14,%xmm14
+ vpxor 48(%rsi),%xmm7,%xmm7
+ vpxor 64(%rsi),%xmm8,%xmm8
+ vpxor 80(%rsi),%xmm0,%xmm0
+ vpxor 96(%rsi),%xmm12,%xmm12
+ vpxor 112(%rsi),%xmm4,%xmm4
+ leaq 128(%rsi),%rsi
+
+ vmovdqu %xmm6,0(%rdi)
+ vmovdqu %xmm1,16(%rdi)
+ vmovdqu %xmm13,32(%rdi)
+ vmovdqu %xmm5,48(%rdi)
+ vmovdqu %xmm15,64(%rdi)
+ vmovdqu %xmm10,80(%rdi)
+ vmovdqu %xmm2,96(%rdi)
+ vmovdqu %xmm9,112(%rdi)
+ leaq 128(%rdi),%rdi
+ vmovdqu %xmm11,0(%rdi)
+ vmovdqu %xmm3,16(%rdi)
+ vmovdqu %xmm14,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ vmovdqu %xmm8,64(%rdi)
+ vmovdqu %xmm0,80(%rdi)
+ vmovdqu %xmm12,96(%rdi)
+ vmovdqu %xmm4,112(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $256,%rdx
+ jnz .Loop_outer4xop
+
+ jmp .Ldone4xop
+
+.align 32
+.Ltail4xop:
+ cmpq $192,%rdx
+ jae .L192_or_more4xop
+ cmpq $128,%rdx
+ jae .L128_or_more4xop
+ cmpq $64,%rdx
+ jae .L64_or_more4xop
+
+ xorq %r10,%r10
+ vmovdqa %xmm6,0(%rsp)
+ vmovdqa %xmm1,16(%rsp)
+ vmovdqa %xmm13,32(%rsp)
+ vmovdqa %xmm5,48(%rsp)
+ jmp .Loop_tail4xop
+
+.align 32
+.L64_or_more4xop:
+ vpxor 0(%rsi),%xmm6,%xmm6
+ vpxor 16(%rsi),%xmm1,%xmm1
+ vpxor 32(%rsi),%xmm13,%xmm13
+ vpxor 48(%rsi),%xmm5,%xmm5
+ vmovdqu %xmm6,0(%rdi)
+ vmovdqu %xmm1,16(%rdi)
+ vmovdqu %xmm13,32(%rdi)
+ vmovdqu %xmm5,48(%rdi)
+ je .Ldone4xop
+
+ leaq 64(%rsi),%rsi
+ vmovdqa %xmm15,0(%rsp)
+ xorq %r10,%r10
+ vmovdqa %xmm10,16(%rsp)
+ leaq 64(%rdi),%rdi
+ vmovdqa %xmm2,32(%rsp)
+ subq $64,%rdx
+ vmovdqa %xmm9,48(%rsp)
+ jmp .Loop_tail4xop
+
+.align 32
+.L128_or_more4xop:
+ vpxor 0(%rsi),%xmm6,%xmm6
+ vpxor 16(%rsi),%xmm1,%xmm1
+ vpxor 32(%rsi),%xmm13,%xmm13
+ vpxor 48(%rsi),%xmm5,%xmm5
+ vpxor 64(%rsi),%xmm15,%xmm15
+ vpxor 80(%rsi),%xmm10,%xmm10
+ vpxor 96(%rsi),%xmm2,%xmm2
+ vpxor 112(%rsi),%xmm9,%xmm9
+
+ vmovdqu %xmm6,0(%rdi)
+ vmovdqu %xmm1,16(%rdi)
+ vmovdqu %xmm13,32(%rdi)
+ vmovdqu %xmm5,48(%rdi)
+ vmovdqu %xmm15,64(%rdi)
+ vmovdqu %xmm10,80(%rdi)
+ vmovdqu %xmm2,96(%rdi)
+ vmovdqu %xmm9,112(%rdi)
+ je .Ldone4xop
+
+ leaq 128(%rsi),%rsi
+ vmovdqa %xmm11,0(%rsp)
+ xorq %r10,%r10
+ vmovdqa %xmm3,16(%rsp)
+ leaq 128(%rdi),%rdi
+ vmovdqa %xmm14,32(%rsp)
+ subq $128,%rdx
+ vmovdqa %xmm7,48(%rsp)
+ jmp .Loop_tail4xop
+
+.align 32
+.L192_or_more4xop:
+ vpxor 0(%rsi),%xmm6,%xmm6
+ vpxor 16(%rsi),%xmm1,%xmm1
+ vpxor 32(%rsi),%xmm13,%xmm13
+ vpxor 48(%rsi),%xmm5,%xmm5
+ vpxor 64(%rsi),%xmm15,%xmm15
+ vpxor 80(%rsi),%xmm10,%xmm10
+ vpxor 96(%rsi),%xmm2,%xmm2
+ vpxor 112(%rsi),%xmm9,%xmm9
+ leaq 128(%rsi),%rsi
+ vpxor 0(%rsi),%xmm11,%xmm11
+ vpxor 16(%rsi),%xmm3,%xmm3
+ vpxor 32(%rsi),%xmm14,%xmm14
+ vpxor 48(%rsi),%xmm7,%xmm7
+
+ vmovdqu %xmm6,0(%rdi)
+ vmovdqu %xmm1,16(%rdi)
+ vmovdqu %xmm13,32(%rdi)
+ vmovdqu %xmm5,48(%rdi)
+ vmovdqu %xmm15,64(%rdi)
+ vmovdqu %xmm10,80(%rdi)
+ vmovdqu %xmm2,96(%rdi)
+ vmovdqu %xmm9,112(%rdi)
+ leaq 128(%rdi),%rdi
+ vmovdqu %xmm11,0(%rdi)
+ vmovdqu %xmm3,16(%rdi)
+ vmovdqu %xmm14,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ je .Ldone4xop
+
+ leaq 64(%rsi),%rsi
+ vmovdqa %xmm8,0(%rsp)
+ xorq %r10,%r10
+ vmovdqa %xmm0,16(%rsp)
+ leaq 64(%rdi),%rdi
+ vmovdqa %xmm12,32(%rsp)
+ subq $192,%rdx
+ vmovdqa %xmm4,48(%rsp)
+
+.Loop_tail4xop:
+ movzbl (%rsi,%r10,1),%eax
+ movzbl (%rsp,%r10,1),%ecx
+ leaq 1(%r10),%r10
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r10,1)
+ decq %rdx
+ jnz .Loop_tail4xop
+
+.Ldone4xop:
+ vzeroupper
+ leaq (%r9),%rsp
+.cfi_def_cfa_register %rsp
+.L4xop_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ChaCha20_4xop,.-ChaCha20_4xop
+.type ChaCha20_8x,@function
+.align 32
+ChaCha20_8x:
+.cfi_startproc
+.LChaCha20_8x:
+ movq %rsp,%r9
+.cfi_def_cfa_register %r9
+ subq $0x280+8,%rsp
+ andq $-32,%rsp
+ vzeroupper
+
+
+
+
+
+
+
+
+
+
+ vbroadcasti128 .Lsigma(%rip),%ymm11
+ vbroadcasti128 (%rcx),%ymm3
+ vbroadcasti128 16(%rcx),%ymm15
+ vbroadcasti128 (%r8),%ymm7
+ leaq 256(%rsp),%rcx
+ leaq 512(%rsp),%rax
+ leaq .Lrot16(%rip),%r10
+ leaq .Lrot24(%rip),%r11
+
+ vpshufd $0x00,%ymm11,%ymm8
+ vpshufd $0x55,%ymm11,%ymm9
+ vmovdqa %ymm8,128-256(%rcx)
+ vpshufd $0xaa,%ymm11,%ymm10
+ vmovdqa %ymm9,160-256(%rcx)
+ vpshufd $0xff,%ymm11,%ymm11
+ vmovdqa %ymm10,192-256(%rcx)
+ vmovdqa %ymm11,224-256(%rcx)
+
+ vpshufd $0x00,%ymm3,%ymm0
+ vpshufd $0x55,%ymm3,%ymm1
+ vmovdqa %ymm0,256-256(%rcx)
+ vpshufd $0xaa,%ymm3,%ymm2
+ vmovdqa %ymm1,288-256(%rcx)
+ vpshufd $0xff,%ymm3,%ymm3
+ vmovdqa %ymm2,320-256(%rcx)
+ vmovdqa %ymm3,352-256(%rcx)
+
+ vpshufd $0x00,%ymm15,%ymm12
+ vpshufd $0x55,%ymm15,%ymm13
+ vmovdqa %ymm12,384-512(%rax)
+ vpshufd $0xaa,%ymm15,%ymm14
+ vmovdqa %ymm13,416-512(%rax)
+ vpshufd $0xff,%ymm15,%ymm15
+ vmovdqa %ymm14,448-512(%rax)
+ vmovdqa %ymm15,480-512(%rax)
+
+ vpshufd $0x00,%ymm7,%ymm4
+ vpshufd $0x55,%ymm7,%ymm5
+ vpaddd .Lincy(%rip),%ymm4,%ymm4
+ vpshufd $0xaa,%ymm7,%ymm6
+ vmovdqa %ymm5,544-512(%rax)
+ vpshufd $0xff,%ymm7,%ymm7
+ vmovdqa %ymm6,576-512(%rax)
+ vmovdqa %ymm7,608-512(%rax)
+
+ jmp .Loop_enter8x
+
+.align 32
+.Loop_outer8x:
+ vmovdqa 128-256(%rcx),%ymm8
+ vmovdqa 160-256(%rcx),%ymm9
+ vmovdqa 192-256(%rcx),%ymm10
+ vmovdqa 224-256(%rcx),%ymm11
+ vmovdqa 256-256(%rcx),%ymm0
+ vmovdqa 288-256(%rcx),%ymm1
+ vmovdqa 320-256(%rcx),%ymm2
+ vmovdqa 352-256(%rcx),%ymm3
+ vmovdqa 384-512(%rax),%ymm12
+ vmovdqa 416-512(%rax),%ymm13
+ vmovdqa 448-512(%rax),%ymm14
+ vmovdqa 480-512(%rax),%ymm15
+ vmovdqa 512-512(%rax),%ymm4
+ vmovdqa 544-512(%rax),%ymm5
+ vmovdqa 576-512(%rax),%ymm6
+ vmovdqa 608-512(%rax),%ymm7
+ vpaddd .Leight(%rip),%ymm4,%ymm4
+
+.Loop_enter8x:
+ vmovdqa %ymm14,64(%rsp)
+ vmovdqa %ymm15,96(%rsp)
+ vbroadcasti128 (%r10),%ymm15
+ vmovdqa %ymm4,512-512(%rax)
+ movl $10,%eax
+ jmp .Loop8x
+
+.align 32
+.Loop8x:
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $12,%ymm0,%ymm14
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $12,%ymm1,%ymm15
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $7,%ymm0,%ymm15
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $7,%ymm1,%ymm14
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vmovdqa %ymm12,0(%rsp)
+ vmovdqa %ymm13,32(%rsp)
+ vmovdqa 64(%rsp),%ymm12
+ vmovdqa 96(%rsp),%ymm13
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $12,%ymm2,%ymm14
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $12,%ymm3,%ymm15
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $7,%ymm2,%ymm15
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $7,%ymm3,%ymm14
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $12,%ymm1,%ymm14
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $12,%ymm2,%ymm15
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $7,%ymm1,%ymm15
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $7,%ymm2,%ymm14
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vmovdqa %ymm12,64(%rsp)
+ vmovdqa %ymm13,96(%rsp)
+ vmovdqa 0(%rsp),%ymm12
+ vmovdqa 32(%rsp),%ymm13
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $12,%ymm3,%ymm14
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $12,%ymm0,%ymm15
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $7,%ymm3,%ymm15
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $7,%ymm0,%ymm14
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ decl %eax
+ jnz .Loop8x
+
+ leaq 512(%rsp),%rax
+ vpaddd 128-256(%rcx),%ymm8,%ymm8
+ vpaddd 160-256(%rcx),%ymm9,%ymm9
+ vpaddd 192-256(%rcx),%ymm10,%ymm10
+ vpaddd 224-256(%rcx),%ymm11,%ymm11
+
+ vpunpckldq %ymm9,%ymm8,%ymm14
+ vpunpckldq %ymm11,%ymm10,%ymm15
+ vpunpckhdq %ymm9,%ymm8,%ymm8
+ vpunpckhdq %ymm11,%ymm10,%ymm10
+ vpunpcklqdq %ymm15,%ymm14,%ymm9
+ vpunpckhqdq %ymm15,%ymm14,%ymm14
+ vpunpcklqdq %ymm10,%ymm8,%ymm11
+ vpunpckhqdq %ymm10,%ymm8,%ymm8
+ vpaddd 256-256(%rcx),%ymm0,%ymm0
+ vpaddd 288-256(%rcx),%ymm1,%ymm1
+ vpaddd 320-256(%rcx),%ymm2,%ymm2
+ vpaddd 352-256(%rcx),%ymm3,%ymm3
+
+ vpunpckldq %ymm1,%ymm0,%ymm10
+ vpunpckldq %ymm3,%ymm2,%ymm15
+ vpunpckhdq %ymm1,%ymm0,%ymm0
+ vpunpckhdq %ymm3,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm10,%ymm1
+ vpunpckhqdq %ymm15,%ymm10,%ymm10
+ vpunpcklqdq %ymm2,%ymm0,%ymm3
+ vpunpckhqdq %ymm2,%ymm0,%ymm0
+ vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
+ vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
+ vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
+ vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
+ vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
+ vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
+ vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
+ vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
+ vmovdqa %ymm15,0(%rsp)
+ vmovdqa %ymm9,32(%rsp)
+ vmovdqa 64(%rsp),%ymm15
+ vmovdqa 96(%rsp),%ymm9
+
+ vpaddd 384-512(%rax),%ymm12,%ymm12
+ vpaddd 416-512(%rax),%ymm13,%ymm13
+ vpaddd 448-512(%rax),%ymm15,%ymm15
+ vpaddd 480-512(%rax),%ymm9,%ymm9
+
+ vpunpckldq %ymm13,%ymm12,%ymm2
+ vpunpckldq %ymm9,%ymm15,%ymm8
+ vpunpckhdq %ymm13,%ymm12,%ymm12
+ vpunpckhdq %ymm9,%ymm15,%ymm15
+ vpunpcklqdq %ymm8,%ymm2,%ymm13
+ vpunpckhqdq %ymm8,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm12,%ymm9
+ vpunpckhqdq %ymm15,%ymm12,%ymm12
+ vpaddd 512-512(%rax),%ymm4,%ymm4
+ vpaddd 544-512(%rax),%ymm5,%ymm5
+ vpaddd 576-512(%rax),%ymm6,%ymm6
+ vpaddd 608-512(%rax),%ymm7,%ymm7
+
+ vpunpckldq %ymm5,%ymm4,%ymm15
+ vpunpckldq %ymm7,%ymm6,%ymm8
+ vpunpckhdq %ymm5,%ymm4,%ymm4
+ vpunpckhdq %ymm7,%ymm6,%ymm6
+ vpunpcklqdq %ymm8,%ymm15,%ymm5
+ vpunpckhqdq %ymm8,%ymm15,%ymm15
+ vpunpcklqdq %ymm6,%ymm4,%ymm7
+ vpunpckhqdq %ymm6,%ymm4,%ymm4
+ vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
+ vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
+ vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
+ vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
+ vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
+ vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
+ vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
+ vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
+ vmovdqa 0(%rsp),%ymm6
+ vmovdqa 32(%rsp),%ymm12
+
+ cmpq $512,%rdx
+ jb .Ltail8x
+
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm12,%ymm12
+ vpxor 32(%rsi),%ymm13,%ymm13
+ vpxor 64(%rsi),%ymm10,%ymm10
+ vpxor 96(%rsi),%ymm15,%ymm15
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm12,0(%rdi)
+ vmovdqu %ymm13,32(%rdi)
+ vmovdqu %ymm10,64(%rdi)
+ vmovdqu %ymm15,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm14,%ymm14
+ vpxor 32(%rsi),%ymm2,%ymm2
+ vpxor 64(%rsi),%ymm3,%ymm3
+ vpxor 96(%rsi),%ymm7,%ymm7
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm14,0(%rdi)
+ vmovdqu %ymm2,32(%rdi)
+ vmovdqu %ymm3,64(%rdi)
+ vmovdqu %ymm7,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm11,%ymm11
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vpxor 64(%rsi),%ymm0,%ymm0
+ vpxor 96(%rsi),%ymm4,%ymm4
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm11,0(%rdi)
+ vmovdqu %ymm9,32(%rdi)
+ vmovdqu %ymm0,64(%rdi)
+ vmovdqu %ymm4,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $512,%rdx
+ jnz .Loop_outer8x
+
+ jmp .Ldone8x
+
+.Ltail8x:
+ cmpq $448,%rdx
+ jae .L448_or_more8x
+ cmpq $384,%rdx
+ jae .L384_or_more8x
+ cmpq $320,%rdx
+ jae .L320_or_more8x
+ cmpq $256,%rdx
+ jae .L256_or_more8x
+ cmpq $192,%rdx
+ jae .L192_or_more8x
+ cmpq $128,%rdx
+ jae .L128_or_more8x
+ cmpq $64,%rdx
+ jae .L64_or_more8x
+
+ xorq %r10,%r10
+ vmovdqa %ymm6,0(%rsp)
+ vmovdqa %ymm8,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L64_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ je .Ldone8x
+
+ leaq 64(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm1,0(%rsp)
+ leaq 64(%rdi),%rdi
+ subq $64,%rdx
+ vmovdqa %ymm5,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L128_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ je .Ldone8x
+
+ leaq 128(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm12,0(%rsp)
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ vmovdqa %ymm13,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L192_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ je .Ldone8x
+
+ leaq 192(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm10,0(%rsp)
+ leaq 192(%rdi),%rdi
+ subq $192,%rdx
+ vmovdqa %ymm15,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L256_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ je .Ldone8x
+
+ leaq 256(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm14,0(%rsp)
+ leaq 256(%rdi),%rdi
+ subq $256,%rdx
+ vmovdqa %ymm2,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L320_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ je .Ldone8x
+
+ leaq 320(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm3,0(%rsp)
+ leaq 320(%rdi),%rdi
+ subq $320,%rdx
+ vmovdqa %ymm7,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L384_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ je .Ldone8x
+
+ leaq 384(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm11,0(%rsp)
+ leaq 384(%rdi),%rdi
+ subq $384,%rdx
+ vmovdqa %ymm9,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L448_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vpxor 384(%rsi),%ymm11,%ymm11
+ vpxor 416(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ vmovdqu %ymm11,384(%rdi)
+ vmovdqu %ymm9,416(%rdi)
+ je .Ldone8x
+
+ leaq 448(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm0,0(%rsp)
+ leaq 448(%rdi),%rdi
+ subq $448,%rdx
+ vmovdqa %ymm4,32(%rsp)
+
+.Loop_tail8x:
+ movzbl (%rsi,%r10,1),%eax
+ movzbl (%rsp,%r10,1),%ecx
+ leaq 1(%r10),%r10
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r10,1)
+ decq %rdx
+ jnz .Loop_tail8x
+
+.Ldone8x:
+ vzeroall
+ leaq (%r9),%rsp
+.cfi_def_cfa_register %rsp
+.L8x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ChaCha20_8x,.-ChaCha20_8x
diff --git a/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S b/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S
index c69b4d978f39..df18fa496de4 100644
--- a/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S
+++ b/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S
@@ -2790,6 +2790,10 @@ ecp_nistz256_neg:
.align 32
ecp_nistz256_ord_mul_mont:
.cfi_startproc
+ movl $0x80100,%ecx
+ andl OPENSSL_ia32cap_P+8(%rip),%ecx
+ cmpl $0x80100,%ecx
+ je .Lecp_nistz256_ord_mul_montx
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -3118,6 +3122,10 @@ ecp_nistz256_ord_mul_mont:
.align 32
ecp_nistz256_ord_sqr_mont:
.cfi_startproc
+ movl $0x80100,%ecx
+ andl OPENSSL_ia32cap_P+8(%rip),%ecx
+ cmpl $0x80100,%ecx
+ je .Lecp_nistz256_ord_sqr_montx
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -3405,6 +3413,462 @@ ecp_nistz256_ord_sqr_mont:
.cfi_endproc
.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+.type ecp_nistz256_ord_mul_montx,@function
+.align 32
+ecp_nistz256_ord_mul_montx:
+.cfi_startproc
+.Lecp_nistz256_ord_mul_montx:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lord_mulx_body:
+
+ movq %rdx,%rbx
+ movq 0(%rdx),%rdx
+ movq 0(%rsi),%r9
+ movq 8(%rsi),%r10
+ movq 16(%rsi),%r11
+ movq 24(%rsi),%r12
+ leaq -128(%rsi),%rsi
+ leaq .Lord-128(%rip),%r14
+ movq .LordK(%rip),%r15
+
+
+ mulxq %r9,%r8,%r9
+ mulxq %r10,%rcx,%r10
+ mulxq %r11,%rbp,%r11
+ addq %rcx,%r9
+ mulxq %r12,%rcx,%r12
+ movq %r8,%rdx
+ mulxq %r15,%rdx,%rax
+ adcq %rbp,%r10
+ adcq %rcx,%r11
+ adcq $0,%r12
+
+
+ xorq %r13,%r13
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 8(%rbx),%rdx
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+ adcxq %r8,%r12
+ adoxq %r8,%r13
+ adcq $0,%r13
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r9,%rdx
+ mulxq %r15,%rdx,%rax
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ adcxq %r8,%r13
+ adoxq %r8,%r8
+ adcq $0,%r8
+
+
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 16(%rbx),%rdx
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+ adcxq %r9,%r13
+ adoxq %r9,%r8
+ adcq $0,%r8
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r10,%rdx
+ mulxq %r15,%rdx,%rax
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ adcxq %r9,%r8
+ adoxq %r9,%r9
+ adcq $0,%r9
+
+
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 24(%rbx),%rdx
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+ adcxq %r10,%r8
+ adoxq %r10,%r9
+ adcq $0,%r9
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r11,%rdx
+ mulxq %r15,%rdx,%rax
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+
+ adcxq %r10,%r9
+ adoxq %r10,%r10
+ adcq $0,%r10
+
+
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ leaq 128(%r14),%r14
+ movq %r12,%rbx
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ movq %r13,%rdx
+ adcxq %r11,%r9
+ adoxq %r11,%r10
+ adcq $0,%r10
+
+
+
+ movq %r8,%rcx
+ subq 0(%r14),%r12
+ sbbq 8(%r14),%r13
+ sbbq 16(%r14),%r8
+ movq %r9,%rbp
+ sbbq 24(%r14),%r9
+ sbbq $0,%r10
+
+ cmovcq %rbx,%r12
+ cmovcq %rdx,%r13
+ cmovcq %rcx,%r8
+ cmovcq %rbp,%r9
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbx
+.cfi_restore %rbx
+ movq 40(%rsp),%rbp
+.cfi_restore %rbp
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_mulx_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
+
+.type ecp_nistz256_ord_sqr_montx,@function
+.align 32
+ecp_nistz256_ord_sqr_montx:
+.cfi_startproc
+.Lecp_nistz256_ord_sqr_montx:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lord_sqrx_body:
+
+ movq %rdx,%rbx
+ movq 0(%rsi),%rdx
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r15
+ movq 24(%rsi),%r8
+ leaq .Lord(%rip),%rsi
+ jmp .Loop_ord_sqrx
+
+.align 32
+.Loop_ord_sqrx:
+ mulxq %r14,%r9,%r10
+ mulxq %r15,%rcx,%r11
+ movq %rdx,%rax
+.byte 102,73,15,110,206
+ mulxq %r8,%rbp,%r12
+ movq %r14,%rdx
+ addq %rcx,%r10
+.byte 102,73,15,110,215
+ adcq %rbp,%r11
+ adcq $0,%r12
+ xorq %r13,%r13
+
+ mulxq %r15,%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq %r8,%rcx,%rbp
+ movq %r15,%rdx
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+ adcq $0,%r13
+
+ mulxq %r8,%rcx,%r14
+ movq %rax,%rdx
+.byte 102,73,15,110,216
+ xorq %r15,%r15
+ adcxq %r9,%r9
+ adoxq %rcx,%r13
+ adcxq %r10,%r10
+ adoxq %r15,%r14
+
+
+ mulxq %rdx,%r8,%rbp
+.byte 102,72,15,126,202
+ adcxq %r11,%r11
+ adoxq %rbp,%r9
+ adcxq %r12,%r12
+ mulxq %rdx,%rcx,%rax
+.byte 102,72,15,126,210
+ adcxq %r13,%r13
+ adoxq %rcx,%r10
+ adcxq %r14,%r14
+ mulxq %rdx,%rcx,%rbp
+.byte 0x67
+.byte 102,72,15,126,218
+ adoxq %rax,%r11
+ adcxq %r15,%r15
+ adoxq %rcx,%r12
+ adoxq %rbp,%r13
+ mulxq %rdx,%rcx,%rax
+ adoxq %rcx,%r14
+ adoxq %rax,%r15
+
+
+ movq %r8,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ xorq %rax,%rax
+ mulxq 0(%rsi),%rcx,%rbp
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ mulxq 8(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+ mulxq 16(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+ mulxq 24(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r8
+ adcxq %rax,%r8
+
+
+ movq %r9,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ mulxq 0(%rsi),%rcx,%rbp
+ adoxq %rcx,%r9
+ adcxq %rbp,%r10
+ mulxq 8(%rsi),%rcx,%rbp
+ adoxq %rcx,%r10
+ adcxq %rbp,%r11
+ mulxq 16(%rsi),%rcx,%rbp
+ adoxq %rcx,%r11
+ adcxq %rbp,%r8
+ mulxq 24(%rsi),%rcx,%rbp
+ adoxq %rcx,%r8
+ adcxq %rbp,%r9
+ adoxq %rax,%r9
+
+
+ movq %r10,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ mulxq 0(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+ mulxq 8(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r8
+ mulxq 16(%rsi),%rcx,%rbp
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ mulxq 24(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+ adcxq %rax,%r10
+
+
+ movq %r11,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ mulxq 0(%rsi),%rcx,%rbp
+ adoxq %rcx,%r11
+ adcxq %rbp,%r8
+ mulxq 8(%rsi),%rcx,%rbp
+ adoxq %rcx,%r8
+ adcxq %rbp,%r9
+ mulxq 16(%rsi),%rcx,%rbp
+ adoxq %rcx,%r9
+ adcxq %rbp,%r10
+ mulxq 24(%rsi),%rcx,%rbp
+ adoxq %rcx,%r10
+ adcxq %rbp,%r11
+ adoxq %rax,%r11
+
+
+ addq %r8,%r12
+ adcq %r13,%r9
+ movq %r12,%rdx
+ adcq %r14,%r10
+ adcq %r15,%r11
+ movq %r9,%r14
+ adcq $0,%rax
+
+
+ subq 0(%rsi),%r12
+ movq %r10,%r15
+ sbbq 8(%rsi),%r9
+ sbbq 16(%rsi),%r10
+ movq %r11,%r8
+ sbbq 24(%rsi),%r11
+ sbbq $0,%rax
+
+ cmovncq %r12,%rdx
+ cmovncq %r9,%r14
+ cmovncq %r10,%r15
+ cmovncq %r11,%r8
+
+ decq %rbx
+ jnz .Loop_ord_sqrx
+
+ movq %rdx,0(%rdi)
+ movq %r14,8(%rdi)
+ pxor %xmm1,%xmm1
+ movq %r15,16(%rdi)
+ pxor %xmm2,%xmm2
+ movq %r8,24(%rdi)
+ pxor %xmm3,%xmm3
+
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbx
+.cfi_restore %rbx
+ movq 40(%rsp),%rbp
+.cfi_restore %rbp
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_sqrx_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
+
@@ -3413,6 +3877,8 @@ ecp_nistz256_ord_sqr_mont:
.align 32
ecp_nistz256_to_mont:
.cfi_startproc
+ movl $0x80100,%ecx
+ andl OPENSSL_ia32cap_P+8(%rip),%ecx
leaq .LRR(%rip),%rdx
jmp .Lmul_mont
.cfi_endproc
@@ -3429,6 +3895,8 @@ ecp_nistz256_to_mont:
.align 32
ecp_nistz256_mul_mont:
.cfi_startproc
+ movl $0x80100,%ecx
+ andl OPENSSL_ia32cap_P+8(%rip),%ecx
.Lmul_mont:
pushq %rbp
.cfi_adjust_cfa_offset 8
@@ -3449,6 +3917,8 @@ ecp_nistz256_mul_mont:
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lmul_body:
+ cmpl $0x80100,%ecx
+ je .Lmul_montx
movq %rdx,%rbx
movq 0(%rdx),%rax
movq 0(%rsi),%r9
@@ -3457,6 +3927,19 @@ ecp_nistz256_mul_mont:
movq 24(%rsi),%r12
call __ecp_nistz256_mul_montq
+ jmp .Lmul_mont_done
+
+.align 32
+.Lmul_montx:
+ movq %rdx,%rbx
+ movq 0(%rdx),%rdx
+ movq 0(%rsi),%r9
+ movq 8(%rsi),%r10
+ movq 16(%rsi),%r11
+ movq 24(%rsi),%r12
+ leaq -128(%rsi),%rsi
+
+ call __ecp_nistz256_mul_montx
.Lmul_mont_done:
movq 0(%rsp),%r15
.cfi_restore %r15
@@ -3707,6 +4190,8 @@ __ecp_nistz256_mul_montq:
.align 32
ecp_nistz256_sqr_mont:
.cfi_startproc
+ movl $0x80100,%ecx
+ andl OPENSSL_ia32cap_P+8(%rip),%ecx
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -3726,12 +4211,25 @@ ecp_nistz256_sqr_mont:
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lsqr_body:
+ cmpl $0x80100,%ecx
+ je .Lsqr_montx
movq 0(%rsi),%rax
movq 8(%rsi),%r14
movq 16(%rsi),%r15
movq 24(%rsi),%r8
call __ecp_nistz256_sqr_montq
+ jmp .Lsqr_mont_done
+
+.align 32
+.Lsqr_montx:
+ movq 0(%rsi),%rdx
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r15
+ movq 24(%rsi),%r8
+ leaq -128(%rsi),%rsi
+
+ call __ecp_nistz256_sqr_montx
.Lsqr_mont_done:
movq 0(%rsp),%r15
.cfi_restore %r15
@@ -3915,6 +4413,304 @@ __ecp_nistz256_sqr_montq:
.byte 0xf3,0xc3
.cfi_endproc
.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
+.type __ecp_nistz256_mul_montx,@function
+.align 32
+__ecp_nistz256_mul_montx:
+.cfi_startproc
+
+
+ mulxq %r9,%r8,%r9
+ mulxq %r10,%rcx,%r10
+ movq $32,%r14
+ xorq %r13,%r13
+ mulxq %r11,%rbp,%r11
+ movq .Lpoly+24(%rip),%r15
+ adcq %rcx,%r9
+ mulxq %r12,%rcx,%r12
+ movq %r8,%rdx
+ adcq %rbp,%r10
+ shlxq %r14,%r8,%rbp
+ adcq %rcx,%r11
+ shrxq %r14,%r8,%rcx
+ adcq $0,%r12
+
+
+
+ addq %rbp,%r9
+ adcq %rcx,%r10
+
+ mulxq %r15,%rcx,%rbp
+ movq 8(%rbx),%rdx
+ adcq %rcx,%r11
+ adcq %rbp,%r12
+ adcq $0,%r13
+ xorq %r8,%r8
+
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r9,%rdx
+ adcxq %rcx,%r12
+ shlxq %r14,%r9,%rcx
+ adoxq %rbp,%r13
+ shrxq %r14,%r9,%rbp
+
+ adcxq %r8,%r13
+ adoxq %r8,%r8
+ adcq $0,%r8
+
+
+
+ addq %rcx,%r10
+ adcq %rbp,%r11
+
+ mulxq %r15,%rcx,%rbp
+ movq 16(%rbx),%rdx
+ adcq %rcx,%r12
+ adcq %rbp,%r13
+ adcq $0,%r8
+ xorq %r9,%r9
+
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r10,%rdx
+ adcxq %rcx,%r13
+ shlxq %r14,%r10,%rcx
+ adoxq %rbp,%r8
+ shrxq %r14,%r10,%rbp
+
+ adcxq %r9,%r8
+ adoxq %r9,%r9
+ adcq $0,%r9
+
+
+
+ addq %rcx,%r11
+ adcq %rbp,%r12
+
+ mulxq %r15,%rcx,%rbp
+ movq 24(%rbx),%rdx
+ adcq %rcx,%r13
+ adcq %rbp,%r8
+ adcq $0,%r9
+ xorq %r10,%r10
+
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r11,%rdx
+ adcxq %rcx,%r8
+ shlxq %r14,%r11,%rcx
+ adoxq %rbp,%r9
+ shrxq %r14,%r11,%rbp
+
+ adcxq %r10,%r9
+ adoxq %r10,%r10
+ adcq $0,%r10
+
+
+
+ addq %rcx,%r12
+ adcq %rbp,%r13
+
+ mulxq %r15,%rcx,%rbp
+ movq %r12,%rbx
+ movq .Lpoly+8(%rip),%r14
+ adcq %rcx,%r8
+ movq %r13,%rdx
+ adcq %rbp,%r9
+ adcq $0,%r10
+
+
+
+ xorl %eax,%eax
+ movq %r8,%rcx
+ sbbq $-1,%r12
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%rbp
+ sbbq %r15,%r9
+ sbbq $0,%r10
+
+ cmovcq %rbx,%r12
+ cmovcq %rdx,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %rbp,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
+
+.type __ecp_nistz256_sqr_montx,@function
+.align 32
+__ecp_nistz256_sqr_montx:
+.cfi_startproc
+ mulxq %r14,%r9,%r10
+ mulxq %r15,%rcx,%r11
+ xorl %eax,%eax
+ adcq %rcx,%r10
+ mulxq %r8,%rbp,%r12
+ movq %r14,%rdx
+ adcq %rbp,%r11
+ adcq $0,%r12
+ xorq %r13,%r13
+
+
+ mulxq %r15,%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq %r8,%rcx,%rbp
+ movq %r15,%rdx
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+ adcq $0,%r13
+
+
+ mulxq %r8,%rcx,%r14
+ movq 0+128(%rsi),%rdx
+ xorq %r15,%r15
+ adcxq %r9,%r9
+ adoxq %rcx,%r13
+ adcxq %r10,%r10
+ adoxq %r15,%r14
+
+ mulxq %rdx,%r8,%rbp
+ movq 8+128(%rsi),%rdx
+ adcxq %r11,%r11
+ adoxq %rbp,%r9
+ adcxq %r12,%r12
+ mulxq %rdx,%rcx,%rax
+ movq 16+128(%rsi),%rdx
+ adcxq %r13,%r13
+ adoxq %rcx,%r10
+ adcxq %r14,%r14
+.byte 0x67
+ mulxq %rdx,%rcx,%rbp
+ movq 24+128(%rsi),%rdx
+ adoxq %rax,%r11
+ adcxq %r15,%r15
+ adoxq %rcx,%r12
+ movq $32,%rsi
+ adoxq %rbp,%r13
+.byte 0x67,0x67
+ mulxq %rdx,%rcx,%rax
+ movq .Lpoly+24(%rip),%rdx
+ adoxq %rcx,%r14
+ shlxq %rsi,%r8,%rcx
+ adoxq %rax,%r15
+ shrxq %rsi,%r8,%rax
+ movq %rdx,%rbp
+
+
+ addq %rcx,%r9
+ adcq %rax,%r10
+
+ mulxq %r8,%rcx,%r8
+ adcq %rcx,%r11
+ shlxq %rsi,%r9,%rcx
+ adcq $0,%r8
+ shrxq %rsi,%r9,%rax
+
+
+ addq %rcx,%r10
+ adcq %rax,%r11
+
+ mulxq %r9,%rcx,%r9
+ adcq %rcx,%r8
+ shlxq %rsi,%r10,%rcx
+ adcq $0,%r9
+ shrxq %rsi,%r10,%rax
+
+
+ addq %rcx,%r11
+ adcq %rax,%r8
+
+ mulxq %r10,%rcx,%r10
+ adcq %rcx,%r9
+ shlxq %rsi,%r11,%rcx
+ adcq $0,%r10
+ shrxq %rsi,%r11,%rax
+
+
+ addq %rcx,%r8
+ adcq %rax,%r9
+
+ mulxq %r11,%rcx,%r11
+ adcq %rcx,%r10
+ adcq $0,%r11
+
+ xorq %rdx,%rdx
+ addq %r8,%r12
+ movq .Lpoly+8(%rip),%rsi
+ adcq %r9,%r13
+ movq %r12,%r8
+ adcq %r10,%r14
+ adcq %r11,%r15
+ movq %r13,%r9
+ adcq $0,%rdx
+
+ subq $-1,%r12
+ movq %r14,%r10
+ sbbq %rsi,%r13
+ sbbq $0,%r14
+ movq %r15,%r11
+ sbbq %rbp,%r15
+ sbbq $0,%rdx
+
+ cmovcq %r8,%r12
+ cmovcq %r9,%r13
+ movq %r12,0(%rdi)
+ cmovcq %r10,%r14
+ movq %r13,8(%rdi)
+ cmovcq %r11,%r15
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
@@ -4056,6 +4852,9 @@ ecp_nistz256_scatter_w5:
.align 32
ecp_nistz256_gather_w5:
.cfi_startproc
+ movl OPENSSL_ia32cap_P+8(%rip),%eax
+ testl $32,%eax
+ jnz .Lavx2_gather_w5
movdqa .LOne(%rip),%xmm0
movd %edx,%xmm1
@@ -4139,6 +4938,9 @@ ecp_nistz256_scatter_w7:
.align 32
ecp_nistz256_gather_w7:
.cfi_startproc
+ movl OPENSSL_ia32cap_P+8(%rip),%eax
+ testl $32,%eax
+ jnz .Lavx2_gather_w7
movdqa .LOne(%rip),%xmm8
movd %edx,%xmm1
@@ -4182,14 +4984,148 @@ ecp_nistz256_gather_w7:
.cfi_endproc
.LSEH_end_ecp_nistz256_gather_w7:
.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
+
+
+.type ecp_nistz256_avx2_gather_w5,@function
+.align 32
+ecp_nistz256_avx2_gather_w5:
+.cfi_startproc
+.Lavx2_gather_w5:
+ vzeroupper
+ vmovdqa .LTwo(%rip),%ymm0
+
+ vpxor %ymm2,%ymm2,%ymm2
+ vpxor %ymm3,%ymm3,%ymm3
+ vpxor %ymm4,%ymm4,%ymm4
+
+ vmovdqa .LOne(%rip),%ymm5
+ vmovdqa .LTwo(%rip),%ymm10
+
+ vmovd %edx,%xmm1
+ vpermd %ymm1,%ymm2,%ymm1
+
+ movq $8,%rax
+.Lselect_loop_avx2_w5:
+
+ vmovdqa 0(%rsi),%ymm6
+ vmovdqa 32(%rsi),%ymm7
+ vmovdqa 64(%rsi),%ymm8
+
+ vmovdqa 96(%rsi),%ymm11
+ vmovdqa 128(%rsi),%ymm12
+ vmovdqa 160(%rsi),%ymm13
+
+ vpcmpeqd %ymm1,%ymm5,%ymm9
+ vpcmpeqd %ymm1,%ymm10,%ymm14
+
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm10,%ymm10
+ leaq 192(%rsi),%rsi
+
+ vpand %ymm9,%ymm6,%ymm6
+ vpand %ymm9,%ymm7,%ymm7
+ vpand %ymm9,%ymm8,%ymm8
+ vpand %ymm14,%ymm11,%ymm11
+ vpand %ymm14,%ymm12,%ymm12
+ vpand %ymm14,%ymm13,%ymm13
+
+ vpxor %ymm6,%ymm2,%ymm2
+ vpxor %ymm7,%ymm3,%ymm3
+ vpxor %ymm8,%ymm4,%ymm4
+ vpxor %ymm11,%ymm2,%ymm2
+ vpxor %ymm12,%ymm3,%ymm3
+ vpxor %ymm13,%ymm4,%ymm4
+
+ decq %rax
+ jnz .Lselect_loop_avx2_w5
+
+ vmovdqu %ymm2,0(%rdi)
+ vmovdqu %ymm3,32(%rdi)
+ vmovdqu %ymm4,64(%rdi)
+ vzeroupper
+ .byte 0xf3,0xc3
+.cfi_endproc
+.LSEH_end_ecp_nistz256_avx2_gather_w5:
+.size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5
+
+
+
.globl ecp_nistz256_avx2_gather_w7
.type ecp_nistz256_avx2_gather_w7,@function
.align 32
ecp_nistz256_avx2_gather_w7:
.cfi_startproc
-.byte 0x0f,0x0b
+.Lavx2_gather_w7:
+ vzeroupper
+ vmovdqa .LThree(%rip),%ymm0
+
+ vpxor %ymm2,%ymm2,%ymm2
+ vpxor %ymm3,%ymm3,%ymm3
+
+ vmovdqa .LOne(%rip),%ymm4
+ vmovdqa .LTwo(%rip),%ymm8
+ vmovdqa .LThree(%rip),%ymm12
+
+ vmovd %edx,%xmm1
+ vpermd %ymm1,%ymm2,%ymm1
+
+
+ movq $21,%rax
+.Lselect_loop_avx2_w7:
+
+ vmovdqa 0(%rsi),%ymm5
+ vmovdqa 32(%rsi),%ymm6
+
+ vmovdqa 64(%rsi),%ymm9
+ vmovdqa 96(%rsi),%ymm10
+
+ vmovdqa 128(%rsi),%ymm13
+ vmovdqa 160(%rsi),%ymm14
+
+ vpcmpeqd %ymm1,%ymm4,%ymm7
+ vpcmpeqd %ymm1,%ymm8,%ymm11
+ vpcmpeqd %ymm1,%ymm12,%ymm15
+
+ vpaddd %ymm0,%ymm4,%ymm4
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpaddd %ymm0,%ymm12,%ymm12
+ leaq 192(%rsi),%rsi
+
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+ vpand %ymm11,%ymm9,%ymm9
+ vpand %ymm11,%ymm10,%ymm10
+ vpand %ymm15,%ymm13,%ymm13
+ vpand %ymm15,%ymm14,%ymm14
+
+ vpxor %ymm5,%ymm2,%ymm2
+ vpxor %ymm6,%ymm3,%ymm3
+ vpxor %ymm9,%ymm2,%ymm2
+ vpxor %ymm10,%ymm3,%ymm3
+ vpxor %ymm13,%ymm2,%ymm2
+ vpxor %ymm14,%ymm3,%ymm3
+
+ decq %rax
+ jnz .Lselect_loop_avx2_w7
+
+
+ vmovdqa 0(%rsi),%ymm5
+ vmovdqa 32(%rsi),%ymm6
+
+ vpcmpeqd %ymm1,%ymm4,%ymm7
+
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+
+ vpxor %ymm5,%ymm2,%ymm2
+ vpxor %ymm6,%ymm3,%ymm3
+
+ vmovdqu %ymm2,0(%rdi)
+ vmovdqu %ymm3,32(%rdi)
+ vzeroupper
.byte 0xf3,0xc3
.cfi_endproc
+.LSEH_end_ecp_nistz256_avx2_gather_w7:
.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
.type __ecp_nistz256_add_toq,@function
.align 32
@@ -4325,6 +5261,10 @@ __ecp_nistz256_mul_by_2q:
.align 32
ecp_nistz256_point_double:
.cfi_startproc
+ movl $0x80100,%ecx
+ andl OPENSSL_ia32cap_P+8(%rip),%ecx
+ cmpl $0x80100,%ecx
+ je .Lpoint_doublex
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -4553,6 +5493,10 @@ ecp_nistz256_point_double:
.align 32
ecp_nistz256_point_add:
.cfi_startproc
+ movl $0x80100,%ecx
+ andl OPENSSL_ia32cap_P+8(%rip),%ecx
+ cmpl $0x80100,%ecx
+ je .Lpoint_addx
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -4967,6 +5911,10 @@ ecp_nistz256_point_add:
.align 32
ecp_nistz256_point_add_affine:
.cfi_startproc
+ movl $0x80100,%ecx
+ andl OPENSSL_ia32cap_P+8(%rip),%ecx
+ cmpl $0x80100,%ecx
+ je .Lpoint_add_affinex
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -5290,3 +6238,1108 @@ ecp_nistz256_point_add_affine:
.byte 0xf3,0xc3
.cfi_endproc
.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
+.type __ecp_nistz256_add_tox,@function
+.align 32
+__ecp_nistz256_add_tox:
+.cfi_startproc
+ xorq %r11,%r11
+ adcq 0(%rbx),%r12
+ adcq 8(%rbx),%r13
+ movq %r12,%rax
+ adcq 16(%rbx),%r8
+ adcq 24(%rbx),%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ xorq %r10,%r10
+ sbbq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
+
+.type __ecp_nistz256_sub_fromx,@function
+.align 32
+__ecp_nistz256_sub_fromx:
+.cfi_startproc
+ xorq %r11,%r11
+ sbbq 0(%rbx),%r12
+ sbbq 8(%rbx),%r13
+ movq %r12,%rax
+ sbbq 16(%rbx),%r8
+ sbbq 24(%rbx),%r9
+ movq %r13,%rbp
+ sbbq $0,%r11
+
+ xorq %r10,%r10
+ adcq $-1,%r12
+ movq %r8,%rcx
+ adcq %r14,%r13
+ adcq $0,%r8
+ movq %r9,%r10
+ adcq %r15,%r9
+
+ btq $0,%r11
+ cmovncq %rax,%r12
+ cmovncq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovncq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovncq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
+
+.type __ecp_nistz256_subx,@function
+.align 32
+__ecp_nistz256_subx:
+.cfi_startproc
+ xorq %r11,%r11
+ sbbq %r12,%rax
+ sbbq %r13,%rbp
+ movq %rax,%r12
+ sbbq %r8,%rcx
+ sbbq %r9,%r10
+ movq %rbp,%r13
+ sbbq $0,%r11
+
+ xorq %r9,%r9
+ adcq $-1,%rax
+ movq %rcx,%r8
+ adcq %r14,%rbp
+ adcq $0,%rcx
+ movq %r10,%r9
+ adcq %r15,%r10
+
+ btq $0,%r11
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ cmovcq %rcx,%r8
+ cmovcq %r10,%r9
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __ecp_nistz256_subx,.-__ecp_nistz256_subx
+
+.type __ecp_nistz256_mul_by_2x,@function
+.align 32
+__ecp_nistz256_mul_by_2x:
+.cfi_startproc
+ xorq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ xorq %r10,%r10
+ sbbq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
+.type ecp_nistz256_point_doublex,@function
+.align 32
+ecp_nistz256_point_doublex:
+.cfi_startproc
+.Lpoint_doublex:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $160+8,%rsp
+.cfi_adjust_cfa_offset 32*5+8
+.Lpoint_doublex_body:
+
+.Lpoint_double_shortcutx:
+ movdqu 0(%rsi),%xmm0
+ movq %rsi,%rbx
+ movdqu 16(%rsi),%xmm1
+ movq 32+0(%rsi),%r12
+ movq 32+8(%rsi),%r13
+ movq 32+16(%rsi),%r8
+ movq 32+24(%rsi),%r9
+ movq .Lpoly+8(%rip),%r14
+ movq .Lpoly+24(%rip),%r15
+ movdqa %xmm0,96(%rsp)
+ movdqa %xmm1,96+16(%rsp)
+ leaq 32(%rdi),%r10
+ leaq 64(%rdi),%r11
+.byte 102,72,15,110,199
+.byte 102,73,15,110,202
+.byte 102,73,15,110,211
+
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2x
+
+ movq 64+0(%rsi),%rdx
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ leaq 64-128(%rsi),%rsi
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 0+0(%rsp),%rdx
+ movq 8+0(%rsp),%r14
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 32(%rbx),%rdx
+ movq 64+0(%rbx),%r9
+ movq 64+8(%rbx),%r10
+ movq 64+16(%rbx),%r11
+ movq 64+24(%rbx),%r12
+ leaq 64-128(%rbx),%rsi
+ leaq 32(%rbx),%rbx
+.byte 102,72,15,126,215
+ call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_by_2x
+
+ movq 96+0(%rsp),%r12
+ movq 96+8(%rsp),%r13
+ leaq 64(%rsp),%rbx
+ movq 96+16(%rsp),%r8
+ movq 96+24(%rsp),%r9
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_add_tox
+
+ movq 96+0(%rsp),%r12
+ movq 96+8(%rsp),%r13
+ leaq 64(%rsp),%rbx
+ movq 96+16(%rsp),%r8
+ movq 96+24(%rsp),%r9
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+0(%rsp),%rdx
+ movq 8+0(%rsp),%r14
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+.byte 102,72,15,126,207
+ call __ecp_nistz256_sqr_montx
+ xorq %r9,%r9
+ movq %r12,%rax
+ addq $-1,%r12
+ movq %r13,%r10
+ adcq %rsi,%r13
+ movq %r14,%rcx
+ adcq $0,%r14
+ movq %r15,%r8
+ adcq %rbp,%r15
+ adcq $0,%r9
+ xorq %rsi,%rsi
+ testq $1,%rax
+
+ cmovzq %rax,%r12
+ cmovzq %r10,%r13
+ cmovzq %rcx,%r14
+ cmovzq %r8,%r15
+ cmovzq %rsi,%r9
+
+ movq %r13,%rax
+ shrq $1,%r12
+ shlq $63,%rax
+ movq %r14,%r10
+ shrq $1,%r13
+ orq %rax,%r12
+ shlq $63,%r10
+ movq %r15,%rcx
+ shrq $1,%r14
+ orq %r10,%r13
+ shlq $63,%rcx
+ movq %r12,0(%rdi)
+ shrq $1,%r15
+ movq %r13,8(%rdi)
+ shlq $63,%r9
+ orq %rcx,%r14
+ orq %r9,%r15
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ movq 64(%rsp),%rdx
+ leaq 64(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2x
+
+ leaq 32(%rsp),%rbx
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_add_tox
+
+ movq 96(%rsp),%rdx
+ leaq 96(%rsp),%rbx
+ movq 0+0(%rsp),%r9
+ movq 8+0(%rsp),%r10
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r11
+ movq 24+0(%rsp),%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2x
+
+ movq 0+32(%rsp),%rdx
+ movq 8+32(%rsp),%r14
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r15
+ movq 24+32(%rsp),%r8
+.byte 102,72,15,126,199
+ call __ecp_nistz256_sqr_montx
+
+ leaq 128(%rsp),%rbx
+ movq %r14,%r8
+ movq %r15,%r9
+ movq %rsi,%r14
+ movq %rbp,%r15
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+0(%rsp),%rax
+ movq 0+8(%rsp),%rbp
+ movq 0+16(%rsp),%rcx
+ movq 0+24(%rsp),%r10
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_subx
+
+ movq 32(%rsp),%rdx
+ leaq 32(%rsp),%rbx
+ movq %r12,%r14
+ xorl %ecx,%ecx
+ movq %r12,0+0(%rsp)
+ movq %r13,%r10
+ movq %r13,0+8(%rsp)
+ cmovzq %r8,%r11
+ movq %r8,0+16(%rsp)
+ leaq 0-128(%rsp),%rsi
+ cmovzq %r9,%r12
+ movq %r9,0+24(%rsp)
+ movq %r14,%r9
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+.byte 102,72,15,126,203
+.byte 102,72,15,126,207
+ call __ecp_nistz256_sub_fromx
+
+ leaq 160+56(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpoint_doublex_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex
+.type ecp_nistz256_point_addx,@function
+.align 32
+ecp_nistz256_point_addx:
+.cfi_startproc
+.Lpoint_addx:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $576+8,%rsp
+.cfi_adjust_cfa_offset 32*18+8
+.Lpoint_addx_body:
+
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqu 64(%rsi),%xmm4
+ movdqu 80(%rsi),%xmm5
+ movq %rsi,%rbx
+ movq %rdx,%rsi
+ movdqa %xmm0,384(%rsp)
+ movdqa %xmm1,384+16(%rsp)
+ movdqa %xmm2,416(%rsp)
+ movdqa %xmm3,416+16(%rsp)
+ movdqa %xmm4,448(%rsp)
+ movdqa %xmm5,448+16(%rsp)
+ por %xmm4,%xmm5
+
+ movdqu 0(%rsi),%xmm0
+ pshufd $0xb1,%xmm5,%xmm3
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ por %xmm3,%xmm5
+ movdqu 48(%rsi),%xmm3
+ movq 64+0(%rsi),%rdx
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ movdqa %xmm0,480(%rsp)
+ pshufd $0x1e,%xmm5,%xmm4
+ movdqa %xmm1,480+16(%rsp)
+ movdqu 64(%rsi),%xmm0
+ movdqu 80(%rsi),%xmm1
+ movdqa %xmm2,512(%rsp)
+ movdqa %xmm3,512+16(%rsp)
+ por %xmm4,%xmm5
+ pxor %xmm4,%xmm4
+ por %xmm0,%xmm1
+.byte 102,72,15,110,199
+
+ leaq 64-128(%rsi),%rsi
+ movq %rdx,544+0(%rsp)
+ movq %r14,544+8(%rsp)
+ movq %r15,544+16(%rsp)
+ movq %r8,544+24(%rsp)
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ pcmpeqd %xmm4,%xmm5
+ pshufd $0xb1,%xmm1,%xmm4
+ por %xmm1,%xmm4
+ pshufd $0,%xmm5,%xmm5
+ pshufd $0x1e,%xmm4,%xmm3
+ por %xmm3,%xmm4
+ pxor %xmm3,%xmm3
+ pcmpeqd %xmm3,%xmm4
+ pshufd $0,%xmm4,%xmm4
+ movq 64+0(%rbx),%rdx
+ movq 64+8(%rbx),%r14
+ movq 64+16(%rbx),%r15
+ movq 64+24(%rbx),%r8
+.byte 102,72,15,110,203
+
+ leaq 64-128(%rbx),%rsi
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 544(%rsp),%rdx
+ leaq 544(%rsp),%rbx
+ movq 0+96(%rsp),%r9
+ movq 8+96(%rsp),%r10
+ leaq -128+96(%rsp),%rsi
+ movq 16+96(%rsp),%r11
+ movq 24+96(%rsp),%r12
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 448(%rsp),%rdx
+ leaq 448(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 416(%rsp),%rdx
+ leaq 416(%rsp),%rbx
+ movq 0+224(%rsp),%r9
+ movq 8+224(%rsp),%r10
+ leaq -128+224(%rsp),%rsi
+ movq 16+224(%rsp),%r11
+ movq 24+224(%rsp),%r12
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 512(%rsp),%rdx
+ leaq 512(%rsp),%rbx
+ movq 0+256(%rsp),%r9
+ movq 8+256(%rsp),%r10
+ leaq -128+256(%rsp),%rsi
+ movq 16+256(%rsp),%r11
+ movq 24+256(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 224(%rsp),%rbx
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ orq %r13,%r12
+ movdqa %xmm4,%xmm2
+ orq %r8,%r12
+ orq %r9,%r12
+ por %xmm5,%xmm2
+.byte 102,73,15,110,220
+
+ movq 384(%rsp),%rdx
+ leaq 384(%rsp),%rbx
+ movq 0+96(%rsp),%r9
+ movq 8+96(%rsp),%r10
+ leaq -128+96(%rsp),%rsi
+ movq 16+96(%rsp),%r11
+ movq 24+96(%rsp),%r12
+ leaq 160(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 480(%rsp),%rdx
+ leaq 480(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 160(%rsp),%rbx
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ orq %r13,%r12
+ orq %r8,%r12
+ orq %r9,%r12
+
+.byte 102,73,15,126,208
+.byte 102,73,15,126,217
+
+ orq %r8,%r12
+ orq %r9,%r12
+
+
+.byte 0x3e
+ jnz .Ladd_proceedx
+
+.Ladd_doublex:
+.byte 102,72,15,126,206
+.byte 102,72,15,126,199
+ addq $416,%rsp
+.cfi_adjust_cfa_offset -416
+ jmp .Lpoint_double_shortcutx
+.cfi_adjust_cfa_offset 416
+
+.align 32
+.Ladd_proceedx:
+ movq 0+64(%rsp),%rdx
+ movq 8+64(%rsp),%r14
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r15
+ movq 24+64(%rsp),%r8
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 448(%rsp),%rdx
+ leaq 448(%rsp),%rbx
+ movq 0+0(%rsp),%r9
+ movq 8+0(%rsp),%r10
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r11
+ movq 24+0(%rsp),%r12
+ leaq 352(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 0+0(%rsp),%rdx
+ movq 8+0(%rsp),%r14
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 544(%rsp),%rdx
+ leaq 544(%rsp),%rbx
+ movq 0+352(%rsp),%r9
+ movq 8+352(%rsp),%r10
+ leaq -128+352(%rsp),%rsi
+ movq 16+352(%rsp),%r11
+ movq 24+352(%rsp),%r12
+ leaq 352(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 0(%rsp),%rdx
+ leaq 0(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 160(%rsp),%rdx
+ leaq 160(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+
+
+
+ xorq %r11,%r11
+ addq %r12,%r12
+ leaq 96(%rsp),%rsi
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ movq 0(%rsi),%rax
+ cmovcq %rbp,%r13
+ movq 8(%rsi),%rbp
+ cmovcq %rcx,%r8
+ movq 16(%rsi),%rcx
+ cmovcq %r10,%r9
+ movq 24(%rsi),%r10
+
+ call __ecp_nistz256_subx
+
+ leaq 128(%rsp),%rbx
+ leaq 288(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 192+0(%rsp),%rax
+ movq 192+8(%rsp),%rbp
+ movq 192+16(%rsp),%rcx
+ movq 192+24(%rsp),%r10
+ leaq 320(%rsp),%rdi
+
+ call __ecp_nistz256_subx
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+ movq 128(%rsp),%rdx
+ leaq 128(%rsp),%rbx
+ movq 0+224(%rsp),%r9
+ movq 8+224(%rsp),%r10
+ leaq -128+224(%rsp),%rsi
+ movq 16+224(%rsp),%r11
+ movq 24+224(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 320(%rsp),%rdx
+ leaq 320(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 320(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 256(%rsp),%rbx
+ leaq 320(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+.byte 102,72,15,126,199
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 352(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 352+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 544(%rsp),%xmm2
+ pand 544+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 448(%rsp),%xmm2
+ pand 448+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,64(%rdi)
+ movdqu %xmm3,80(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 288(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 288+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 480(%rsp),%xmm2
+ pand 480+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 384(%rsp),%xmm2
+ pand 384+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 320(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 320+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 512(%rsp),%xmm2
+ pand 512+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 416(%rsp),%xmm2
+ pand 416+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+
+.Ladd_donex:
+ leaq 576+56(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpoint_addx_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_point_addx,.-ecp_nistz256_point_addx
+.type ecp_nistz256_point_add_affinex,@function
+.align 32
+ecp_nistz256_point_add_affinex:
+.cfi_startproc
+.Lpoint_add_affinex:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $480+8,%rsp
+.cfi_adjust_cfa_offset 32*15+8
+.Ladd_affinex_body:
+
+ movdqu 0(%rsi),%xmm0
+ movq %rdx,%rbx
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqu 64(%rsi),%xmm4
+ movdqu 80(%rsi),%xmm5
+ movq 64+0(%rsi),%rdx
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ movdqa %xmm0,320(%rsp)
+ movdqa %xmm1,320+16(%rsp)
+ movdqa %xmm2,352(%rsp)
+ movdqa %xmm3,352+16(%rsp)
+ movdqa %xmm4,384(%rsp)
+ movdqa %xmm5,384+16(%rsp)
+ por %xmm4,%xmm5
+
+ movdqu 0(%rbx),%xmm0
+ pshufd $0xb1,%xmm5,%xmm3
+ movdqu 16(%rbx),%xmm1
+ movdqu 32(%rbx),%xmm2
+ por %xmm3,%xmm5
+ movdqu 48(%rbx),%xmm3
+ movdqa %xmm0,416(%rsp)
+ pshufd $0x1e,%xmm5,%xmm4
+ movdqa %xmm1,416+16(%rsp)
+ por %xmm0,%xmm1
+.byte 102,72,15,110,199
+ movdqa %xmm2,448(%rsp)
+ movdqa %xmm3,448+16(%rsp)
+ por %xmm2,%xmm3
+ por %xmm4,%xmm5
+ pxor %xmm4,%xmm4
+ por %xmm1,%xmm3
+
+ leaq 64-128(%rsi),%rsi
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ pcmpeqd %xmm4,%xmm5
+ pshufd $0xb1,%xmm3,%xmm4
+ movq 0(%rbx),%rdx
+
+ movq %r12,%r9
+ por %xmm3,%xmm4
+ pshufd $0,%xmm5,%xmm5
+ pshufd $0x1e,%xmm4,%xmm3
+ movq %r13,%r10
+ por %xmm3,%xmm4
+ pxor %xmm3,%xmm3
+ movq %r14,%r11
+ pcmpeqd %xmm3,%xmm4
+ pshufd $0,%xmm4,%xmm4
+
+ leaq 32-128(%rsp),%rsi
+ movq %r15,%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 320(%rsp),%rbx
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 384(%rsp),%rdx
+ leaq 384(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 384(%rsp),%rdx
+ leaq 384(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 288(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 448(%rsp),%rdx
+ leaq 448(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 352(%rsp),%rbx
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+64(%rsp),%rdx
+ movq 8+64(%rsp),%r14
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r15
+ movq 24+64(%rsp),%r8
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 0+96(%rsp),%rdx
+ movq 8+96(%rsp),%r14
+ leaq -128+96(%rsp),%rsi
+ movq 16+96(%rsp),%r15
+ movq 24+96(%rsp),%r8
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 128(%rsp),%rdx
+ leaq 128(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 160(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 320(%rsp),%rdx
+ leaq 320(%rsp),%rbx
+ movq 0+128(%rsp),%r9
+ movq 8+128(%rsp),%r10
+ leaq -128+128(%rsp),%rsi
+ movq 16+128(%rsp),%r11
+ movq 24+128(%rsp),%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+
+
+
+ xorq %r11,%r11
+ addq %r12,%r12
+ leaq 192(%rsp),%rsi
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ movq 0(%rsi),%rax
+ cmovcq %rbp,%r13
+ movq 8(%rsi),%rbp
+ cmovcq %rcx,%r8
+ movq 16(%rsi),%rcx
+ cmovcq %r10,%r9
+ movq 24(%rsi),%r10
+
+ call __ecp_nistz256_subx
+
+ leaq 160(%rsp),%rbx
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+0(%rsp),%rax
+ movq 0+8(%rsp),%rbp
+ movq 0+16(%rsp),%rcx
+ movq 0+24(%rsp),%r10
+ leaq 64(%rsp),%rdi
+
+ call __ecp_nistz256_subx
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+ movq 352(%rsp),%rdx
+ leaq 352(%rsp),%rbx
+ movq 0+160(%rsp),%r9
+ movq 8+160(%rsp),%r10
+ leaq -128+160(%rsp),%rsi
+ movq 16+160(%rsp),%r11
+ movq 24+160(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 96(%rsp),%rdx
+ leaq 96(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 32(%rsp),%rbx
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+.byte 102,72,15,126,199
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 288(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 288+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand .LONE_mont(%rip),%xmm2
+ pand .LONE_mont+16(%rip),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 384(%rsp),%xmm2
+ pand 384+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,64(%rdi)
+ movdqu %xmm3,80(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 224(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 224+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 416(%rsp),%xmm2
+ pand 416+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 320(%rsp),%xmm2
+ pand 320+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 256(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 256+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 448(%rsp),%xmm2
+ pand 448+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 352(%rsp),%xmm2
+ pand 352+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+
+ leaq 480+56(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Ladd_affinex_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex
diff --git a/secure/lib/libcrypto/amd64/ghash-x86_64.S b/secure/lib/libcrypto/amd64/ghash-x86_64.S
index 55ad7db1f240..078353528d5f 100644
--- a/secure/lib/libcrypto/amd64/ghash-x86_64.S
+++ b/secure/lib/libcrypto/amd64/ghash-x86_64.S
@@ -1304,7 +1304,108 @@ gcm_ghash_clmul:
.align 32
gcm_init_avx:
.cfi_startproc
- jmp .L_init_clmul
+ vzeroupper
+
+ vmovdqu (%rsi),%xmm2
+ vpshufd $78,%xmm2,%xmm2
+
+
+ vpshufd $255,%xmm2,%xmm4
+ vpsrlq $63,%xmm2,%xmm3
+ vpsllq $1,%xmm2,%xmm2
+ vpxor %xmm5,%xmm5,%xmm5
+ vpcmpgtd %xmm4,%xmm5,%xmm5
+ vpslldq $8,%xmm3,%xmm3
+ vpor %xmm3,%xmm2,%xmm2
+
+
+ vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vpunpckhqdq %xmm2,%xmm2,%xmm6
+ vmovdqa %xmm2,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ movq $4,%r10
+ jmp .Linit_start_avx
+.align 32
+.Linit_loop_avx:
+ vpalignr $8,%xmm3,%xmm4,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+.Linit_start_avx:
+ vmovdqa %xmm0,%xmm5
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+ vpshufd $78,%xmm5,%xmm3
+ vpshufd $78,%xmm0,%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqu %xmm5,0(%rdi)
+ vpxor %xmm0,%xmm4,%xmm4
+ vmovdqu %xmm0,16(%rdi)
+ leaq 48(%rdi),%rdi
+ subq $1,%r10
+ jnz .Linit_loop_avx
+
+ vpalignr $8,%xmm4,%xmm3,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+
+ vzeroupper
+ .byte 0xf3,0xc3
.cfi_endproc
.size gcm_init_avx,.-gcm_init_avx
.globl gcm_gmult_avx
@@ -1320,7 +1421,377 @@ gcm_gmult_avx:
.align 32
gcm_ghash_avx:
.cfi_startproc
- jmp .L_ghash_clmul
+ vzeroupper
+
+ vmovdqu (%rdi),%xmm10
+ leaq .L0x1c2_polynomial(%rip),%r10
+ leaq 64(%rsi),%rsi
+ vmovdqu .Lbswap_mask(%rip),%xmm13
+ vpshufb %xmm13,%xmm10,%xmm10
+ cmpq $0x80,%rcx
+ jb .Lshort_avx
+ subq $0x80,%rcx
+
+ vmovdqu 112(%rdx),%xmm14
+ vmovdqu 0-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vmovdqu 32-64(%rsi),%xmm7
+
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm14,%xmm9,%xmm9
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 80(%rdx),%xmm14
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 48-64(%rsi),%xmm6
+ vpxor %xmm14,%xmm9,%xmm9
+ vmovdqu 64(%rdx),%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 48(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 32(%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 16(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu (%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+
+ leaq 128(%rdx),%rdx
+ cmpq $0x80,%rcx
+ jb .Ltail_avx
+
+ vpxor %xmm10,%xmm15,%xmm15
+ subq $0x80,%rcx
+ jmp .Loop8x_avx
+
+.align 32
+.Loop8x_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 112(%rdx),%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpxor %xmm15,%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
+ vmovdqu 0-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
+ vmovdqu 32-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm3,%xmm10,%xmm10
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vxorps %xmm4,%xmm11,%xmm11
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm5,%xmm12,%xmm12
+ vxorps %xmm15,%xmm8,%xmm8
+
+ vmovdqu 80(%rdx),%xmm14
+ vpxor %xmm10,%xmm12,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm11,%xmm12,%xmm12
+ vpslldq $8,%xmm12,%xmm9
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vpsrldq $8,%xmm12,%xmm12
+ vpxor %xmm9,%xmm10,%xmm10
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vxorps %xmm12,%xmm11,%xmm11
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 64(%rdx),%xmm15
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vxorps %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vmovdqu 48(%rdx),%xmm14
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 32(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+ vxorps %xmm12,%xmm10,%xmm10
+
+ vmovdqu 16(%rdx),%xmm14
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vxorps %xmm11,%xmm12,%xmm12
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu (%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm12,%xmm15,%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+ vpxor %xmm10,%xmm15,%xmm15
+
+ leaq 128(%rdx),%rdx
+ subq $0x80,%rcx
+ jnc .Loop8x_avx
+
+ addq $0x80,%rcx
+ jmp .Ltail_no_xor_avx
+
+.align 32
+.Lshort_avx:
+ vmovdqu -16(%rdx,%rcx,1),%xmm14
+ leaq (%rdx,%rcx,1),%rdx
+ vmovdqu 0-64(%rsi),%xmm6
+ vmovdqu 32-64(%rsi),%xmm7
+ vpshufb %xmm13,%xmm14,%xmm15
+
+ vmovdqa %xmm0,%xmm3
+ vmovdqa %xmm1,%xmm4
+ vmovdqa %xmm2,%xmm5
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -32(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -48(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 80-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -64(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -80(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 96-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 128-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -96(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -112(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 144-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovq 184-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jmp .Ltail_avx
+
+.align 32
+.Ltail_avx:
+ vpxor %xmm10,%xmm15,%xmm15
+.Ltail_no_xor_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+
+ vmovdqu (%r10),%xmm12
+
+ vpxor %xmm0,%xmm3,%xmm10
+ vpxor %xmm1,%xmm4,%xmm11
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vpxor %xmm10,%xmm5,%xmm5
+ vpxor %xmm11,%xmm5,%xmm5
+ vpslldq $8,%xmm5,%xmm9
+ vpsrldq $8,%xmm5,%xmm5
+ vpxor %xmm9,%xmm10,%xmm10
+ vpxor %xmm5,%xmm11,%xmm11
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm11,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ cmpq $0,%rcx
+ jne .Lshort_avx
+
+ vpshufb %xmm13,%xmm10,%xmm10
+ vmovdqu %xmm10,(%rdi)
+ vzeroupper
+ .byte 0xf3,0xc3
.cfi_endproc
.size gcm_ghash_avx,.-gcm_ghash_avx
.align 64
diff --git a/secure/lib/libcrypto/amd64/poly1305-x86_64.S b/secure/lib/libcrypto/amd64/poly1305-x86_64.S
index d74ee9b45052..c5a1f45fc5de 100644
--- a/secure/lib/libcrypto/amd64/poly1305-x86_64.S
+++ b/secure/lib/libcrypto/amd64/poly1305-x86_64.S
@@ -25,6 +25,15 @@ poly1305_init:
leaq poly1305_blocks(%rip),%r10
leaq poly1305_emit(%rip),%r11
+ movq OPENSSL_ia32cap_P+4(%rip),%r9
+ leaq poly1305_blocks_avx(%rip),%rax
+ leaq poly1305_emit_avx(%rip),%rcx
+ btq $28,%r9
+ cmovcq %rax,%r10
+ cmovcq %rcx,%r11
+ leaq poly1305_blocks_avx2(%rip),%rax
+ btq $37,%r9
+ cmovcq %rax,%r10
movq $0x0ffffffc0fffffff,%rax
movq $0x0ffffffc0ffffffc,%rcx
andq 0(%rsi),%rax
@@ -180,6 +189,1782 @@ poly1305_emit:
.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_emit,.-poly1305_emit
+.type __poly1305_block,@function
+.align 32
+__poly1305_block:
+.cfi_startproc
+ mulq %r14
+ movq %rax,%r9
+ movq %r11,%rax
+ movq %rdx,%r10
+
+ mulq %r14
+ movq %rax,%r14
+ movq %r11,%rax
+ movq %rdx,%r8
+
+ mulq %rbx
+ addq %rax,%r9
+ movq %r13,%rax
+ adcq %rdx,%r10
+
+ mulq %rbx
+ movq %rbp,%rbx
+ addq %rax,%r14
+ adcq %rdx,%r8
+
+ imulq %r13,%rbx
+ addq %rbx,%r9
+ movq %r8,%rbx
+ adcq $0,%r10
+
+ imulq %r11,%rbp
+ addq %r9,%rbx
+ movq $-4,%rax
+ adcq %rbp,%r10
+
+ andq %r10,%rax
+ movq %r10,%rbp
+ shrq $2,%r10
+ andq $3,%rbp
+ addq %r10,%rax
+ addq %rax,%r14
+ adcq $0,%rbx
+ adcq $0,%rbp
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __poly1305_block,.-__poly1305_block
+
+.type __poly1305_init_avx,@function
+.align 32
+__poly1305_init_avx:
+.cfi_startproc
+ movq %r11,%r14
+ movq %r12,%rbx
+ xorq %rbp,%rbp
+
+ leaq 48+64(%rdi),%rdi
+
+ movq %r12,%rax
+ call __poly1305_block
+
+ movl $0x3ffffff,%eax
+ movl $0x3ffffff,%edx
+ movq %r14,%r8
+ andl %r14d,%eax
+ movq %r11,%r9
+ andl %r11d,%edx
+ movl %eax,-64(%rdi)
+ shrq $26,%r8
+ movl %edx,-60(%rdi)
+ shrq $26,%r9
+
+ movl $0x3ffffff,%eax
+ movl $0x3ffffff,%edx
+ andl %r8d,%eax
+ andl %r9d,%edx
+ movl %eax,-48(%rdi)
+ leal (%rax,%rax,4),%eax
+ movl %edx,-44(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ movl %eax,-32(%rdi)
+ shrq $26,%r8
+ movl %edx,-28(%rdi)
+ shrq $26,%r9
+
+ movq %rbx,%rax
+ movq %r12,%rdx
+ shlq $12,%rax
+ shlq $12,%rdx
+ orq %r8,%rax
+ orq %r9,%rdx
+ andl $0x3ffffff,%eax
+ andl $0x3ffffff,%edx
+ movl %eax,-16(%rdi)
+ leal (%rax,%rax,4),%eax
+ movl %edx,-12(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ movl %eax,0(%rdi)
+ movq %rbx,%r8
+ movl %edx,4(%rdi)
+ movq %r12,%r9
+
+ movl $0x3ffffff,%eax
+ movl $0x3ffffff,%edx
+ shrq $14,%r8
+ shrq $14,%r9
+ andl %r8d,%eax
+ andl %r9d,%edx
+ movl %eax,16(%rdi)
+ leal (%rax,%rax,4),%eax
+ movl %edx,20(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ movl %eax,32(%rdi)
+ shrq $26,%r8
+ movl %edx,36(%rdi)
+ shrq $26,%r9
+
+ movq %rbp,%rax
+ shlq $24,%rax
+ orq %rax,%r8
+ movl %r8d,48(%rdi)
+ leaq (%r8,%r8,4),%r8
+ movl %r9d,52(%rdi)
+ leaq (%r9,%r9,4),%r9
+ movl %r8d,64(%rdi)
+ movl %r9d,68(%rdi)
+
+ movq %r12,%rax
+ call __poly1305_block
+
+ movl $0x3ffffff,%eax
+ movq %r14,%r8
+ andl %r14d,%eax
+ shrq $26,%r8
+ movl %eax,-52(%rdi)
+
+ movl $0x3ffffff,%edx
+ andl %r8d,%edx
+ movl %edx,-36(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ shrq $26,%r8
+ movl %edx,-20(%rdi)
+
+ movq %rbx,%rax
+ shlq $12,%rax
+ orq %r8,%rax
+ andl $0x3ffffff,%eax
+ movl %eax,-4(%rdi)
+ leal (%rax,%rax,4),%eax
+ movq %rbx,%r8
+ movl %eax,12(%rdi)
+
+ movl $0x3ffffff,%edx
+ shrq $14,%r8
+ andl %r8d,%edx
+ movl %edx,28(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ shrq $26,%r8
+ movl %edx,44(%rdi)
+
+ movq %rbp,%rax
+ shlq $24,%rax
+ orq %rax,%r8
+ movl %r8d,60(%rdi)
+ leaq (%r8,%r8,4),%r8
+ movl %r8d,76(%rdi)
+
+ movq %r12,%rax
+ call __poly1305_block
+
+ movl $0x3ffffff,%eax
+ movq %r14,%r8
+ andl %r14d,%eax
+ shrq $26,%r8
+ movl %eax,-56(%rdi)
+
+ movl $0x3ffffff,%edx
+ andl %r8d,%edx
+ movl %edx,-40(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ shrq $26,%r8
+ movl %edx,-24(%rdi)
+
+ movq %rbx,%rax
+ shlq $12,%rax
+ orq %r8,%rax
+ andl $0x3ffffff,%eax
+ movl %eax,-8(%rdi)
+ leal (%rax,%rax,4),%eax
+ movq %rbx,%r8
+ movl %eax,8(%rdi)
+
+ movl $0x3ffffff,%edx
+ shrq $14,%r8
+ andl %r8d,%edx
+ movl %edx,24(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ shrq $26,%r8
+ movl %edx,40(%rdi)
+
+ movq %rbp,%rax
+ shlq $24,%rax
+ orq %rax,%r8
+ movl %r8d,56(%rdi)
+ leaq (%r8,%r8,4),%r8
+ movl %r8d,72(%rdi)
+
+ leaq -48-64(%rdi),%rdi
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __poly1305_init_avx,.-__poly1305_init_avx
+
+.type poly1305_blocks_avx,@function
+.align 32
+poly1305_blocks_avx:
+.cfi_startproc
+ movl 20(%rdi),%r8d
+ cmpq $128,%rdx
+ jae .Lblocks_avx
+ testl %r8d,%r8d
+ jz .Lblocks
+
+.Lblocks_avx:
+ andq $-16,%rdx
+ jz .Lno_data_avx
+
+ vzeroupper
+
+ testl %r8d,%r8d
+ jz .Lbase2_64_avx
+
+ testq $31,%rdx
+ jz .Leven_avx
+
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lblocks_avx_body:
+
+ movq %rdx,%r15
+
+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movl 16(%rdi),%ebp
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+
+ movl %r8d,%r14d
+ andq $-2147483648,%r8
+ movq %r9,%r12
+ movl %r9d,%ebx
+ andq $-2147483648,%r9
+
+ shrq $6,%r8
+ shlq $52,%r12
+ addq %r8,%r14
+ shrq $12,%rbx
+ shrq $18,%r9
+ addq %r12,%r14
+ adcq %r9,%rbx
+
+ movq %rbp,%r8
+ shlq $40,%r8
+ shrq $24,%rbp
+ addq %r8,%rbx
+ adcq $0,%rbp
+
+ movq $-4,%r9
+ movq %rbp,%r8
+ andq %rbp,%r9
+ shrq $2,%r8
+ andq $3,%rbp
+ addq %r9,%r8
+ addq %r8,%r14
+ adcq $0,%rbx
+ adcq $0,%rbp
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%rbp
+
+ call __poly1305_block
+
+ testq %rcx,%rcx
+ jz .Lstore_base2_64_avx
+
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r11
+ movq %rbx,%r12
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r11
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r11,%r14
+ shlq $24,%rbp
+ andq $0x3ffffff,%r14
+ shrq $40,%r12
+ andq $0x3ffffff,%rbx
+ orq %r12,%rbp
+
+ subq $16,%r15
+ jz .Lstore_base2_26_avx
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %ebp,%xmm4
+ jmp .Lproceed_avx
+
+.align 32
+.Lstore_base2_64_avx:
+ movq %r14,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rbp,16(%rdi)
+ jmp .Ldone_avx
+
+.align 16
+.Lstore_base2_26_avx:
+ movl %eax,0(%rdi)
+ movl %edx,4(%rdi)
+ movl %r14d,8(%rdi)
+ movl %ebx,12(%rdi)
+ movl %ebp,16(%rdi)
+.align 16
+.Ldone_avx:
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbp
+.cfi_restore %rbp
+ movq 40(%rsp),%rbx
+.cfi_restore %rbx
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lno_data_avx:
+.Lblocks_avx_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+
+.align 32
+.Lbase2_64_avx:
+.cfi_startproc
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lbase2_64_avx_body:
+
+ movq %rdx,%r15
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+ movq 0(%rdi),%r14
+ movq 8(%rdi),%rbx
+ movl 16(%rdi),%ebp
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+ testq $31,%rdx
+ jz .Linit_avx
+
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%rbp
+ subq $16,%r15
+
+ call __poly1305_block
+
+.Linit_avx:
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r8
+ movq %rbx,%r9
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r8
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r8,%r14
+ shlq $24,%rbp
+ andq $0x3ffffff,%r14
+ shrq $40,%r9
+ andq $0x3ffffff,%rbx
+ orq %r9,%rbp
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %ebp,%xmm4
+ movl $1,20(%rdi)
+
+ call __poly1305_init_avx
+
+.Lproceed_avx:
+ movq %r15,%rdx
+
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbp
+.cfi_restore %rbp
+ movq 40(%rsp),%rbx
+.cfi_restore %rbx
+ leaq 48(%rsp),%rax
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lbase2_64_avx_epilogue:
+ jmp .Ldo_avx
+.cfi_endproc
+
+.align 32
+.Leven_avx:
+.cfi_startproc
+ vmovd 0(%rdi),%xmm0
+ vmovd 4(%rdi),%xmm1
+ vmovd 8(%rdi),%xmm2
+ vmovd 12(%rdi),%xmm3
+ vmovd 16(%rdi),%xmm4
+
+.Ldo_avx:
+ leaq -88(%rsp),%r11
+.cfi_def_cfa %r11,0x60
+ subq $0x178,%rsp
+ subq $64,%rdx
+ leaq -32(%rsi),%rax
+ cmovcq %rax,%rsi
+
+ vmovdqu 48(%rdi),%xmm14
+ leaq 112(%rdi),%rdi
+ leaq .Lconst(%rip),%rcx
+
+
+
+ vmovdqu 32(%rsi),%xmm5
+ vmovdqu 48(%rsi),%xmm6
+ vmovdqa 64(%rcx),%xmm15
+
+ vpsrldq $6,%xmm5,%xmm7
+ vpsrldq $6,%xmm6,%xmm8
+ vpunpckhqdq %xmm6,%xmm5,%xmm9
+ vpunpcklqdq %xmm6,%xmm5,%xmm5
+ vpunpcklqdq %xmm8,%xmm7,%xmm8
+
+ vpsrlq $40,%xmm9,%xmm9
+ vpsrlq $26,%xmm5,%xmm6
+ vpand %xmm15,%xmm5,%xmm5
+ vpsrlq $4,%xmm8,%xmm7
+ vpand %xmm15,%xmm6,%xmm6
+ vpsrlq $30,%xmm8,%xmm8
+ vpand %xmm15,%xmm7,%xmm7
+ vpand %xmm15,%xmm8,%xmm8
+ vpor 32(%rcx),%xmm9,%xmm9
+
+ jbe .Lskip_loop_avx
+
+
+ vmovdqu -48(%rdi),%xmm11
+ vmovdqu -32(%rdi),%xmm12
+ vpshufd $0xEE,%xmm14,%xmm13
+ vpshufd $0x44,%xmm14,%xmm10
+ vmovdqa %xmm13,-144(%r11)
+ vmovdqa %xmm10,0(%rsp)
+ vpshufd $0xEE,%xmm11,%xmm14
+ vmovdqu -16(%rdi),%xmm10
+ vpshufd $0x44,%xmm11,%xmm11
+ vmovdqa %xmm14,-128(%r11)
+ vmovdqa %xmm11,16(%rsp)
+ vpshufd $0xEE,%xmm12,%xmm13
+ vmovdqu 0(%rdi),%xmm11
+ vpshufd $0x44,%xmm12,%xmm12
+ vmovdqa %xmm13,-112(%r11)
+ vmovdqa %xmm12,32(%rsp)
+ vpshufd $0xEE,%xmm10,%xmm14
+ vmovdqu 16(%rdi),%xmm12
+ vpshufd $0x44,%xmm10,%xmm10
+ vmovdqa %xmm14,-96(%r11)
+ vmovdqa %xmm10,48(%rsp)
+ vpshufd $0xEE,%xmm11,%xmm13
+ vmovdqu 32(%rdi),%xmm10
+ vpshufd $0x44,%xmm11,%xmm11
+ vmovdqa %xmm13,-80(%r11)
+ vmovdqa %xmm11,64(%rsp)
+ vpshufd $0xEE,%xmm12,%xmm14
+ vmovdqu 48(%rdi),%xmm11
+ vpshufd $0x44,%xmm12,%xmm12
+ vmovdqa %xmm14,-64(%r11)
+ vmovdqa %xmm12,80(%rsp)
+ vpshufd $0xEE,%xmm10,%xmm13
+ vmovdqu 64(%rdi),%xmm12
+ vpshufd $0x44,%xmm10,%xmm10
+ vmovdqa %xmm13,-48(%r11)
+ vmovdqa %xmm10,96(%rsp)
+ vpshufd $0xEE,%xmm11,%xmm14
+ vpshufd $0x44,%xmm11,%xmm11
+ vmovdqa %xmm14,-32(%r11)
+ vmovdqa %xmm11,112(%rsp)
+ vpshufd $0xEE,%xmm12,%xmm13
+ vmovdqa 0(%rsp),%xmm14
+ vpshufd $0x44,%xmm12,%xmm12
+ vmovdqa %xmm13,-16(%r11)
+ vmovdqa %xmm12,128(%rsp)
+
+ jmp .Loop_avx
+
+.align 32
+.Loop_avx:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ vpmuludq %xmm5,%xmm14,%xmm10
+ vpmuludq %xmm6,%xmm14,%xmm11
+ vmovdqa %xmm2,32(%r11)
+ vpmuludq %xmm7,%xmm14,%xmm12
+ vmovdqa 16(%rsp),%xmm2
+ vpmuludq %xmm8,%xmm14,%xmm13
+ vpmuludq %xmm9,%xmm14,%xmm14
+
+ vmovdqa %xmm0,0(%r11)
+ vpmuludq 32(%rsp),%xmm9,%xmm0
+ vmovdqa %xmm1,16(%r11)
+ vpmuludq %xmm8,%xmm2,%xmm1
+ vpaddq %xmm0,%xmm10,%xmm10
+ vpaddq %xmm1,%xmm14,%xmm14
+ vmovdqa %xmm3,48(%r11)
+ vpmuludq %xmm7,%xmm2,%xmm0
+ vpmuludq %xmm6,%xmm2,%xmm1
+ vpaddq %xmm0,%xmm13,%xmm13
+ vmovdqa 48(%rsp),%xmm3
+ vpaddq %xmm1,%xmm12,%xmm12
+ vmovdqa %xmm4,64(%r11)
+ vpmuludq %xmm5,%xmm2,%xmm2
+ vpmuludq %xmm7,%xmm3,%xmm0
+ vpaddq %xmm2,%xmm11,%xmm11
+
+ vmovdqa 64(%rsp),%xmm4
+ vpaddq %xmm0,%xmm14,%xmm14
+ vpmuludq %xmm6,%xmm3,%xmm1
+ vpmuludq %xmm5,%xmm3,%xmm3
+ vpaddq %xmm1,%xmm13,%xmm13
+ vmovdqa 80(%rsp),%xmm2
+ vpaddq %xmm3,%xmm12,%xmm12
+ vpmuludq %xmm9,%xmm4,%xmm0
+ vpmuludq %xmm8,%xmm4,%xmm4
+ vpaddq %xmm0,%xmm11,%xmm11
+ vmovdqa 96(%rsp),%xmm3
+ vpaddq %xmm4,%xmm10,%xmm10
+
+ vmovdqa 128(%rsp),%xmm4
+ vpmuludq %xmm6,%xmm2,%xmm1
+ vpmuludq %xmm5,%xmm2,%xmm2
+ vpaddq %xmm1,%xmm14,%xmm14
+ vpaddq %xmm2,%xmm13,%xmm13
+ vpmuludq %xmm9,%xmm3,%xmm0
+ vpmuludq %xmm8,%xmm3,%xmm1
+ vpaddq %xmm0,%xmm12,%xmm12
+ vmovdqu 0(%rsi),%xmm0
+ vpaddq %xmm1,%xmm11,%xmm11
+ vpmuludq %xmm7,%xmm3,%xmm3
+ vpmuludq %xmm7,%xmm4,%xmm7
+ vpaddq %xmm3,%xmm10,%xmm10
+
+ vmovdqu 16(%rsi),%xmm1
+ vpaddq %xmm7,%xmm11,%xmm11
+ vpmuludq %xmm8,%xmm4,%xmm8
+ vpmuludq %xmm9,%xmm4,%xmm9
+ vpsrldq $6,%xmm0,%xmm2
+ vpaddq %xmm8,%xmm12,%xmm12
+ vpaddq %xmm9,%xmm13,%xmm13
+ vpsrldq $6,%xmm1,%xmm3
+ vpmuludq 112(%rsp),%xmm5,%xmm9
+ vpmuludq %xmm6,%xmm4,%xmm5
+ vpunpckhqdq %xmm1,%xmm0,%xmm4
+ vpaddq %xmm9,%xmm14,%xmm14
+ vmovdqa -144(%r11),%xmm9
+ vpaddq %xmm5,%xmm10,%xmm10
+
+ vpunpcklqdq %xmm1,%xmm0,%xmm0
+ vpunpcklqdq %xmm3,%xmm2,%xmm3
+
+
+ vpsrldq $5,%xmm4,%xmm4
+ vpsrlq $26,%xmm0,%xmm1
+ vpand %xmm15,%xmm0,%xmm0
+ vpsrlq $4,%xmm3,%xmm2
+ vpand %xmm15,%xmm1,%xmm1
+ vpand 0(%rcx),%xmm4,%xmm4
+ vpsrlq $30,%xmm3,%xmm3
+ vpand %xmm15,%xmm2,%xmm2
+ vpand %xmm15,%xmm3,%xmm3
+ vpor 32(%rcx),%xmm4,%xmm4
+
+ vpaddq 0(%r11),%xmm0,%xmm0
+ vpaddq 16(%r11),%xmm1,%xmm1
+ vpaddq 32(%r11),%xmm2,%xmm2
+ vpaddq 48(%r11),%xmm3,%xmm3
+ vpaddq 64(%r11),%xmm4,%xmm4
+
+ leaq 32(%rsi),%rax
+ leaq 64(%rsi),%rsi
+ subq $64,%rdx
+ cmovcq %rax,%rsi
+
+
+
+
+
+
+
+
+
+
+ vpmuludq %xmm0,%xmm9,%xmm5
+ vpmuludq %xmm1,%xmm9,%xmm6
+ vpaddq %xmm5,%xmm10,%xmm10
+ vpaddq %xmm6,%xmm11,%xmm11
+ vmovdqa -128(%r11),%xmm7
+ vpmuludq %xmm2,%xmm9,%xmm5
+ vpmuludq %xmm3,%xmm9,%xmm6
+ vpaddq %xmm5,%xmm12,%xmm12
+ vpaddq %xmm6,%xmm13,%xmm13
+ vpmuludq %xmm4,%xmm9,%xmm9
+ vpmuludq -112(%r11),%xmm4,%xmm5
+ vpaddq %xmm9,%xmm14,%xmm14
+
+ vpaddq %xmm5,%xmm10,%xmm10
+ vpmuludq %xmm2,%xmm7,%xmm6
+ vpmuludq %xmm3,%xmm7,%xmm5
+ vpaddq %xmm6,%xmm13,%xmm13
+ vmovdqa -96(%r11),%xmm8
+ vpaddq %xmm5,%xmm14,%xmm14
+ vpmuludq %xmm1,%xmm7,%xmm6
+ vpmuludq %xmm0,%xmm7,%xmm7
+ vpaddq %xmm6,%xmm12,%xmm12
+ vpaddq %xmm7,%xmm11,%xmm11
+
+ vmovdqa -80(%r11),%xmm9
+ vpmuludq %xmm2,%xmm8,%xmm5
+ vpmuludq %xmm1,%xmm8,%xmm6
+ vpaddq %xmm5,%xmm14,%xmm14
+ vpaddq %xmm6,%xmm13,%xmm13
+ vmovdqa -64(%r11),%xmm7
+ vpmuludq %xmm0,%xmm8,%xmm8
+ vpmuludq %xmm4,%xmm9,%xmm5
+ vpaddq %xmm8,%xmm12,%xmm12
+ vpaddq %xmm5,%xmm11,%xmm11
+ vmovdqa -48(%r11),%xmm8
+ vpmuludq %xmm3,%xmm9,%xmm9
+ vpmuludq %xmm1,%xmm7,%xmm6
+ vpaddq %xmm9,%xmm10,%xmm10
+
+ vmovdqa -16(%r11),%xmm9
+ vpaddq %xmm6,%xmm14,%xmm14
+ vpmuludq %xmm0,%xmm7,%xmm7
+ vpmuludq %xmm4,%xmm8,%xmm5
+ vpaddq %xmm7,%xmm13,%xmm13
+ vpaddq %xmm5,%xmm12,%xmm12
+ vmovdqu 32(%rsi),%xmm5
+ vpmuludq %xmm3,%xmm8,%xmm7
+ vpmuludq %xmm2,%xmm8,%xmm8
+ vpaddq %xmm7,%xmm11,%xmm11
+ vmovdqu 48(%rsi),%xmm6
+ vpaddq %xmm8,%xmm10,%xmm10
+
+ vpmuludq %xmm2,%xmm9,%xmm2
+ vpmuludq %xmm3,%xmm9,%xmm3
+ vpsrldq $6,%xmm5,%xmm7
+ vpaddq %xmm2,%xmm11,%xmm11
+ vpmuludq %xmm4,%xmm9,%xmm4
+ vpsrldq $6,%xmm6,%xmm8
+ vpaddq %xmm3,%xmm12,%xmm2
+ vpaddq %xmm4,%xmm13,%xmm3
+ vpmuludq -32(%r11),%xmm0,%xmm4
+ vpmuludq %xmm1,%xmm9,%xmm0
+ vpunpckhqdq %xmm6,%xmm5,%xmm9
+ vpaddq %xmm4,%xmm14,%xmm4
+ vpaddq %xmm0,%xmm10,%xmm0
+
+ vpunpcklqdq %xmm6,%xmm5,%xmm5
+ vpunpcklqdq %xmm8,%xmm7,%xmm8
+
+
+ vpsrldq $5,%xmm9,%xmm9
+ vpsrlq $26,%xmm5,%xmm6
+ vmovdqa 0(%rsp),%xmm14
+ vpand %xmm15,%xmm5,%xmm5
+ vpsrlq $4,%xmm8,%xmm7
+ vpand %xmm15,%xmm6,%xmm6
+ vpand 0(%rcx),%xmm9,%xmm9
+ vpsrlq $30,%xmm8,%xmm8
+ vpand %xmm15,%xmm7,%xmm7
+ vpand %xmm15,%xmm8,%xmm8
+ vpor 32(%rcx),%xmm9,%xmm9
+
+
+
+
+
+ vpsrlq $26,%xmm3,%xmm13
+ vpand %xmm15,%xmm3,%xmm3
+ vpaddq %xmm13,%xmm4,%xmm4
+
+ vpsrlq $26,%xmm0,%xmm10
+ vpand %xmm15,%xmm0,%xmm0
+ vpaddq %xmm10,%xmm11,%xmm1
+
+ vpsrlq $26,%xmm4,%xmm10
+ vpand %xmm15,%xmm4,%xmm4
+
+ vpsrlq $26,%xmm1,%xmm11
+ vpand %xmm15,%xmm1,%xmm1
+ vpaddq %xmm11,%xmm2,%xmm2
+
+ vpaddq %xmm10,%xmm0,%xmm0
+ vpsllq $2,%xmm10,%xmm10
+ vpaddq %xmm10,%xmm0,%xmm0
+
+ vpsrlq $26,%xmm2,%xmm12
+ vpand %xmm15,%xmm2,%xmm2
+ vpaddq %xmm12,%xmm3,%xmm3
+
+ vpsrlq $26,%xmm0,%xmm10
+ vpand %xmm15,%xmm0,%xmm0
+ vpaddq %xmm10,%xmm1,%xmm1
+
+ vpsrlq $26,%xmm3,%xmm13
+ vpand %xmm15,%xmm3,%xmm3
+ vpaddq %xmm13,%xmm4,%xmm4
+
+ ja .Loop_avx
+
+.Lskip_loop_avx:
+
+
+
+ vpshufd $0x10,%xmm14,%xmm14
+ addq $32,%rdx
+ jnz .Long_tail_avx
+
+ vpaddq %xmm2,%xmm7,%xmm7
+ vpaddq %xmm0,%xmm5,%xmm5
+ vpaddq %xmm1,%xmm6,%xmm6
+ vpaddq %xmm3,%xmm8,%xmm8
+ vpaddq %xmm4,%xmm9,%xmm9
+
+.Long_tail_avx:
+ vmovdqa %xmm2,32(%r11)
+ vmovdqa %xmm0,0(%r11)
+ vmovdqa %xmm1,16(%r11)
+ vmovdqa %xmm3,48(%r11)
+ vmovdqa %xmm4,64(%r11)
+
+
+
+
+
+
+
+ vpmuludq %xmm7,%xmm14,%xmm12
+ vpmuludq %xmm5,%xmm14,%xmm10
+ vpshufd $0x10,-48(%rdi),%xmm2
+ vpmuludq %xmm6,%xmm14,%xmm11
+ vpmuludq %xmm8,%xmm14,%xmm13
+ vpmuludq %xmm9,%xmm14,%xmm14
+
+ vpmuludq %xmm8,%xmm2,%xmm0
+ vpaddq %xmm0,%xmm14,%xmm14
+ vpshufd $0x10,-32(%rdi),%xmm3
+ vpmuludq %xmm7,%xmm2,%xmm1
+ vpaddq %xmm1,%xmm13,%xmm13
+ vpshufd $0x10,-16(%rdi),%xmm4
+ vpmuludq %xmm6,%xmm2,%xmm0
+ vpaddq %xmm0,%xmm12,%xmm12
+ vpmuludq %xmm5,%xmm2,%xmm2
+ vpaddq %xmm2,%xmm11,%xmm11
+ vpmuludq %xmm9,%xmm3,%xmm3
+ vpaddq %xmm3,%xmm10,%xmm10
+
+ vpshufd $0x10,0(%rdi),%xmm2
+ vpmuludq %xmm7,%xmm4,%xmm1
+ vpaddq %xmm1,%xmm14,%xmm14
+ vpmuludq %xmm6,%xmm4,%xmm0
+ vpaddq %xmm0,%xmm13,%xmm13
+ vpshufd $0x10,16(%rdi),%xmm3
+ vpmuludq %xmm5,%xmm4,%xmm4
+ vpaddq %xmm4,%xmm12,%xmm12
+ vpmuludq %xmm9,%xmm2,%xmm1
+ vpaddq %xmm1,%xmm11,%xmm11
+ vpshufd $0x10,32(%rdi),%xmm4
+ vpmuludq %xmm8,%xmm2,%xmm2
+ vpaddq %xmm2,%xmm10,%xmm10
+
+ vpmuludq %xmm6,%xmm3,%xmm0
+ vpaddq %xmm0,%xmm14,%xmm14
+ vpmuludq %xmm5,%xmm3,%xmm3
+ vpaddq %xmm3,%xmm13,%xmm13
+ vpshufd $0x10,48(%rdi),%xmm2
+ vpmuludq %xmm9,%xmm4,%xmm1
+ vpaddq %xmm1,%xmm12,%xmm12
+ vpshufd $0x10,64(%rdi),%xmm3
+ vpmuludq %xmm8,%xmm4,%xmm0
+ vpaddq %xmm0,%xmm11,%xmm11
+ vpmuludq %xmm7,%xmm4,%xmm4
+ vpaddq %xmm4,%xmm10,%xmm10
+
+ vpmuludq %xmm5,%xmm2,%xmm2
+ vpaddq %xmm2,%xmm14,%xmm14
+ vpmuludq %xmm9,%xmm3,%xmm1
+ vpaddq %xmm1,%xmm13,%xmm13
+ vpmuludq %xmm8,%xmm3,%xmm0
+ vpaddq %xmm0,%xmm12,%xmm12
+ vpmuludq %xmm7,%xmm3,%xmm1
+ vpaddq %xmm1,%xmm11,%xmm11
+ vpmuludq %xmm6,%xmm3,%xmm3
+ vpaddq %xmm3,%xmm10,%xmm10
+
+ jz .Lshort_tail_avx
+
+ vmovdqu 0(%rsi),%xmm0
+ vmovdqu 16(%rsi),%xmm1
+
+ vpsrldq $6,%xmm0,%xmm2
+ vpsrldq $6,%xmm1,%xmm3
+ vpunpckhqdq %xmm1,%xmm0,%xmm4
+ vpunpcklqdq %xmm1,%xmm0,%xmm0
+ vpunpcklqdq %xmm3,%xmm2,%xmm3
+
+ vpsrlq $40,%xmm4,%xmm4
+ vpsrlq $26,%xmm0,%xmm1
+ vpand %xmm15,%xmm0,%xmm0
+ vpsrlq $4,%xmm3,%xmm2
+ vpand %xmm15,%xmm1,%xmm1
+ vpsrlq $30,%xmm3,%xmm3
+ vpand %xmm15,%xmm2,%xmm2
+ vpand %xmm15,%xmm3,%xmm3
+ vpor 32(%rcx),%xmm4,%xmm4
+
+ vpshufd $0x32,-64(%rdi),%xmm9
+ vpaddq 0(%r11),%xmm0,%xmm0
+ vpaddq 16(%r11),%xmm1,%xmm1
+ vpaddq 32(%r11),%xmm2,%xmm2
+ vpaddq 48(%r11),%xmm3,%xmm3
+ vpaddq 64(%r11),%xmm4,%xmm4
+
+
+
+
+ vpmuludq %xmm0,%xmm9,%xmm5
+ vpaddq %xmm5,%xmm10,%xmm10
+ vpmuludq %xmm1,%xmm9,%xmm6
+ vpaddq %xmm6,%xmm11,%xmm11
+ vpmuludq %xmm2,%xmm9,%xmm5
+ vpaddq %xmm5,%xmm12,%xmm12
+ vpshufd $0x32,-48(%rdi),%xmm7
+ vpmuludq %xmm3,%xmm9,%xmm6
+ vpaddq %xmm6,%xmm13,%xmm13
+ vpmuludq %xmm4,%xmm9,%xmm9
+ vpaddq %xmm9,%xmm14,%xmm14
+
+ vpmuludq %xmm3,%xmm7,%xmm5
+ vpaddq %xmm5,%xmm14,%xmm14
+ vpshufd $0x32,-32(%rdi),%xmm8
+ vpmuludq %xmm2,%xmm7,%xmm6
+ vpaddq %xmm6,%xmm13,%xmm13
+ vpshufd $0x32,-16(%rdi),%xmm9
+ vpmuludq %xmm1,%xmm7,%xmm5
+ vpaddq %xmm5,%xmm12,%xmm12
+ vpmuludq %xmm0,%xmm7,%xmm7
+ vpaddq %xmm7,%xmm11,%xmm11
+ vpmuludq %xmm4,%xmm8,%xmm8
+ vpaddq %xmm8,%xmm10,%xmm10
+
+ vpshufd $0x32,0(%rdi),%xmm7
+ vpmuludq %xmm2,%xmm9,%xmm6
+ vpaddq %xmm6,%xmm14,%xmm14
+ vpmuludq %xmm1,%xmm9,%xmm5
+ vpaddq %xmm5,%xmm13,%xmm13
+ vpshufd $0x32,16(%rdi),%xmm8
+ vpmuludq %xmm0,%xmm9,%xmm9
+ vpaddq %xmm9,%xmm12,%xmm12
+ vpmuludq %xmm4,%xmm7,%xmm6
+ vpaddq %xmm6,%xmm11,%xmm11
+ vpshufd $0x32,32(%rdi),%xmm9
+ vpmuludq %xmm3,%xmm7,%xmm7
+ vpaddq %xmm7,%xmm10,%xmm10
+
+ vpmuludq %xmm1,%xmm8,%xmm5
+ vpaddq %xmm5,%xmm14,%xmm14
+ vpmuludq %xmm0,%xmm8,%xmm8
+ vpaddq %xmm8,%xmm13,%xmm13
+ vpshufd $0x32,48(%rdi),%xmm7
+ vpmuludq %xmm4,%xmm9,%xmm6
+ vpaddq %xmm6,%xmm12,%xmm12
+ vpshufd $0x32,64(%rdi),%xmm8
+ vpmuludq %xmm3,%xmm9,%xmm5
+ vpaddq %xmm5,%xmm11,%xmm11
+ vpmuludq %xmm2,%xmm9,%xmm9
+ vpaddq %xmm9,%xmm10,%xmm10
+
+ vpmuludq %xmm0,%xmm7,%xmm7
+ vpaddq %xmm7,%xmm14,%xmm14
+ vpmuludq %xmm4,%xmm8,%xmm6
+ vpaddq %xmm6,%xmm13,%xmm13
+ vpmuludq %xmm3,%xmm8,%xmm5
+ vpaddq %xmm5,%xmm12,%xmm12
+ vpmuludq %xmm2,%xmm8,%xmm6
+ vpaddq %xmm6,%xmm11,%xmm11
+ vpmuludq %xmm1,%xmm8,%xmm8
+ vpaddq %xmm8,%xmm10,%xmm10
+
+.Lshort_tail_avx:
+
+
+
+ vpsrldq $8,%xmm14,%xmm9
+ vpsrldq $8,%xmm13,%xmm8
+ vpsrldq $8,%xmm11,%xmm6
+ vpsrldq $8,%xmm10,%xmm5
+ vpsrldq $8,%xmm12,%xmm7
+ vpaddq %xmm8,%xmm13,%xmm13
+ vpaddq %xmm9,%xmm14,%xmm14
+ vpaddq %xmm5,%xmm10,%xmm10
+ vpaddq %xmm6,%xmm11,%xmm11
+ vpaddq %xmm7,%xmm12,%xmm12
+
+
+
+
+ vpsrlq $26,%xmm13,%xmm3
+ vpand %xmm15,%xmm13,%xmm13
+ vpaddq %xmm3,%xmm14,%xmm14
+
+ vpsrlq $26,%xmm10,%xmm0
+ vpand %xmm15,%xmm10,%xmm10
+ vpaddq %xmm0,%xmm11,%xmm11
+
+ vpsrlq $26,%xmm14,%xmm4
+ vpand %xmm15,%xmm14,%xmm14
+
+ vpsrlq $26,%xmm11,%xmm1
+ vpand %xmm15,%xmm11,%xmm11
+ vpaddq %xmm1,%xmm12,%xmm12
+
+ vpaddq %xmm4,%xmm10,%xmm10
+ vpsllq $2,%xmm4,%xmm4
+ vpaddq %xmm4,%xmm10,%xmm10
+
+ vpsrlq $26,%xmm12,%xmm2
+ vpand %xmm15,%xmm12,%xmm12
+ vpaddq %xmm2,%xmm13,%xmm13
+
+ vpsrlq $26,%xmm10,%xmm0
+ vpand %xmm15,%xmm10,%xmm10
+ vpaddq %xmm0,%xmm11,%xmm11
+
+ vpsrlq $26,%xmm13,%xmm3
+ vpand %xmm15,%xmm13,%xmm13
+ vpaddq %xmm3,%xmm14,%xmm14
+
+ vmovd %xmm10,-112(%rdi)
+ vmovd %xmm11,-108(%rdi)
+ vmovd %xmm12,-104(%rdi)
+ vmovd %xmm13,-100(%rdi)
+ vmovd %xmm14,-96(%rdi)
+ leaq 88(%r11),%rsp
+.cfi_def_cfa %rsp,8
+ vzeroupper
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size poly1305_blocks_avx,.-poly1305_blocks_avx
+
+.type poly1305_emit_avx,@function
+.align 32
+poly1305_emit_avx:
+.cfi_startproc
+ cmpl $0,20(%rdi)
+ je .Lemit
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ecx
+ movl 8(%rdi),%r8d
+ movl 12(%rdi),%r11d
+ movl 16(%rdi),%r10d
+
+ shlq $26,%rcx
+ movq %r8,%r9
+ shlq $52,%r8
+ addq %rcx,%rax
+ shrq $12,%r9
+ addq %rax,%r8
+ adcq $0,%r9
+
+ shlq $14,%r11
+ movq %r10,%rax
+ shrq $24,%r10
+ addq %r11,%r9
+ shlq $40,%rax
+ addq %rax,%r9
+ adcq $0,%r10
+
+ movq %r10,%rax
+ movq %r10,%rcx
+ andq $3,%r10
+ shrq $2,%rax
+ andq $-4,%rcx
+ addq %rcx,%rax
+ addq %rax,%r8
+ adcq $0,%r9
+ adcq $0,%r10
+
+ movq %r8,%rax
+ addq $5,%r8
+ movq %r9,%rcx
+ adcq $0,%r9
+ adcq $0,%r10
+ shrq $2,%r10
+ cmovnzq %r8,%rax
+ cmovnzq %r9,%rcx
+
+ addq 0(%rdx),%rax
+ adcq 8(%rdx),%rcx
+ movq %rax,0(%rsi)
+ movq %rcx,8(%rsi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size poly1305_emit_avx,.-poly1305_emit_avx
+.type poly1305_blocks_avx2,@function
+.align 32
+poly1305_blocks_avx2:
+.cfi_startproc
+ movl 20(%rdi),%r8d
+ cmpq $128,%rdx
+ jae .Lblocks_avx2
+ testl %r8d,%r8d
+ jz .Lblocks
+
+.Lblocks_avx2:
+ andq $-16,%rdx
+ jz .Lno_data_avx2
+
+ vzeroupper
+
+ testl %r8d,%r8d
+ jz .Lbase2_64_avx2
+
+ testq $63,%rdx
+ jz .Leven_avx2
+
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lblocks_avx2_body:
+
+ movq %rdx,%r15
+
+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movl 16(%rdi),%ebp
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+
+ movl %r8d,%r14d
+ andq $-2147483648,%r8
+ movq %r9,%r12
+ movl %r9d,%ebx
+ andq $-2147483648,%r9
+
+ shrq $6,%r8
+ shlq $52,%r12
+ addq %r8,%r14
+ shrq $12,%rbx
+ shrq $18,%r9
+ addq %r12,%r14
+ adcq %r9,%rbx
+
+ movq %rbp,%r8
+ shlq $40,%r8
+ shrq $24,%rbp
+ addq %r8,%rbx
+ adcq $0,%rbp
+
+ movq $-4,%r9
+ movq %rbp,%r8
+ andq %rbp,%r9
+ shrq $2,%r8
+ andq $3,%rbp
+ addq %r9,%r8
+ addq %r8,%r14
+ adcq $0,%rbx
+ adcq $0,%rbp
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+.Lbase2_26_pre_avx2:
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%rbp
+ subq $16,%r15
+
+ call __poly1305_block
+ movq %r12,%rax
+
+ testq $63,%r15
+ jnz .Lbase2_26_pre_avx2
+
+ testq %rcx,%rcx
+ jz .Lstore_base2_64_avx2
+
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r11
+ movq %rbx,%r12
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r11
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r11,%r14
+ shlq $24,%rbp
+ andq $0x3ffffff,%r14
+ shrq $40,%r12
+ andq $0x3ffffff,%rbx
+ orq %r12,%rbp
+
+ testq %r15,%r15
+ jz .Lstore_base2_26_avx2
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %ebp,%xmm4
+ jmp .Lproceed_avx2
+
+.align 32
+.Lstore_base2_64_avx2:
+ movq %r14,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rbp,16(%rdi)
+ jmp .Ldone_avx2
+
+.align 16
+.Lstore_base2_26_avx2:
+ movl %eax,0(%rdi)
+ movl %edx,4(%rdi)
+ movl %r14d,8(%rdi)
+ movl %ebx,12(%rdi)
+ movl %ebp,16(%rdi)
+.align 16
+.Ldone_avx2:
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbp
+.cfi_restore %rbp
+ movq 40(%rsp),%rbx
+.cfi_restore %rbx
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lno_data_avx2:
+.Lblocks_avx2_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+
+.align 32
+.Lbase2_64_avx2:
+.cfi_startproc
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lbase2_64_avx2_body:
+
+ movq %rdx,%r15
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+ movq 0(%rdi),%r14
+ movq 8(%rdi),%rbx
+ movl 16(%rdi),%ebp
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+ testq $63,%rdx
+ jz .Linit_avx2
+
+.Lbase2_64_pre_avx2:
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%rbp
+ subq $16,%r15
+
+ call __poly1305_block
+ movq %r12,%rax
+
+ testq $63,%r15
+ jnz .Lbase2_64_pre_avx2
+
+.Linit_avx2:
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r8
+ movq %rbx,%r9
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r8
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r8,%r14
+ shlq $24,%rbp
+ andq $0x3ffffff,%r14
+ shrq $40,%r9
+ andq $0x3ffffff,%rbx
+ orq %r9,%rbp
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %ebp,%xmm4
+ movl $1,20(%rdi)
+
+ call __poly1305_init_avx
+
+.Lproceed_avx2:
+ movq %r15,%rdx
+ movl OPENSSL_ia32cap_P+8(%rip),%r10d
+ movl $3221291008,%r11d
+
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbp
+.cfi_restore %rbp
+ movq 40(%rsp),%rbx
+.cfi_restore %rbx
+ leaq 48(%rsp),%rax
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lbase2_64_avx2_epilogue:
+ jmp .Ldo_avx2
+.cfi_endproc
+
+.align 32
+.Leven_avx2:
+.cfi_startproc
+ movl OPENSSL_ia32cap_P+8(%rip),%r10d
+ vmovd 0(%rdi),%xmm0
+ vmovd 4(%rdi),%xmm1
+ vmovd 8(%rdi),%xmm2
+ vmovd 12(%rdi),%xmm3
+ vmovd 16(%rdi),%xmm4
+
+.Ldo_avx2:
+ leaq -8(%rsp),%r11
+.cfi_def_cfa %r11,16
+ subq $0x128,%rsp
+ leaq .Lconst(%rip),%rcx
+ leaq 48+64(%rdi),%rdi
+ vmovdqa 96(%rcx),%ymm7
+
+
+ vmovdqu -64(%rdi),%xmm9
+ andq $-512,%rsp
+ vmovdqu -48(%rdi),%xmm10
+ vmovdqu -32(%rdi),%xmm6
+ vmovdqu -16(%rdi),%xmm11
+ vmovdqu 0(%rdi),%xmm12
+ vmovdqu 16(%rdi),%xmm13
+ leaq 144(%rsp),%rax
+ vmovdqu 32(%rdi),%xmm14
+ vpermd %ymm9,%ymm7,%ymm9
+ vmovdqu 48(%rdi),%xmm15
+ vpermd %ymm10,%ymm7,%ymm10
+ vmovdqu 64(%rdi),%xmm5
+ vpermd %ymm6,%ymm7,%ymm6
+ vmovdqa %ymm9,0(%rsp)
+ vpermd %ymm11,%ymm7,%ymm11
+ vmovdqa %ymm10,32-144(%rax)
+ vpermd %ymm12,%ymm7,%ymm12
+ vmovdqa %ymm6,64-144(%rax)
+ vpermd %ymm13,%ymm7,%ymm13
+ vmovdqa %ymm11,96-144(%rax)
+ vpermd %ymm14,%ymm7,%ymm14
+ vmovdqa %ymm12,128-144(%rax)
+ vpermd %ymm15,%ymm7,%ymm15
+ vmovdqa %ymm13,160-144(%rax)
+ vpermd %ymm5,%ymm7,%ymm5
+ vmovdqa %ymm14,192-144(%rax)
+ vmovdqa %ymm15,224-144(%rax)
+ vmovdqa %ymm5,256-144(%rax)
+ vmovdqa 64(%rcx),%ymm5
+
+
+
+ vmovdqu 0(%rsi),%xmm7
+ vmovdqu 16(%rsi),%xmm8
+ vinserti128 $1,32(%rsi),%ymm7,%ymm7
+ vinserti128 $1,48(%rsi),%ymm8,%ymm8
+ leaq 64(%rsi),%rsi
+
+ vpsrldq $6,%ymm7,%ymm9
+ vpsrldq $6,%ymm8,%ymm10
+ vpunpckhqdq %ymm8,%ymm7,%ymm6
+ vpunpcklqdq %ymm10,%ymm9,%ymm9
+ vpunpcklqdq %ymm8,%ymm7,%ymm7
+
+ vpsrlq $30,%ymm9,%ymm10
+ vpsrlq $4,%ymm9,%ymm9
+ vpsrlq $26,%ymm7,%ymm8
+ vpsrlq $40,%ymm6,%ymm6
+ vpand %ymm5,%ymm9,%ymm9
+ vpand %ymm5,%ymm7,%ymm7
+ vpand %ymm5,%ymm8,%ymm8
+ vpand %ymm5,%ymm10,%ymm10
+ vpor 32(%rcx),%ymm6,%ymm6
+
+ vpaddq %ymm2,%ymm9,%ymm2
+ subq $64,%rdx
+ jz .Ltail_avx2
+ jmp .Loop_avx2
+
+.align 32
+.Loop_avx2:
+
+
+
+
+
+
+
+
+ vpaddq %ymm0,%ymm7,%ymm0
+ vmovdqa 0(%rsp),%ymm7
+ vpaddq %ymm1,%ymm8,%ymm1
+ vmovdqa 32(%rsp),%ymm8
+ vpaddq %ymm3,%ymm10,%ymm3
+ vmovdqa 96(%rsp),%ymm9
+ vpaddq %ymm4,%ymm6,%ymm4
+ vmovdqa 48(%rax),%ymm10
+ vmovdqa 112(%rax),%ymm5
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ vpmuludq %ymm2,%ymm7,%ymm13
+ vpmuludq %ymm2,%ymm8,%ymm14
+ vpmuludq %ymm2,%ymm9,%ymm15
+ vpmuludq %ymm2,%ymm10,%ymm11
+ vpmuludq %ymm2,%ymm5,%ymm12
+
+ vpmuludq %ymm0,%ymm8,%ymm6
+ vpmuludq %ymm1,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq 64(%rsp),%ymm4,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm11,%ymm11
+ vmovdqa -16(%rax),%ymm8
+
+ vpmuludq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm1,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vpmuludq %ymm3,%ymm7,%ymm6
+ vpmuludq %ymm4,%ymm7,%ymm2
+ vmovdqu 0(%rsi),%xmm7
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm2,%ymm15,%ymm15
+ vinserti128 $1,32(%rsi),%ymm7,%ymm7
+
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq %ymm4,%ymm8,%ymm2
+ vmovdqu 16(%rsi),%xmm8
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vmovdqa 16(%rax),%ymm2
+ vpmuludq %ymm1,%ymm9,%ymm6
+ vpmuludq %ymm0,%ymm9,%ymm9
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm9,%ymm13,%ymm13
+ vinserti128 $1,48(%rsi),%ymm8,%ymm8
+ leaq 64(%rsi),%rsi
+
+ vpmuludq %ymm1,%ymm2,%ymm6
+ vpmuludq %ymm0,%ymm2,%ymm2
+ vpsrldq $6,%ymm7,%ymm9
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm14,%ymm14
+ vpmuludq %ymm3,%ymm10,%ymm6
+ vpmuludq %ymm4,%ymm10,%ymm2
+ vpsrldq $6,%ymm8,%ymm10
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpunpckhqdq %ymm8,%ymm7,%ymm6
+
+ vpmuludq %ymm3,%ymm5,%ymm3
+ vpmuludq %ymm4,%ymm5,%ymm4
+ vpunpcklqdq %ymm8,%ymm7,%ymm7
+ vpaddq %ymm3,%ymm13,%ymm2
+ vpaddq %ymm4,%ymm14,%ymm3
+ vpunpcklqdq %ymm10,%ymm9,%ymm10
+ vpmuludq 80(%rax),%ymm0,%ymm4
+ vpmuludq %ymm1,%ymm5,%ymm0
+ vmovdqa 64(%rcx),%ymm5
+ vpaddq %ymm4,%ymm15,%ymm4
+ vpaddq %ymm0,%ymm11,%ymm0
+
+
+
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm12,%ymm1
+
+ vpsrlq $26,%ymm4,%ymm15
+ vpand %ymm5,%ymm4,%ymm4
+
+ vpsrlq $4,%ymm10,%ymm9
+
+ vpsrlq $26,%ymm1,%ymm12
+ vpand %ymm5,%ymm1,%ymm1
+ vpaddq %ymm12,%ymm2,%ymm2
+
+ vpaddq %ymm15,%ymm0,%ymm0
+ vpsllq $2,%ymm15,%ymm15
+ vpaddq %ymm15,%ymm0,%ymm0
+
+ vpand %ymm5,%ymm9,%ymm9
+ vpsrlq $26,%ymm7,%ymm8
+
+ vpsrlq $26,%ymm2,%ymm13
+ vpand %ymm5,%ymm2,%ymm2
+ vpaddq %ymm13,%ymm3,%ymm3
+
+ vpaddq %ymm9,%ymm2,%ymm2
+ vpsrlq $30,%ymm10,%ymm10
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $40,%ymm6,%ymm6
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpand %ymm5,%ymm7,%ymm7
+ vpand %ymm5,%ymm8,%ymm8
+ vpand %ymm5,%ymm10,%ymm10
+ vpor 32(%rcx),%ymm6,%ymm6
+
+ subq $64,%rdx
+ jnz .Loop_avx2
+
+.byte 0x66,0x90
+.Ltail_avx2:
+
+
+
+
+
+
+
+ vpaddq %ymm0,%ymm7,%ymm0
+ vmovdqu 4(%rsp),%ymm7
+ vpaddq %ymm1,%ymm8,%ymm1
+ vmovdqu 36(%rsp),%ymm8
+ vpaddq %ymm3,%ymm10,%ymm3
+ vmovdqu 100(%rsp),%ymm9
+ vpaddq %ymm4,%ymm6,%ymm4
+ vmovdqu 52(%rax),%ymm10
+ vmovdqu 116(%rax),%ymm5
+
+ vpmuludq %ymm2,%ymm7,%ymm13
+ vpmuludq %ymm2,%ymm8,%ymm14
+ vpmuludq %ymm2,%ymm9,%ymm15
+ vpmuludq %ymm2,%ymm10,%ymm11
+ vpmuludq %ymm2,%ymm5,%ymm12
+
+ vpmuludq %ymm0,%ymm8,%ymm6
+ vpmuludq %ymm1,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq 68(%rsp),%ymm4,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm11,%ymm11
+
+ vpmuludq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm1,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vmovdqu -12(%rax),%ymm8
+ vpaddq %ymm2,%ymm12,%ymm12
+ vpmuludq %ymm3,%ymm7,%ymm6
+ vpmuludq %ymm4,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm2,%ymm15,%ymm15
+
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq %ymm4,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vmovdqu 20(%rax),%ymm2
+ vpmuludq %ymm1,%ymm9,%ymm6
+ vpmuludq %ymm0,%ymm9,%ymm9
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm9,%ymm13,%ymm13
+
+ vpmuludq %ymm1,%ymm2,%ymm6
+ vpmuludq %ymm0,%ymm2,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm14,%ymm14
+ vpmuludq %ymm3,%ymm10,%ymm6
+ vpmuludq %ymm4,%ymm10,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+
+ vpmuludq %ymm3,%ymm5,%ymm3
+ vpmuludq %ymm4,%ymm5,%ymm4
+ vpaddq %ymm3,%ymm13,%ymm2
+ vpaddq %ymm4,%ymm14,%ymm3
+ vpmuludq 84(%rax),%ymm0,%ymm4
+ vpmuludq %ymm1,%ymm5,%ymm0
+ vmovdqa 64(%rcx),%ymm5
+ vpaddq %ymm4,%ymm15,%ymm4
+ vpaddq %ymm0,%ymm11,%ymm0
+
+
+
+
+ vpsrldq $8,%ymm12,%ymm8
+ vpsrldq $8,%ymm2,%ymm9
+ vpsrldq $8,%ymm3,%ymm10
+ vpsrldq $8,%ymm4,%ymm6
+ vpsrldq $8,%ymm0,%ymm7
+ vpaddq %ymm8,%ymm12,%ymm12
+ vpaddq %ymm9,%ymm2,%ymm2
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpaddq %ymm7,%ymm0,%ymm0
+
+ vpermq $0x2,%ymm3,%ymm10
+ vpermq $0x2,%ymm4,%ymm6
+ vpermq $0x2,%ymm0,%ymm7
+ vpermq $0x2,%ymm12,%ymm8
+ vpermq $0x2,%ymm2,%ymm9
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpaddq %ymm7,%ymm0,%ymm0
+ vpaddq %ymm8,%ymm12,%ymm12
+ vpaddq %ymm9,%ymm2,%ymm2
+
+
+
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm12,%ymm1
+
+ vpsrlq $26,%ymm4,%ymm15
+ vpand %ymm5,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm1,%ymm12
+ vpand %ymm5,%ymm1,%ymm1
+ vpaddq %ymm12,%ymm2,%ymm2
+
+ vpaddq %ymm15,%ymm0,%ymm0
+ vpsllq $2,%ymm15,%ymm15
+ vpaddq %ymm15,%ymm0,%ymm0
+
+ vpsrlq $26,%ymm2,%ymm13
+ vpand %ymm5,%ymm2,%ymm2
+ vpaddq %ymm13,%ymm3,%ymm3
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vmovd %xmm0,-112(%rdi)
+ vmovd %xmm1,-108(%rdi)
+ vmovd %xmm2,-104(%rdi)
+ vmovd %xmm3,-100(%rdi)
+ vmovd %xmm4,-96(%rdi)
+ leaq 8(%r11),%rsp
+.cfi_def_cfa %rsp,8
+ vzeroupper
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
+.align 64
+.Lconst:
+.Lmask24:
+.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
+.L129:
+.long 16777216,0,16777216,0,16777216,0,16777216,0
+.Lmask26:
+.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
+.Lpermd_avx2:
+.long 2,2,2,3,2,0,2,1
+.Lpermd_avx512:
+.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
+
+.L2_44_inp_permd:
+.long 0,1,1,2,2,3,7,7
+.L2_44_inp_shift:
+.quad 0,12,24,64
+.L2_44_mask:
+.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
+.L2_44_shift_rgt:
+.quad 44,44,42,64
+.L2_44_shift_lft:
+.quad 8,8,10,64
+
+.align 64
+.Lx_mask44:
+.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.Lx_mask42:
+.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 16
.globl xor128_encrypt_n_pad
diff --git a/secure/lib/libcrypto/amd64/rsaz-avx2.S b/secure/lib/libcrypto/amd64/rsaz-avx2.S
index e957915a7d81..3075a52d2eec 100644
--- a/secure/lib/libcrypto/amd64/rsaz-avx2.S
+++ b/secure/lib/libcrypto/amd64/rsaz-avx2.S
@@ -2,26 +2,1745 @@
/* Do not modify. This file is auto-generated from rsaz-avx2.pl. */
.text
-.globl rsaz_avx2_eligible
-.type rsaz_avx2_eligible,@function
-rsaz_avx2_eligible:
- xorl %eax,%eax
- .byte 0xf3,0xc3
-.size rsaz_avx2_eligible,.-rsaz_avx2_eligible
-
.globl rsaz_1024_sqr_avx2
-.globl rsaz_1024_mul_avx2
-.globl rsaz_1024_norm2red_avx2
-.globl rsaz_1024_red2norm_avx2
-.globl rsaz_1024_scatter5_avx2
-.globl rsaz_1024_gather5_avx2
.type rsaz_1024_sqr_avx2,@function
+.align 64
rsaz_1024_sqr_avx2:
+.cfi_startproc
+ leaq (%rsp),%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ vzeroupper
+ movq %rax,%rbp
+.cfi_def_cfa_register %rbp
+ movq %rdx,%r13
+ subq $832,%rsp
+ movq %r13,%r15
+ subq $-128,%rdi
+ subq $-128,%rsi
+ subq $-128,%r13
+
+ andq $4095,%r15
+ addq $320,%r15
+ shrq $12,%r15
+ vpxor %ymm9,%ymm9,%ymm9
+ jz .Lsqr_1024_no_n_copy
+
+
+
+
+
+ subq $320,%rsp
+ vmovdqu 0-128(%r13),%ymm0
+ andq $-2048,%rsp
+ vmovdqu 32-128(%r13),%ymm1
+ vmovdqu 64-128(%r13),%ymm2
+ vmovdqu 96-128(%r13),%ymm3
+ vmovdqu 128-128(%r13),%ymm4
+ vmovdqu 160-128(%r13),%ymm5
+ vmovdqu 192-128(%r13),%ymm6
+ vmovdqu 224-128(%r13),%ymm7
+ vmovdqu 256-128(%r13),%ymm8
+ leaq 832+128(%rsp),%r13
+ vmovdqu %ymm0,0-128(%r13)
+ vmovdqu %ymm1,32-128(%r13)
+ vmovdqu %ymm2,64-128(%r13)
+ vmovdqu %ymm3,96-128(%r13)
+ vmovdqu %ymm4,128-128(%r13)
+ vmovdqu %ymm5,160-128(%r13)
+ vmovdqu %ymm6,192-128(%r13)
+ vmovdqu %ymm7,224-128(%r13)
+ vmovdqu %ymm8,256-128(%r13)
+ vmovdqu %ymm9,288-128(%r13)
+
+.Lsqr_1024_no_n_copy:
+ andq $-1024,%rsp
+
+ vmovdqu 32-128(%rsi),%ymm1
+ vmovdqu 64-128(%rsi),%ymm2
+ vmovdqu 96-128(%rsi),%ymm3
+ vmovdqu 128-128(%rsi),%ymm4
+ vmovdqu 160-128(%rsi),%ymm5
+ vmovdqu 192-128(%rsi),%ymm6
+ vmovdqu 224-128(%rsi),%ymm7
+ vmovdqu 256-128(%rsi),%ymm8
+
+ leaq 192(%rsp),%rbx
+ vmovdqu .Land_mask(%rip),%ymm15
+ jmp .LOOP_GRANDE_SQR_1024
+
+.align 32
+.LOOP_GRANDE_SQR_1024:
+ leaq 576+128(%rsp),%r9
+ leaq 448(%rsp),%r12
+
+
+
+
+ vpaddq %ymm1,%ymm1,%ymm1
+ vpbroadcastq 0-128(%rsi),%ymm10
+ vpaddq %ymm2,%ymm2,%ymm2
+ vmovdqa %ymm1,0-128(%r9)
+ vpaddq %ymm3,%ymm3,%ymm3
+ vmovdqa %ymm2,32-128(%r9)
+ vpaddq %ymm4,%ymm4,%ymm4
+ vmovdqa %ymm3,64-128(%r9)
+ vpaddq %ymm5,%ymm5,%ymm5
+ vmovdqa %ymm4,96-128(%r9)
+ vpaddq %ymm6,%ymm6,%ymm6
+ vmovdqa %ymm5,128-128(%r9)
+ vpaddq %ymm7,%ymm7,%ymm7
+ vmovdqa %ymm6,160-128(%r9)
+ vpaddq %ymm8,%ymm8,%ymm8
+ vmovdqa %ymm7,192-128(%r9)
+ vpxor %ymm9,%ymm9,%ymm9
+ vmovdqa %ymm8,224-128(%r9)
+
+ vpmuludq 0-128(%rsi),%ymm10,%ymm0
+ vpbroadcastq 32-128(%rsi),%ymm11
+ vmovdqu %ymm9,288-192(%rbx)
+ vpmuludq %ymm10,%ymm1,%ymm1
+ vmovdqu %ymm9,320-448(%r12)
+ vpmuludq %ymm10,%ymm2,%ymm2
+ vmovdqu %ymm9,352-448(%r12)
+ vpmuludq %ymm10,%ymm3,%ymm3
+ vmovdqu %ymm9,384-448(%r12)
+ vpmuludq %ymm10,%ymm4,%ymm4
+ vmovdqu %ymm9,416-448(%r12)
+ vpmuludq %ymm10,%ymm5,%ymm5
+ vmovdqu %ymm9,448-448(%r12)
+ vpmuludq %ymm10,%ymm6,%ymm6
+ vmovdqu %ymm9,480-448(%r12)
+ vpmuludq %ymm10,%ymm7,%ymm7
+ vmovdqu %ymm9,512-448(%r12)
+ vpmuludq %ymm10,%ymm8,%ymm8
+ vpbroadcastq 64-128(%rsi),%ymm10
+ vmovdqu %ymm9,544-448(%r12)
+
+ movq %rsi,%r15
+ movl $4,%r14d
+ jmp .Lsqr_entry_1024
+.align 32
+.LOOP_SQR_1024:
+ vpbroadcastq 32-128(%r15),%ymm11
+ vpmuludq 0-128(%rsi),%ymm10,%ymm0
+ vpaddq 0-192(%rbx),%ymm0,%ymm0
+ vpmuludq 0-128(%r9),%ymm10,%ymm1
+ vpaddq 32-192(%rbx),%ymm1,%ymm1
+ vpmuludq 32-128(%r9),%ymm10,%ymm2
+ vpaddq 64-192(%rbx),%ymm2,%ymm2
+ vpmuludq 64-128(%r9),%ymm10,%ymm3
+ vpaddq 96-192(%rbx),%ymm3,%ymm3
+ vpmuludq 96-128(%r9),%ymm10,%ymm4
+ vpaddq 128-192(%rbx),%ymm4,%ymm4
+ vpmuludq 128-128(%r9),%ymm10,%ymm5
+ vpaddq 160-192(%rbx),%ymm5,%ymm5
+ vpmuludq 160-128(%r9),%ymm10,%ymm6
+ vpaddq 192-192(%rbx),%ymm6,%ymm6
+ vpmuludq 192-128(%r9),%ymm10,%ymm7
+ vpaddq 224-192(%rbx),%ymm7,%ymm7
+ vpmuludq 224-128(%r9),%ymm10,%ymm8
+ vpbroadcastq 64-128(%r15),%ymm10
+ vpaddq 256-192(%rbx),%ymm8,%ymm8
+.Lsqr_entry_1024:
+ vmovdqu %ymm0,0-192(%rbx)
+ vmovdqu %ymm1,32-192(%rbx)
+
+ vpmuludq 32-128(%rsi),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq 32-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm3,%ymm3
+ vpmuludq 64-128(%r9),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm4,%ymm4
+ vpmuludq 96-128(%r9),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq 128-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm6,%ymm6
+ vpmuludq 160-128(%r9),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm7,%ymm7
+ vpmuludq 192-128(%r9),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq 224-128(%r9),%ymm11,%ymm0
+ vpbroadcastq 96-128(%r15),%ymm11
+ vpaddq 288-192(%rbx),%ymm0,%ymm0
+
+ vmovdqu %ymm2,64-192(%rbx)
+ vmovdqu %ymm3,96-192(%rbx)
+
+ vpmuludq 64-128(%rsi),%ymm10,%ymm13
+ vpaddq %ymm13,%ymm4,%ymm4
+ vpmuludq 64-128(%r9),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq 96-128(%r9),%ymm10,%ymm14
+ vpaddq %ymm14,%ymm6,%ymm6
+ vpmuludq 128-128(%r9),%ymm10,%ymm13
+ vpaddq %ymm13,%ymm7,%ymm7
+ vpmuludq 160-128(%r9),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq 192-128(%r9),%ymm10,%ymm14
+ vpaddq %ymm14,%ymm0,%ymm0
+ vpmuludq 224-128(%r9),%ymm10,%ymm1
+ vpbroadcastq 128-128(%r15),%ymm10
+ vpaddq 320-448(%r12),%ymm1,%ymm1
+
+ vmovdqu %ymm4,128-192(%rbx)
+ vmovdqu %ymm5,160-192(%rbx)
+
+ vpmuludq 96-128(%rsi),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm6,%ymm6
+ vpmuludq 96-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm7,%ymm7
+ vpmuludq 128-128(%r9),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm8,%ymm8
+ vpmuludq 160-128(%r9),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm0,%ymm0
+ vpmuludq 192-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm1,%ymm1
+ vpmuludq 224-128(%r9),%ymm11,%ymm2
+ vpbroadcastq 160-128(%r15),%ymm11
+ vpaddq 352-448(%r12),%ymm2,%ymm2
+
+ vmovdqu %ymm6,192-192(%rbx)
+ vmovdqu %ymm7,224-192(%rbx)
+
+ vpmuludq 128-128(%rsi),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq 128-128(%r9),%ymm10,%ymm14
+ vpaddq %ymm14,%ymm0,%ymm0
+ vpmuludq 160-128(%r9),%ymm10,%ymm13
+ vpaddq %ymm13,%ymm1,%ymm1
+ vpmuludq 192-128(%r9),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq 224-128(%r9),%ymm10,%ymm3
+ vpbroadcastq 192-128(%r15),%ymm10
+ vpaddq 384-448(%r12),%ymm3,%ymm3
+
+ vmovdqu %ymm8,256-192(%rbx)
+ vmovdqu %ymm0,288-192(%rbx)
+ leaq 8(%rbx),%rbx
+
+ vpmuludq 160-128(%rsi),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm1,%ymm1
+ vpmuludq 160-128(%r9),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq 192-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm3,%ymm3
+ vpmuludq 224-128(%r9),%ymm11,%ymm4
+ vpbroadcastq 224-128(%r15),%ymm11
+ vpaddq 416-448(%r12),%ymm4,%ymm4
+
+ vmovdqu %ymm1,320-448(%r12)
+ vmovdqu %ymm2,352-448(%r12)
+
+ vpmuludq 192-128(%rsi),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm3,%ymm3
+ vpmuludq 192-128(%r9),%ymm10,%ymm14
+ vpbroadcastq 256-128(%r15),%ymm0
+ vpaddq %ymm14,%ymm4,%ymm4
+ vpmuludq 224-128(%r9),%ymm10,%ymm5
+ vpbroadcastq 0+8-128(%r15),%ymm10
+ vpaddq 448-448(%r12),%ymm5,%ymm5
+
+ vmovdqu %ymm3,384-448(%r12)
+ vmovdqu %ymm4,416-448(%r12)
+ leaq 8(%r15),%r15
+
+ vpmuludq 224-128(%rsi),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq 224-128(%r9),%ymm11,%ymm6
+ vpaddq 480-448(%r12),%ymm6,%ymm6
+
+ vpmuludq 256-128(%rsi),%ymm0,%ymm7
+ vmovdqu %ymm5,448-448(%r12)
+ vpaddq 512-448(%r12),%ymm7,%ymm7
+ vmovdqu %ymm6,480-448(%r12)
+ vmovdqu %ymm7,512-448(%r12)
+ leaq 8(%r12),%r12
+
+ decl %r14d
+ jnz .LOOP_SQR_1024
+
+ vmovdqu 256(%rsp),%ymm8
+ vmovdqu 288(%rsp),%ymm1
+ vmovdqu 320(%rsp),%ymm2
+ leaq 192(%rsp),%rbx
+
+ vpsrlq $29,%ymm8,%ymm14
+ vpand %ymm15,%ymm8,%ymm8
+ vpsrlq $29,%ymm1,%ymm11
+ vpand %ymm15,%ymm1,%ymm1
+
+ vpermq $0x93,%ymm14,%ymm14
+ vpxor %ymm9,%ymm9,%ymm9
+ vpermq $0x93,%ymm11,%ymm11
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm8,%ymm8
+ vpblendd $3,%ymm11,%ymm9,%ymm11
+ vpaddq %ymm14,%ymm1,%ymm1
+ vpaddq %ymm11,%ymm2,%ymm2
+ vmovdqu %ymm1,288-192(%rbx)
+ vmovdqu %ymm2,320-192(%rbx)
+
+ movq (%rsp),%rax
+ movq 8(%rsp),%r10
+ movq 16(%rsp),%r11
+ movq 24(%rsp),%r12
+ vmovdqu 32(%rsp),%ymm1
+ vmovdqu 64-192(%rbx),%ymm2
+ vmovdqu 96-192(%rbx),%ymm3
+ vmovdqu 128-192(%rbx),%ymm4
+ vmovdqu 160-192(%rbx),%ymm5
+ vmovdqu 192-192(%rbx),%ymm6
+ vmovdqu 224-192(%rbx),%ymm7
+
+ movq %rax,%r9
+ imull %ecx,%eax
+ andl $0x1fffffff,%eax
+ vmovd %eax,%xmm12
+
+ movq %rax,%rdx
+ imulq -128(%r13),%rax
+ vpbroadcastq %xmm12,%ymm12
+ addq %rax,%r9
+ movq %rdx,%rax
+ imulq 8-128(%r13),%rax
+ shrq $29,%r9
+ addq %rax,%r10
+ movq %rdx,%rax
+ imulq 16-128(%r13),%rax
+ addq %r9,%r10
+ addq %rax,%r11
+ imulq 24-128(%r13),%rdx
+ addq %rdx,%r12
+
+ movq %r10,%rax
+ imull %ecx,%eax
+ andl $0x1fffffff,%eax
+
+ movl $9,%r14d
+ jmp .LOOP_REDUCE_1024
+
+.align 32
+.LOOP_REDUCE_1024:
+ vmovd %eax,%xmm13
+ vpbroadcastq %xmm13,%ymm13
+
+ vpmuludq 32-128(%r13),%ymm12,%ymm10
+ movq %rax,%rdx
+ imulq -128(%r13),%rax
+ vpaddq %ymm10,%ymm1,%ymm1
+ addq %rax,%r10
+ vpmuludq 64-128(%r13),%ymm12,%ymm14
+ movq %rdx,%rax
+ imulq 8-128(%r13),%rax
+ vpaddq %ymm14,%ymm2,%ymm2
+ vpmuludq 96-128(%r13),%ymm12,%ymm11
+.byte 0x67
+ addq %rax,%r11
+.byte 0x67
+ movq %rdx,%rax
+ imulq 16-128(%r13),%rax
+ shrq $29,%r10
+ vpaddq %ymm11,%ymm3,%ymm3
+ vpmuludq 128-128(%r13),%ymm12,%ymm10
+ addq %rax,%r12
+ addq %r10,%r11
+ vpaddq %ymm10,%ymm4,%ymm4
+ vpmuludq 160-128(%r13),%ymm12,%ymm14
+ movq %r11,%rax
+ imull %ecx,%eax
+ vpaddq %ymm14,%ymm5,%ymm5
+ vpmuludq 192-128(%r13),%ymm12,%ymm11
+ andl $0x1fffffff,%eax
+ vpaddq %ymm11,%ymm6,%ymm6
+ vpmuludq 224-128(%r13),%ymm12,%ymm10
+ vpaddq %ymm10,%ymm7,%ymm7
+ vpmuludq 256-128(%r13),%ymm12,%ymm14
+ vmovd %eax,%xmm12
+
+ vpaddq %ymm14,%ymm8,%ymm8
+
+ vpbroadcastq %xmm12,%ymm12
+
+ vpmuludq 32-8-128(%r13),%ymm13,%ymm11
+ vmovdqu 96-8-128(%r13),%ymm14
+ movq %rax,%rdx
+ imulq -128(%r13),%rax
+ vpaddq %ymm11,%ymm1,%ymm1
+ vpmuludq 64-8-128(%r13),%ymm13,%ymm10
+ vmovdqu 128-8-128(%r13),%ymm11
+ addq %rax,%r11
+ movq %rdx,%rax
+ imulq 8-128(%r13),%rax
+ vpaddq %ymm10,%ymm2,%ymm2
+ addq %r12,%rax
+ shrq $29,%r11
+ vpmuludq %ymm13,%ymm14,%ymm14
+ vmovdqu 160-8-128(%r13),%ymm10
+ addq %r11,%rax
+ vpaddq %ymm14,%ymm3,%ymm3
+ vpmuludq %ymm13,%ymm11,%ymm11
+ vmovdqu 192-8-128(%r13),%ymm14
+.byte 0x67
+ movq %rax,%r12
+ imull %ecx,%eax
+ vpaddq %ymm11,%ymm4,%ymm4
+ vpmuludq %ymm13,%ymm10,%ymm10
+.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00
+ andl $0x1fffffff,%eax
+ vpaddq %ymm10,%ymm5,%ymm5
+ vpmuludq %ymm13,%ymm14,%ymm14
+ vmovdqu 256-8-128(%r13),%ymm10
+ vpaddq %ymm14,%ymm6,%ymm6
+ vpmuludq %ymm13,%ymm11,%ymm11
+ vmovdqu 288-8-128(%r13),%ymm9
+ vmovd %eax,%xmm0
+ imulq -128(%r13),%rax
+ vpaddq %ymm11,%ymm7,%ymm7
+ vpmuludq %ymm13,%ymm10,%ymm10
+ vmovdqu 32-16-128(%r13),%ymm14
+ vpbroadcastq %xmm0,%ymm0
+ vpaddq %ymm10,%ymm8,%ymm8
+ vpmuludq %ymm13,%ymm9,%ymm9
+ vmovdqu 64-16-128(%r13),%ymm11
+ addq %rax,%r12
+
+ vmovdqu 32-24-128(%r13),%ymm13
+ vpmuludq %ymm12,%ymm14,%ymm14
+ vmovdqu 96-16-128(%r13),%ymm10
+ vpaddq %ymm14,%ymm1,%ymm1
+ vpmuludq %ymm0,%ymm13,%ymm13
+ vpmuludq %ymm12,%ymm11,%ymm11
+.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff
+ vpaddq %ymm1,%ymm13,%ymm13
+ vpaddq %ymm11,%ymm2,%ymm2
+ vpmuludq %ymm12,%ymm10,%ymm10
+ vmovdqu 160-16-128(%r13),%ymm11
+.byte 0x67
+ vmovq %xmm13,%rax
+ vmovdqu %ymm13,(%rsp)
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpmuludq %ymm12,%ymm14,%ymm14
+ vmovdqu 192-16-128(%r13),%ymm10
+ vpaddq %ymm14,%ymm4,%ymm4
+ vpmuludq %ymm12,%ymm11,%ymm11
+ vmovdqu 224-16-128(%r13),%ymm14
+ vpaddq %ymm11,%ymm5,%ymm5
+ vpmuludq %ymm12,%ymm10,%ymm10
+ vmovdqu 256-16-128(%r13),%ymm11
+ vpaddq %ymm10,%ymm6,%ymm6
+ vpmuludq %ymm12,%ymm14,%ymm14
+ shrq $29,%r12
+ vmovdqu 288-16-128(%r13),%ymm10
+ addq %r12,%rax
+ vpaddq %ymm14,%ymm7,%ymm7
+ vpmuludq %ymm12,%ymm11,%ymm11
+
+ movq %rax,%r9
+ imull %ecx,%eax
+ vpaddq %ymm11,%ymm8,%ymm8
+ vpmuludq %ymm12,%ymm10,%ymm10
+ andl $0x1fffffff,%eax
+ vmovd %eax,%xmm12
+ vmovdqu 96-24-128(%r13),%ymm11
+.byte 0x67
+ vpaddq %ymm10,%ymm9,%ymm9
+ vpbroadcastq %xmm12,%ymm12
+
+ vpmuludq 64-24-128(%r13),%ymm0,%ymm14
+ vmovdqu 128-24-128(%r13),%ymm10
+ movq %rax,%rdx
+ imulq -128(%r13),%rax
+ movq 8(%rsp),%r10
+ vpaddq %ymm14,%ymm2,%ymm1
+ vpmuludq %ymm0,%ymm11,%ymm11
+ vmovdqu 160-24-128(%r13),%ymm14
+ addq %rax,%r9
+ movq %rdx,%rax
+ imulq 8-128(%r13),%rax
+.byte 0x67
+ shrq $29,%r9
+ movq 16(%rsp),%r11
+ vpaddq %ymm11,%ymm3,%ymm2
+ vpmuludq %ymm0,%ymm10,%ymm10
+ vmovdqu 192-24-128(%r13),%ymm11
+ addq %rax,%r10
+ movq %rdx,%rax
+ imulq 16-128(%r13),%rax
+ vpaddq %ymm10,%ymm4,%ymm3
+ vpmuludq %ymm0,%ymm14,%ymm14
+ vmovdqu 224-24-128(%r13),%ymm10
+ imulq 24-128(%r13),%rdx
+ addq %rax,%r11
+ leaq (%r9,%r10,1),%rax
+ vpaddq %ymm14,%ymm5,%ymm4
+ vpmuludq %ymm0,%ymm11,%ymm11
+ vmovdqu 256-24-128(%r13),%ymm14
+ movq %rax,%r10
+ imull %ecx,%eax
+ vpmuludq %ymm0,%ymm10,%ymm10
+ vpaddq %ymm11,%ymm6,%ymm5
+ vmovdqu 288-24-128(%r13),%ymm11
+ andl $0x1fffffff,%eax
+ vpaddq %ymm10,%ymm7,%ymm6
+ vpmuludq %ymm0,%ymm14,%ymm14
+ addq 24(%rsp),%rdx
+ vpaddq %ymm14,%ymm8,%ymm7
+ vpmuludq %ymm0,%ymm11,%ymm11
+ vpaddq %ymm11,%ymm9,%ymm8
+ vmovq %r12,%xmm9
+ movq %rdx,%r12
+
+ decl %r14d
+ jnz .LOOP_REDUCE_1024
+ leaq 448(%rsp),%r12
+ vpaddq %ymm9,%ymm13,%ymm0
+ vpxor %ymm9,%ymm9,%ymm9
+
+ vpaddq 288-192(%rbx),%ymm0,%ymm0
+ vpaddq 320-448(%r12),%ymm1,%ymm1
+ vpaddq 352-448(%r12),%ymm2,%ymm2
+ vpaddq 384-448(%r12),%ymm3,%ymm3
+ vpaddq 416-448(%r12),%ymm4,%ymm4
+ vpaddq 448-448(%r12),%ymm5,%ymm5
+ vpaddq 480-448(%r12),%ymm6,%ymm6
+ vpaddq 512-448(%r12),%ymm7,%ymm7
+ vpaddq 544-448(%r12),%ymm8,%ymm8
+
+ vpsrlq $29,%ymm0,%ymm14
+ vpand %ymm15,%ymm0,%ymm0
+ vpsrlq $29,%ymm1,%ymm11
+ vpand %ymm15,%ymm1,%ymm1
+ vpsrlq $29,%ymm2,%ymm12
+ vpermq $0x93,%ymm14,%ymm14
+ vpand %ymm15,%ymm2,%ymm2
+ vpsrlq $29,%ymm3,%ymm13
+ vpermq $0x93,%ymm11,%ymm11
+ vpand %ymm15,%ymm3,%ymm3
+ vpermq $0x93,%ymm12,%ymm12
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpermq $0x93,%ymm13,%ymm13
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm0,%ymm0
+ vpblendd $3,%ymm11,%ymm12,%ymm11
+ vpaddq %ymm14,%ymm1,%ymm1
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm11,%ymm2,%ymm2
+ vpblendd $3,%ymm13,%ymm9,%ymm13
+ vpaddq %ymm12,%ymm3,%ymm3
+ vpaddq %ymm13,%ymm4,%ymm4
+
+ vpsrlq $29,%ymm0,%ymm14
+ vpand %ymm15,%ymm0,%ymm0
+ vpsrlq $29,%ymm1,%ymm11
+ vpand %ymm15,%ymm1,%ymm1
+ vpsrlq $29,%ymm2,%ymm12
+ vpermq $0x93,%ymm14,%ymm14
+ vpand %ymm15,%ymm2,%ymm2
+ vpsrlq $29,%ymm3,%ymm13
+ vpermq $0x93,%ymm11,%ymm11
+ vpand %ymm15,%ymm3,%ymm3
+ vpermq $0x93,%ymm12,%ymm12
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpermq $0x93,%ymm13,%ymm13
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm0,%ymm0
+ vpblendd $3,%ymm11,%ymm12,%ymm11
+ vpaddq %ymm14,%ymm1,%ymm1
+ vmovdqu %ymm0,0-128(%rdi)
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm11,%ymm2,%ymm2
+ vmovdqu %ymm1,32-128(%rdi)
+ vpblendd $3,%ymm13,%ymm9,%ymm13
+ vpaddq %ymm12,%ymm3,%ymm3
+ vmovdqu %ymm2,64-128(%rdi)
+ vpaddq %ymm13,%ymm4,%ymm4
+ vmovdqu %ymm3,96-128(%rdi)
+ vpsrlq $29,%ymm4,%ymm14
+ vpand %ymm15,%ymm4,%ymm4
+ vpsrlq $29,%ymm5,%ymm11
+ vpand %ymm15,%ymm5,%ymm5
+ vpsrlq $29,%ymm6,%ymm12
+ vpermq $0x93,%ymm14,%ymm14
+ vpand %ymm15,%ymm6,%ymm6
+ vpsrlq $29,%ymm7,%ymm13
+ vpermq $0x93,%ymm11,%ymm11
+ vpand %ymm15,%ymm7,%ymm7
+ vpsrlq $29,%ymm8,%ymm0
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm8,%ymm8
+ vpermq $0x93,%ymm13,%ymm13
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpermq $0x93,%ymm0,%ymm0
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm4,%ymm4
+ vpblendd $3,%ymm11,%ymm12,%ymm11
+ vpaddq %ymm14,%ymm5,%ymm5
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm11,%ymm6,%ymm6
+ vpblendd $3,%ymm13,%ymm0,%ymm13
+ vpaddq %ymm12,%ymm7,%ymm7
+ vpaddq %ymm13,%ymm8,%ymm8
+
+ vpsrlq $29,%ymm4,%ymm14
+ vpand %ymm15,%ymm4,%ymm4
+ vpsrlq $29,%ymm5,%ymm11
+ vpand %ymm15,%ymm5,%ymm5
+ vpsrlq $29,%ymm6,%ymm12
+ vpermq $0x93,%ymm14,%ymm14
+ vpand %ymm15,%ymm6,%ymm6
+ vpsrlq $29,%ymm7,%ymm13
+ vpermq $0x93,%ymm11,%ymm11
+ vpand %ymm15,%ymm7,%ymm7
+ vpsrlq $29,%ymm8,%ymm0
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm8,%ymm8
+ vpermq $0x93,%ymm13,%ymm13
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpermq $0x93,%ymm0,%ymm0
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm4,%ymm4
+ vpblendd $3,%ymm11,%ymm12,%ymm11
+ vpaddq %ymm14,%ymm5,%ymm5
+ vmovdqu %ymm4,128-128(%rdi)
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm11,%ymm6,%ymm6
+ vmovdqu %ymm5,160-128(%rdi)
+ vpblendd $3,%ymm13,%ymm0,%ymm13
+ vpaddq %ymm12,%ymm7,%ymm7
+ vmovdqu %ymm6,192-128(%rdi)
+ vpaddq %ymm13,%ymm8,%ymm8
+ vmovdqu %ymm7,224-128(%rdi)
+ vmovdqu %ymm8,256-128(%rdi)
+
+ movq %rdi,%rsi
+ decl %r8d
+ jne .LOOP_GRANDE_SQR_1024
+
+ vzeroall
+ movq %rbp,%rax
+.cfi_def_cfa_register %rax
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lsqr_1024_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
+.globl rsaz_1024_mul_avx2
+.type rsaz_1024_mul_avx2,@function
+.align 64
rsaz_1024_mul_avx2:
-rsaz_1024_norm2red_avx2:
+.cfi_startproc
+ leaq (%rsp),%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ movq %rax,%rbp
+.cfi_def_cfa_register %rbp
+ vzeroall
+ movq %rdx,%r13
+ subq $64,%rsp
+
+
+
+
+
+
+.byte 0x67,0x67
+ movq %rsi,%r15
+ andq $4095,%r15
+ addq $320,%r15
+ shrq $12,%r15
+ movq %rsi,%r15
+ cmovnzq %r13,%rsi
+ cmovnzq %r15,%r13
+
+ movq %rcx,%r15
+ subq $-128,%rsi
+ subq $-128,%rcx
+ subq $-128,%rdi
+
+ andq $4095,%r15
+ addq $320,%r15
+.byte 0x67,0x67
+ shrq $12,%r15
+ jz .Lmul_1024_no_n_copy
+
+
+
+
+
+ subq $320,%rsp
+ vmovdqu 0-128(%rcx),%ymm0
+ andq $-512,%rsp
+ vmovdqu 32-128(%rcx),%ymm1
+ vmovdqu 64-128(%rcx),%ymm2
+ vmovdqu 96-128(%rcx),%ymm3
+ vmovdqu 128-128(%rcx),%ymm4
+ vmovdqu 160-128(%rcx),%ymm5
+ vmovdqu 192-128(%rcx),%ymm6
+ vmovdqu 224-128(%rcx),%ymm7
+ vmovdqu 256-128(%rcx),%ymm8
+ leaq 64+128(%rsp),%rcx
+ vmovdqu %ymm0,0-128(%rcx)
+ vpxor %ymm0,%ymm0,%ymm0
+ vmovdqu %ymm1,32-128(%rcx)
+ vpxor %ymm1,%ymm1,%ymm1
+ vmovdqu %ymm2,64-128(%rcx)
+ vpxor %ymm2,%ymm2,%ymm2
+ vmovdqu %ymm3,96-128(%rcx)
+ vpxor %ymm3,%ymm3,%ymm3
+ vmovdqu %ymm4,128-128(%rcx)
+ vpxor %ymm4,%ymm4,%ymm4
+ vmovdqu %ymm5,160-128(%rcx)
+ vpxor %ymm5,%ymm5,%ymm5
+ vmovdqu %ymm6,192-128(%rcx)
+ vpxor %ymm6,%ymm6,%ymm6
+ vmovdqu %ymm7,224-128(%rcx)
+ vpxor %ymm7,%ymm7,%ymm7
+ vmovdqu %ymm8,256-128(%rcx)
+ vmovdqa %ymm0,%ymm8
+ vmovdqu %ymm9,288-128(%rcx)
+.Lmul_1024_no_n_copy:
+ andq $-64,%rsp
+
+ movq (%r13),%rbx
+ vpbroadcastq (%r13),%ymm10
+ vmovdqu %ymm0,(%rsp)
+ xorq %r9,%r9
+.byte 0x67
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %r12,%r12
+
+ vmovdqu .Land_mask(%rip),%ymm15
+ movl $9,%r14d
+ vmovdqu %ymm9,288-128(%rdi)
+ jmp .Loop_mul_1024
+
+.align 32
+.Loop_mul_1024:
+ vpsrlq $29,%ymm3,%ymm9
+ movq %rbx,%rax
+ imulq -128(%rsi),%rax
+ addq %r9,%rax
+ movq %rbx,%r10
+ imulq 8-128(%rsi),%r10
+ addq 8(%rsp),%r10
+
+ movq %rax,%r9
+ imull %r8d,%eax
+ andl $0x1fffffff,%eax
+
+ movq %rbx,%r11
+ imulq 16-128(%rsi),%r11
+ addq 16(%rsp),%r11
+
+ movq %rbx,%r12
+ imulq 24-128(%rsi),%r12
+ addq 24(%rsp),%r12
+ vpmuludq 32-128(%rsi),%ymm10,%ymm0
+ vmovd %eax,%xmm11
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq 64-128(%rsi),%ymm10,%ymm12
+ vpbroadcastq %xmm11,%ymm11
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq 96-128(%rsi),%ymm10,%ymm13
+ vpand %ymm15,%ymm3,%ymm3
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq 128-128(%rsi),%ymm10,%ymm0
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq 160-128(%rsi),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq 192-128(%rsi),%ymm10,%ymm13
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq 224-128(%rsi),%ymm10,%ymm0
+ vpermq $0x93,%ymm9,%ymm9
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq 256-128(%rsi),%ymm10,%ymm12
+ vpbroadcastq 8(%r13),%ymm10
+ vpaddq %ymm12,%ymm8,%ymm8
+
+ movq %rax,%rdx
+ imulq -128(%rcx),%rax
+ addq %rax,%r9
+ movq %rdx,%rax
+ imulq 8-128(%rcx),%rax
+ addq %rax,%r10
+ movq %rdx,%rax
+ imulq 16-128(%rcx),%rax
+ addq %rax,%r11
+ shrq $29,%r9
+ imulq 24-128(%rcx),%rdx
+ addq %rdx,%r12
+ addq %r9,%r10
+
+ vpmuludq 32-128(%rcx),%ymm11,%ymm13
+ vmovq %xmm10,%rbx
+ vpaddq %ymm13,%ymm1,%ymm1
+ vpmuludq 64-128(%rcx),%ymm11,%ymm0
+ vpaddq %ymm0,%ymm2,%ymm2
+ vpmuludq 96-128(%rcx),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm3,%ymm3
+ vpmuludq 128-128(%rcx),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm4,%ymm4
+ vpmuludq 160-128(%rcx),%ymm11,%ymm0
+ vpaddq %ymm0,%ymm5,%ymm5
+ vpmuludq 192-128(%rcx),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm6,%ymm6
+ vpmuludq 224-128(%rcx),%ymm11,%ymm13
+ vpblendd $3,%ymm14,%ymm9,%ymm12
+ vpaddq %ymm13,%ymm7,%ymm7
+ vpmuludq 256-128(%rcx),%ymm11,%ymm0
+ vpaddq %ymm12,%ymm3,%ymm3
+ vpaddq %ymm0,%ymm8,%ymm8
+
+ movq %rbx,%rax
+ imulq -128(%rsi),%rax
+ addq %rax,%r10
+ vmovdqu -8+32-128(%rsi),%ymm12
+ movq %rbx,%rax
+ imulq 8-128(%rsi),%rax
+ addq %rax,%r11
+ vmovdqu -8+64-128(%rsi),%ymm13
+
+ movq %r10,%rax
+ vpblendd $0xfc,%ymm14,%ymm9,%ymm9
+ imull %r8d,%eax
+ vpaddq %ymm9,%ymm4,%ymm4
+ andl $0x1fffffff,%eax
+
+ imulq 16-128(%rsi),%rbx
+ addq %rbx,%r12
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovd %eax,%xmm11
+ vmovdqu -8+96-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm1,%ymm1
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vpbroadcastq %xmm11,%ymm11
+ vmovdqu -8+128-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm2,%ymm2
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -8+160-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm3,%ymm3
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovdqu -8+192-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm4,%ymm4
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -8+224-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm5,%ymm5
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -8+256-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm6,%ymm6
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovdqu -8+288-128(%rsi),%ymm9
+ vpaddq %ymm12,%ymm7,%ymm7
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vpaddq %ymm13,%ymm8,%ymm8
+ vpmuludq %ymm10,%ymm9,%ymm9
+ vpbroadcastq 16(%r13),%ymm10
+
+ movq %rax,%rdx
+ imulq -128(%rcx),%rax
+ addq %rax,%r10
+ vmovdqu -8+32-128(%rcx),%ymm0
+ movq %rdx,%rax
+ imulq 8-128(%rcx),%rax
+ addq %rax,%r11
+ vmovdqu -8+64-128(%rcx),%ymm12
+ shrq $29,%r10
+ imulq 16-128(%rcx),%rdx
+ addq %rdx,%r12
+ addq %r10,%r11
+
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovq %xmm10,%rbx
+ vmovdqu -8+96-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -8+128-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -8+160-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -8+192-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -8+224-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -8+256-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -8+288-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vpaddq %ymm13,%ymm9,%ymm9
+
+ vmovdqu -16+32-128(%rsi),%ymm0
+ movq %rbx,%rax
+ imulq -128(%rsi),%rax
+ addq %r11,%rax
+
+ vmovdqu -16+64-128(%rsi),%ymm12
+ movq %rax,%r11
+ imull %r8d,%eax
+ andl $0x1fffffff,%eax
+
+ imulq 8-128(%rsi),%rbx
+ addq %rbx,%r12
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovd %eax,%xmm11
+ vmovdqu -16+96-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vpbroadcastq %xmm11,%ymm11
+ vmovdqu -16+128-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -16+160-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -16+192-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovdqu -16+224-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -16+256-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -16+288-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vpbroadcastq 24(%r13),%ymm10
+ vpaddq %ymm13,%ymm9,%ymm9
+
+ vmovdqu -16+32-128(%rcx),%ymm0
+ movq %rax,%rdx
+ imulq -128(%rcx),%rax
+ addq %rax,%r11
+ vmovdqu -16+64-128(%rcx),%ymm12
+ imulq 8-128(%rcx),%rdx
+ addq %rdx,%r12
+ shrq $29,%r11
+
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovq %xmm10,%rbx
+ vmovdqu -16+96-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -16+128-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -16+160-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -16+192-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -16+224-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -16+256-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -16+288-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -24+32-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -24+64-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm9,%ymm9
+
+ addq %r11,%r12
+ imulq -128(%rsi),%rbx
+ addq %rbx,%r12
+
+ movq %r12,%rax
+ imull %r8d,%eax
+ andl $0x1fffffff,%eax
+
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovd %eax,%xmm11
+ vmovdqu -24+96-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vpbroadcastq %xmm11,%ymm11
+ vmovdqu -24+128-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -24+160-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -24+192-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovdqu -24+224-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -24+256-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -24+288-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vpbroadcastq 32(%r13),%ymm10
+ vpaddq %ymm13,%ymm9,%ymm9
+ addq $32,%r13
+
+ vmovdqu -24+32-128(%rcx),%ymm0
+ imulq -128(%rcx),%rax
+ addq %rax,%r12
+ shrq $29,%r12
+
+ vmovdqu -24+64-128(%rcx),%ymm12
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovq %xmm10,%rbx
+ vmovdqu -24+96-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm0
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu %ymm0,(%rsp)
+ vpaddq %ymm12,%ymm2,%ymm1
+ vmovdqu -24+128-128(%rcx),%ymm0
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -24+160-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm2
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -24+192-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm3
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -24+224-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm4
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -24+256-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm5
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -24+288-128(%rcx),%ymm13
+ movq %r12,%r9
+ vpaddq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm11,%ymm12,%ymm12
+ addq (%rsp),%r9
+ vpaddq %ymm12,%ymm8,%ymm7
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovq %r12,%xmm12
+ vpaddq %ymm13,%ymm9,%ymm8
+
+ decl %r14d
+ jnz .Loop_mul_1024
+ vpaddq (%rsp),%ymm12,%ymm0
+
+ vpsrlq $29,%ymm0,%ymm12
+ vpand %ymm15,%ymm0,%ymm0
+ vpsrlq $29,%ymm1,%ymm13
+ vpand %ymm15,%ymm1,%ymm1
+ vpsrlq $29,%ymm2,%ymm10
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm2,%ymm2
+ vpsrlq $29,%ymm3,%ymm11
+ vpermq $0x93,%ymm13,%ymm13
+ vpand %ymm15,%ymm3,%ymm3
+
+ vpblendd $3,%ymm14,%ymm12,%ymm9
+ vpermq $0x93,%ymm10,%ymm10
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpermq $0x93,%ymm11,%ymm11
+ vpaddq %ymm9,%ymm0,%ymm0
+ vpblendd $3,%ymm13,%ymm10,%ymm13
+ vpaddq %ymm12,%ymm1,%ymm1
+ vpblendd $3,%ymm10,%ymm11,%ymm10
+ vpaddq %ymm13,%ymm2,%ymm2
+ vpblendd $3,%ymm11,%ymm14,%ymm11
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm11,%ymm4,%ymm4
+
+ vpsrlq $29,%ymm0,%ymm12
+ vpand %ymm15,%ymm0,%ymm0
+ vpsrlq $29,%ymm1,%ymm13
+ vpand %ymm15,%ymm1,%ymm1
+ vpsrlq $29,%ymm2,%ymm10
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm2,%ymm2
+ vpsrlq $29,%ymm3,%ymm11
+ vpermq $0x93,%ymm13,%ymm13
+ vpand %ymm15,%ymm3,%ymm3
+ vpermq $0x93,%ymm10,%ymm10
+
+ vpblendd $3,%ymm14,%ymm12,%ymm9
+ vpermq $0x93,%ymm11,%ymm11
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm9,%ymm0,%ymm0
+ vpblendd $3,%ymm13,%ymm10,%ymm13
+ vpaddq %ymm12,%ymm1,%ymm1
+ vpblendd $3,%ymm10,%ymm11,%ymm10
+ vpaddq %ymm13,%ymm2,%ymm2
+ vpblendd $3,%ymm11,%ymm14,%ymm11
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm11,%ymm4,%ymm4
+
+ vmovdqu %ymm0,0-128(%rdi)
+ vmovdqu %ymm1,32-128(%rdi)
+ vmovdqu %ymm2,64-128(%rdi)
+ vmovdqu %ymm3,96-128(%rdi)
+ vpsrlq $29,%ymm4,%ymm12
+ vpand %ymm15,%ymm4,%ymm4
+ vpsrlq $29,%ymm5,%ymm13
+ vpand %ymm15,%ymm5,%ymm5
+ vpsrlq $29,%ymm6,%ymm10
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm6,%ymm6
+ vpsrlq $29,%ymm7,%ymm11
+ vpermq $0x93,%ymm13,%ymm13
+ vpand %ymm15,%ymm7,%ymm7
+ vpsrlq $29,%ymm8,%ymm0
+ vpermq $0x93,%ymm10,%ymm10
+ vpand %ymm15,%ymm8,%ymm8
+ vpermq $0x93,%ymm11,%ymm11
+
+ vpblendd $3,%ymm14,%ymm12,%ymm9
+ vpermq $0x93,%ymm0,%ymm0
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm9,%ymm4,%ymm4
+ vpblendd $3,%ymm13,%ymm10,%ymm13
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpblendd $3,%ymm10,%ymm11,%ymm10
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpblendd $3,%ymm11,%ymm0,%ymm11
+ vpaddq %ymm10,%ymm7,%ymm7
+ vpaddq %ymm11,%ymm8,%ymm8
+
+ vpsrlq $29,%ymm4,%ymm12
+ vpand %ymm15,%ymm4,%ymm4
+ vpsrlq $29,%ymm5,%ymm13
+ vpand %ymm15,%ymm5,%ymm5
+ vpsrlq $29,%ymm6,%ymm10
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm6,%ymm6
+ vpsrlq $29,%ymm7,%ymm11
+ vpermq $0x93,%ymm13,%ymm13
+ vpand %ymm15,%ymm7,%ymm7
+ vpsrlq $29,%ymm8,%ymm0
+ vpermq $0x93,%ymm10,%ymm10
+ vpand %ymm15,%ymm8,%ymm8
+ vpermq $0x93,%ymm11,%ymm11
+
+ vpblendd $3,%ymm14,%ymm12,%ymm9
+ vpermq $0x93,%ymm0,%ymm0
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm9,%ymm4,%ymm4
+ vpblendd $3,%ymm13,%ymm10,%ymm13
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpblendd $3,%ymm10,%ymm11,%ymm10
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpblendd $3,%ymm11,%ymm0,%ymm11
+ vpaddq %ymm10,%ymm7,%ymm7
+ vpaddq %ymm11,%ymm8,%ymm8
+
+ vmovdqu %ymm4,128-128(%rdi)
+ vmovdqu %ymm5,160-128(%rdi)
+ vmovdqu %ymm6,192-128(%rdi)
+ vmovdqu %ymm7,224-128(%rdi)
+ vmovdqu %ymm8,256-128(%rdi)
+ vzeroupper
+
+ movq %rbp,%rax
+.cfi_def_cfa_register %rax
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lmul_1024_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
+.globl rsaz_1024_red2norm_avx2
+.type rsaz_1024_red2norm_avx2,@function
+.align 32
rsaz_1024_red2norm_avx2:
+.cfi_startproc
+ subq $-128,%rsi
+ xorq %rax,%rax
+ movq -128(%rsi),%r8
+ movq -120(%rsi),%r9
+ movq -112(%rsi),%r10
+ shlq $0,%r8
+ shlq $29,%r9
+ movq %r10,%r11
+ shlq $58,%r10
+ shrq $6,%r11
+ addq %r8,%rax
+ addq %r9,%rax
+ addq %r10,%rax
+ adcq $0,%r11
+ movq %rax,0(%rdi)
+ movq %r11,%rax
+ movq -104(%rsi),%r8
+ movq -96(%rsi),%r9
+ shlq $23,%r8
+ movq %r9,%r10
+ shlq $52,%r9
+ shrq $12,%r10
+ addq %r8,%rax
+ addq %r9,%rax
+ adcq $0,%r10
+ movq %rax,8(%rdi)
+ movq %r10,%rax
+ movq -88(%rsi),%r11
+ movq -80(%rsi),%r8
+ shlq $17,%r11
+ movq %r8,%r9
+ shlq $46,%r8
+ shrq $18,%r9
+ addq %r11,%rax
+ addq %r8,%rax
+ adcq $0,%r9
+ movq %rax,16(%rdi)
+ movq %r9,%rax
+ movq -72(%rsi),%r10
+ movq -64(%rsi),%r11
+ shlq $11,%r10
+ movq %r11,%r8
+ shlq $40,%r11
+ shrq $24,%r8
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,24(%rdi)
+ movq %r8,%rax
+ movq -56(%rsi),%r9
+ movq -48(%rsi),%r10
+ movq -40(%rsi),%r11
+ shlq $5,%r9
+ shlq $34,%r10
+ movq %r11,%r8
+ shlq $63,%r11
+ shrq $1,%r8
+ addq %r9,%rax
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,32(%rdi)
+ movq %r8,%rax
+ movq -32(%rsi),%r9
+ movq -24(%rsi),%r10
+ shlq $28,%r9
+ movq %r10,%r11
+ shlq $57,%r10
+ shrq $7,%r11
+ addq %r9,%rax
+ addq %r10,%rax
+ adcq $0,%r11
+ movq %rax,40(%rdi)
+ movq %r11,%rax
+ movq -16(%rsi),%r8
+ movq -8(%rsi),%r9
+ shlq $22,%r8
+ movq %r9,%r10
+ shlq $51,%r9
+ shrq $13,%r10
+ addq %r8,%rax
+ addq %r9,%rax
+ adcq $0,%r10
+ movq %rax,48(%rdi)
+ movq %r10,%rax
+ movq 0(%rsi),%r11
+ movq 8(%rsi),%r8
+ shlq $16,%r11
+ movq %r8,%r9
+ shlq $45,%r8
+ shrq $19,%r9
+ addq %r11,%rax
+ addq %r8,%rax
+ adcq $0,%r9
+ movq %rax,56(%rdi)
+ movq %r9,%rax
+ movq 16(%rsi),%r10
+ movq 24(%rsi),%r11
+ shlq $10,%r10
+ movq %r11,%r8
+ shlq $39,%r11
+ shrq $25,%r8
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,64(%rdi)
+ movq %r8,%rax
+ movq 32(%rsi),%r9
+ movq 40(%rsi),%r10
+ movq 48(%rsi),%r11
+ shlq $4,%r9
+ shlq $33,%r10
+ movq %r11,%r8
+ shlq $62,%r11
+ shrq $2,%r8
+ addq %r9,%rax
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,72(%rdi)
+ movq %r8,%rax
+ movq 56(%rsi),%r9
+ movq 64(%rsi),%r10
+ shlq $27,%r9
+ movq %r10,%r11
+ shlq $56,%r10
+ shrq $8,%r11
+ addq %r9,%rax
+ addq %r10,%rax
+ adcq $0,%r11
+ movq %rax,80(%rdi)
+ movq %r11,%rax
+ movq 72(%rsi),%r8
+ movq 80(%rsi),%r9
+ shlq $21,%r8
+ movq %r9,%r10
+ shlq $50,%r9
+ shrq $14,%r10
+ addq %r8,%rax
+ addq %r9,%rax
+ adcq $0,%r10
+ movq %rax,88(%rdi)
+ movq %r10,%rax
+ movq 88(%rsi),%r11
+ movq 96(%rsi),%r8
+ shlq $15,%r11
+ movq %r8,%r9
+ shlq $44,%r8
+ shrq $20,%r9
+ addq %r11,%rax
+ addq %r8,%rax
+ adcq $0,%r9
+ movq %rax,96(%rdi)
+ movq %r9,%rax
+ movq 104(%rsi),%r10
+ movq 112(%rsi),%r11
+ shlq $9,%r10
+ movq %r11,%r8
+ shlq $38,%r11
+ shrq $26,%r8
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,104(%rdi)
+ movq %r8,%rax
+ movq 120(%rsi),%r9
+ movq 128(%rsi),%r10
+ movq 136(%rsi),%r11
+ shlq $3,%r9
+ shlq $32,%r10
+ movq %r11,%r8
+ shlq $61,%r11
+ shrq $3,%r8
+ addq %r9,%rax
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,112(%rdi)
+ movq %r8,%rax
+ movq 144(%rsi),%r9
+ movq 152(%rsi),%r10
+ shlq $26,%r9
+ movq %r10,%r11
+ shlq $55,%r10
+ shrq $9,%r11
+ addq %r9,%rax
+ addq %r10,%rax
+ adcq $0,%r11
+ movq %rax,120(%rdi)
+ movq %r11,%rax
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
+
+.globl rsaz_1024_norm2red_avx2
+.type rsaz_1024_norm2red_avx2,@function
+.align 32
+rsaz_1024_norm2red_avx2:
+.cfi_startproc
+ subq $-128,%rdi
+ movq (%rsi),%r8
+ movl $0x1fffffff,%eax
+ movq 8(%rsi),%r9
+ movq %r8,%r11
+ shrq $0,%r11
+ andq %rax,%r11
+ movq %r11,-128(%rdi)
+ movq %r8,%r10
+ shrq $29,%r10
+ andq %rax,%r10
+ movq %r10,-120(%rdi)
+ shrdq $58,%r9,%r8
+ andq %rax,%r8
+ movq %r8,-112(%rdi)
+ movq 16(%rsi),%r10
+ movq %r9,%r8
+ shrq $23,%r8
+ andq %rax,%r8
+ movq %r8,-104(%rdi)
+ shrdq $52,%r10,%r9
+ andq %rax,%r9
+ movq %r9,-96(%rdi)
+ movq 24(%rsi),%r11
+ movq %r10,%r9
+ shrq $17,%r9
+ andq %rax,%r9
+ movq %r9,-88(%rdi)
+ shrdq $46,%r11,%r10
+ andq %rax,%r10
+ movq %r10,-80(%rdi)
+ movq 32(%rsi),%r8
+ movq %r11,%r10
+ shrq $11,%r10
+ andq %rax,%r10
+ movq %r10,-72(%rdi)
+ shrdq $40,%r8,%r11
+ andq %rax,%r11
+ movq %r11,-64(%rdi)
+ movq 40(%rsi),%r9
+ movq %r8,%r11
+ shrq $5,%r11
+ andq %rax,%r11
+ movq %r11,-56(%rdi)
+ movq %r8,%r10
+ shrq $34,%r10
+ andq %rax,%r10
+ movq %r10,-48(%rdi)
+ shrdq $63,%r9,%r8
+ andq %rax,%r8
+ movq %r8,-40(%rdi)
+ movq 48(%rsi),%r10
+ movq %r9,%r8
+ shrq $28,%r8
+ andq %rax,%r8
+ movq %r8,-32(%rdi)
+ shrdq $57,%r10,%r9
+ andq %rax,%r9
+ movq %r9,-24(%rdi)
+ movq 56(%rsi),%r11
+ movq %r10,%r9
+ shrq $22,%r9
+ andq %rax,%r9
+ movq %r9,-16(%rdi)
+ shrdq $51,%r11,%r10
+ andq %rax,%r10
+ movq %r10,-8(%rdi)
+ movq 64(%rsi),%r8
+ movq %r11,%r10
+ shrq $16,%r10
+ andq %rax,%r10
+ movq %r10,0(%rdi)
+ shrdq $45,%r8,%r11
+ andq %rax,%r11
+ movq %r11,8(%rdi)
+ movq 72(%rsi),%r9
+ movq %r8,%r11
+ shrq $10,%r11
+ andq %rax,%r11
+ movq %r11,16(%rdi)
+ shrdq $39,%r9,%r8
+ andq %rax,%r8
+ movq %r8,24(%rdi)
+ movq 80(%rsi),%r10
+ movq %r9,%r8
+ shrq $4,%r8
+ andq %rax,%r8
+ movq %r8,32(%rdi)
+ movq %r9,%r11
+ shrq $33,%r11
+ andq %rax,%r11
+ movq %r11,40(%rdi)
+ shrdq $62,%r10,%r9
+ andq %rax,%r9
+ movq %r9,48(%rdi)
+ movq 88(%rsi),%r11
+ movq %r10,%r9
+ shrq $27,%r9
+ andq %rax,%r9
+ movq %r9,56(%rdi)
+ shrdq $56,%r11,%r10
+ andq %rax,%r10
+ movq %r10,64(%rdi)
+ movq 96(%rsi),%r8
+ movq %r11,%r10
+ shrq $21,%r10
+ andq %rax,%r10
+ movq %r10,72(%rdi)
+ shrdq $50,%r8,%r11
+ andq %rax,%r11
+ movq %r11,80(%rdi)
+ movq 104(%rsi),%r9
+ movq %r8,%r11
+ shrq $15,%r11
+ andq %rax,%r11
+ movq %r11,88(%rdi)
+ shrdq $44,%r9,%r8
+ andq %rax,%r8
+ movq %r8,96(%rdi)
+ movq 112(%rsi),%r10
+ movq %r9,%r8
+ shrq $9,%r8
+ andq %rax,%r8
+ movq %r8,104(%rdi)
+ shrdq $38,%r10,%r9
+ andq %rax,%r9
+ movq %r9,112(%rdi)
+ movq 120(%rsi),%r11
+ movq %r10,%r9
+ shrq $3,%r9
+ andq %rax,%r9
+ movq %r9,120(%rdi)
+ movq %r10,%r8
+ shrq $32,%r8
+ andq %rax,%r8
+ movq %r8,128(%rdi)
+ shrdq $61,%r11,%r10
+ andq %rax,%r10
+ movq %r10,136(%rdi)
+ xorq %r8,%r8
+ movq %r11,%r10
+ shrq $26,%r10
+ andq %rax,%r10
+ movq %r10,144(%rdi)
+ shrdq $55,%r8,%r11
+ andq %rax,%r11
+ movq %r11,152(%rdi)
+ movq %r8,160(%rdi)
+ movq %r8,168(%rdi)
+ movq %r8,176(%rdi)
+ movq %r8,184(%rdi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
+.globl rsaz_1024_scatter5_avx2
+.type rsaz_1024_scatter5_avx2,@function
+.align 32
rsaz_1024_scatter5_avx2:
+.cfi_startproc
+ vzeroupper
+ vmovdqu .Lscatter_permd(%rip),%ymm5
+ shll $4,%edx
+ leaq (%rdi,%rdx,1),%rdi
+ movl $9,%eax
+ jmp .Loop_scatter_1024
+
+.align 32
+.Loop_scatter_1024:
+ vmovdqu (%rsi),%ymm0
+ leaq 32(%rsi),%rsi
+ vpermd %ymm0,%ymm5,%ymm0
+ vmovdqu %xmm0,(%rdi)
+ leaq 512(%rdi),%rdi
+ decl %eax
+ jnz .Loop_scatter_1024
+
+ vzeroupper
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
+
+.globl rsaz_1024_gather5_avx2
+.type rsaz_1024_gather5_avx2,@function
+.align 32
rsaz_1024_gather5_avx2:
-.byte 0x0f,0x0b
+.cfi_startproc
+ vzeroupper
+ movq %rsp,%r11
+.cfi_def_cfa_register %r11
+ leaq -256(%rsp),%rsp
+ andq $-32,%rsp
+ leaq .Linc(%rip),%r10
+ leaq -128(%rsp),%rax
+
+ vmovd %edx,%xmm4
+ vmovdqa (%r10),%ymm0
+ vmovdqa 32(%r10),%ymm1
+ vmovdqa 64(%r10),%ymm5
+ vpbroadcastd %xmm4,%ymm4
+
+ vpaddd %ymm5,%ymm0,%ymm2
+ vpcmpeqd %ymm4,%ymm0,%ymm0
+ vpaddd %ymm5,%ymm1,%ymm3
+ vpcmpeqd %ymm4,%ymm1,%ymm1
+ vmovdqa %ymm0,0+128(%rax)
+ vpaddd %ymm5,%ymm2,%ymm0
+ vpcmpeqd %ymm4,%ymm2,%ymm2
+ vmovdqa %ymm1,32+128(%rax)
+ vpaddd %ymm5,%ymm3,%ymm1
+ vpcmpeqd %ymm4,%ymm3,%ymm3
+ vmovdqa %ymm2,64+128(%rax)
+ vpaddd %ymm5,%ymm0,%ymm2
+ vpcmpeqd %ymm4,%ymm0,%ymm0
+ vmovdqa %ymm3,96+128(%rax)
+ vpaddd %ymm5,%ymm1,%ymm3
+ vpcmpeqd %ymm4,%ymm1,%ymm1
+ vmovdqa %ymm0,128+128(%rax)
+ vpaddd %ymm5,%ymm2,%ymm8
+ vpcmpeqd %ymm4,%ymm2,%ymm2
+ vmovdqa %ymm1,160+128(%rax)
+ vpaddd %ymm5,%ymm3,%ymm9
+ vpcmpeqd %ymm4,%ymm3,%ymm3
+ vmovdqa %ymm2,192+128(%rax)
+ vpaddd %ymm5,%ymm8,%ymm10
+ vpcmpeqd %ymm4,%ymm8,%ymm8
+ vmovdqa %ymm3,224+128(%rax)
+ vpaddd %ymm5,%ymm9,%ymm11
+ vpcmpeqd %ymm4,%ymm9,%ymm9
+ vpaddd %ymm5,%ymm10,%ymm12
+ vpcmpeqd %ymm4,%ymm10,%ymm10
+ vpaddd %ymm5,%ymm11,%ymm13
+ vpcmpeqd %ymm4,%ymm11,%ymm11
+ vpaddd %ymm5,%ymm12,%ymm14
+ vpcmpeqd %ymm4,%ymm12,%ymm12
+ vpaddd %ymm5,%ymm13,%ymm15
+ vpcmpeqd %ymm4,%ymm13,%ymm13
+ vpcmpeqd %ymm4,%ymm14,%ymm14
+ vpcmpeqd %ymm4,%ymm15,%ymm15
+
+ vmovdqa -32(%r10),%ymm7
+ leaq 128(%rsi),%rsi
+ movl $9,%edx
+
+.Loop_gather_1024:
+ vmovdqa 0-128(%rsi),%ymm0
+ vmovdqa 32-128(%rsi),%ymm1
+ vmovdqa 64-128(%rsi),%ymm2
+ vmovdqa 96-128(%rsi),%ymm3
+ vpand 0+128(%rax),%ymm0,%ymm0
+ vpand 32+128(%rax),%ymm1,%ymm1
+ vpand 64+128(%rax),%ymm2,%ymm2
+ vpor %ymm0,%ymm1,%ymm4
+ vpand 96+128(%rax),%ymm3,%ymm3
+ vmovdqa 128-128(%rsi),%ymm0
+ vmovdqa 160-128(%rsi),%ymm1
+ vpor %ymm2,%ymm3,%ymm5
+ vmovdqa 192-128(%rsi),%ymm2
+ vmovdqa 224-128(%rsi),%ymm3
+ vpand 128+128(%rax),%ymm0,%ymm0
+ vpand 160+128(%rax),%ymm1,%ymm1
+ vpand 192+128(%rax),%ymm2,%ymm2
+ vpor %ymm0,%ymm4,%ymm4
+ vpand 224+128(%rax),%ymm3,%ymm3
+ vpand 256-128(%rsi),%ymm8,%ymm0
+ vpor %ymm1,%ymm5,%ymm5
+ vpand 288-128(%rsi),%ymm9,%ymm1
+ vpor %ymm2,%ymm4,%ymm4
+ vpand 320-128(%rsi),%ymm10,%ymm2
+ vpor %ymm3,%ymm5,%ymm5
+ vpand 352-128(%rsi),%ymm11,%ymm3
+ vpor %ymm0,%ymm4,%ymm4
+ vpand 384-128(%rsi),%ymm12,%ymm0
+ vpor %ymm1,%ymm5,%ymm5
+ vpand 416-128(%rsi),%ymm13,%ymm1
+ vpor %ymm2,%ymm4,%ymm4
+ vpand 448-128(%rsi),%ymm14,%ymm2
+ vpor %ymm3,%ymm5,%ymm5
+ vpand 480-128(%rsi),%ymm15,%ymm3
+ leaq 512(%rsi),%rsi
+ vpor %ymm0,%ymm4,%ymm4
+ vpor %ymm1,%ymm5,%ymm5
+ vpor %ymm2,%ymm4,%ymm4
+ vpor %ymm3,%ymm5,%ymm5
+
+ vpor %ymm5,%ymm4,%ymm4
+ vextracti128 $1,%ymm4,%xmm5
+ vpor %xmm4,%xmm5,%xmm5
+ vpermd %ymm5,%ymm7,%ymm5
+ vmovdqu %ymm5,(%rdi)
+ leaq 32(%rdi),%rdi
+ decl %edx
+ jnz .Loop_gather_1024
+
+ vpxor %ymm0,%ymm0,%ymm0
+ vmovdqu %ymm0,(%rdi)
+ vzeroupper
+ leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
.byte 0xf3,0xc3
-.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
+.cfi_endproc
+.LSEH_end_rsaz_1024_gather5:
+.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
+
+.globl rsaz_avx2_eligible
+.type rsaz_avx2_eligible,@function
+.align 32
+rsaz_avx2_eligible:
+ movl OPENSSL_ia32cap_P+8(%rip),%eax
+ movl $524544,%ecx
+ movl $0,%edx
+ andl %eax,%ecx
+ cmpl $524544,%ecx
+ cmovel %edx,%eax
+ andl $32,%eax
+ shrl $5,%eax
+ .byte 0xf3,0xc3
+.size rsaz_avx2_eligible,.-rsaz_avx2_eligible
+
+.align 64
+.Land_mask:
+.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
+.Lscatter_permd:
+.long 0,2,4,6,7,7,7,7
+.Lgather_permd:
+.long 0,7,1,7,2,7,3,7
+.Linc:
+.long 0,0,0,0, 1,1,1,1
+.long 2,2,2,2, 3,3,3,3
+.long 4,4,4,4, 4,4,4,4
+.align 64
diff --git a/secure/lib/libcrypto/amd64/rsaz-x86_64.S b/secure/lib/libcrypto/amd64/rsaz-x86_64.S
index ae64f7a73987..3ba29ea52dd9 100644
--- a/secure/lib/libcrypto/amd64/rsaz-x86_64.S
+++ b/secure/lib/libcrypto/amd64/rsaz-x86_64.S
@@ -35,6 +35,10 @@ rsaz_512_sqr:
movq (%rsi),%rdx
movq 8(%rsi),%rax
movq %rcx,128(%rsp)
+ movl $0x80100,%r11d
+ andl OPENSSL_ia32cap_P+8(%rip),%r11d
+ cmpl $0x80100,%r11d
+ je .Loop_sqrx
jmp .Loop_sqr
.align 32
@@ -405,6 +409,282 @@ rsaz_512_sqr:
decl %r8d
jnz .Loop_sqr
+ jmp .Lsqr_tail
+
+.align 32
+.Loop_sqrx:
+ movl %r8d,128+8(%rsp)
+.byte 102,72,15,110,199
+
+ mulxq %rax,%r8,%r9
+ movq %rax,%rbx
+
+ mulxq 16(%rsi),%rcx,%r10
+ xorq %rbp,%rbp
+
+ mulxq 24(%rsi),%rax,%r11
+ adcxq %rcx,%r9
+
+.byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00
+ adcxq %rax,%r10
+
+.byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00
+ adcxq %rcx,%r11
+
+ mulxq 48(%rsi),%rcx,%r14
+ adcxq %rax,%r12
+ adcxq %rcx,%r13
+
+ mulxq 56(%rsi),%rax,%r15
+ adcxq %rax,%r14
+ adcxq %rbp,%r15
+
+ mulxq %rdx,%rax,%rdi
+ movq %rbx,%rdx
+ xorq %rcx,%rcx
+ adoxq %r8,%r8
+ adcxq %rdi,%r8
+ adoxq %rbp,%rcx
+ adcxq %rbp,%rcx
+
+ movq %rax,(%rsp)
+ movq %r8,8(%rsp)
+
+
+.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00
+ adoxq %rax,%r10
+ adcxq %rbx,%r11
+
+ mulxq 24(%rsi),%rdi,%r8
+ adoxq %rdi,%r11
+.byte 0x66
+ adcxq %r8,%r12
+
+ mulxq 32(%rsi),%rax,%rbx
+ adoxq %rax,%r12
+ adcxq %rbx,%r13
+
+ mulxq 40(%rsi),%rdi,%r8
+ adoxq %rdi,%r13
+ adcxq %r8,%r14
+
+.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
+ adoxq %rax,%r14
+ adcxq %rbx,%r15
+
+.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00
+ adoxq %rdi,%r15
+ adcxq %rbp,%r8
+ mulxq %rdx,%rax,%rdi
+ adoxq %rbp,%r8
+.byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00
+
+ xorq %rbx,%rbx
+ adoxq %r9,%r9
+
+ adcxq %rcx,%rax
+ adoxq %r10,%r10
+ adcxq %rax,%r9
+ adoxq %rbp,%rbx
+ adcxq %rdi,%r10
+ adcxq %rbp,%rbx
+
+ movq %r9,16(%rsp)
+.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00
+
+
+ mulxq 24(%rsi),%rdi,%r9
+ adoxq %rdi,%r12
+ adcxq %r9,%r13
+
+ mulxq 32(%rsi),%rax,%rcx
+ adoxq %rax,%r13
+ adcxq %rcx,%r14
+
+.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00
+ adoxq %rdi,%r14
+ adcxq %r9,%r15
+
+.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00
+ adoxq %rax,%r15
+ adcxq %rcx,%r8
+
+ mulxq 56(%rsi),%rdi,%r9
+ adoxq %rdi,%r8
+ adcxq %rbp,%r9
+ mulxq %rdx,%rax,%rdi
+ adoxq %rbp,%r9
+ movq 24(%rsi),%rdx
+
+ xorq %rcx,%rcx
+ adoxq %r11,%r11
+
+ adcxq %rbx,%rax
+ adoxq %r12,%r12
+ adcxq %rax,%r11
+ adoxq %rbp,%rcx
+ adcxq %rdi,%r12
+ adcxq %rbp,%rcx
+
+ movq %r11,32(%rsp)
+ movq %r12,40(%rsp)
+
+
+ mulxq 32(%rsi),%rax,%rbx
+ adoxq %rax,%r14
+ adcxq %rbx,%r15
+
+ mulxq 40(%rsi),%rdi,%r10
+ adoxq %rdi,%r15
+ adcxq %r10,%r8
+
+ mulxq 48(%rsi),%rax,%rbx
+ adoxq %rax,%r8
+ adcxq %rbx,%r9
+
+ mulxq 56(%rsi),%rdi,%r10
+ adoxq %rdi,%r9
+ adcxq %rbp,%r10
+ mulxq %rdx,%rax,%rdi
+ adoxq %rbp,%r10
+ movq 32(%rsi),%rdx
+
+ xorq %rbx,%rbx
+ adoxq %r13,%r13
+
+ adcxq %rcx,%rax
+ adoxq %r14,%r14
+ adcxq %rax,%r13
+ adoxq %rbp,%rbx
+ adcxq %rdi,%r14
+ adcxq %rbp,%rbx
+
+ movq %r13,48(%rsp)
+ movq %r14,56(%rsp)
+
+
+ mulxq 40(%rsi),%rdi,%r11
+ adoxq %rdi,%r8
+ adcxq %r11,%r9
+
+ mulxq 48(%rsi),%rax,%rcx
+ adoxq %rax,%r9
+ adcxq %rcx,%r10
+
+ mulxq 56(%rsi),%rdi,%r11
+ adoxq %rdi,%r10
+ adcxq %rbp,%r11
+ mulxq %rdx,%rax,%rdi
+ movq 40(%rsi),%rdx
+ adoxq %rbp,%r11
+
+ xorq %rcx,%rcx
+ adoxq %r15,%r15
+
+ adcxq %rbx,%rax
+ adoxq %r8,%r8
+ adcxq %rax,%r15
+ adoxq %rbp,%rcx
+ adcxq %rdi,%r8
+ adcxq %rbp,%rcx
+
+ movq %r15,64(%rsp)
+ movq %r8,72(%rsp)
+
+
+.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
+ adoxq %rax,%r10
+ adcxq %rbx,%r11
+
+.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00
+ adoxq %rdi,%r11
+ adcxq %rbp,%r12
+ mulxq %rdx,%rax,%rdi
+ adoxq %rbp,%r12
+ movq 48(%rsi),%rdx
+
+ xorq %rbx,%rbx
+ adoxq %r9,%r9
+
+ adcxq %rcx,%rax
+ adoxq %r10,%r10
+ adcxq %rax,%r9
+ adcxq %rdi,%r10
+ adoxq %rbp,%rbx
+ adcxq %rbp,%rbx
+
+ movq %r9,80(%rsp)
+ movq %r10,88(%rsp)
+
+
+.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00
+ adoxq %rax,%r12
+ adoxq %rbp,%r13
+
+ mulxq %rdx,%rax,%rdi
+ xorq %rcx,%rcx
+ movq 56(%rsi),%rdx
+ adoxq %r11,%r11
+
+ adcxq %rbx,%rax
+ adoxq %r12,%r12
+ adcxq %rax,%r11
+ adoxq %rbp,%rcx
+ adcxq %rdi,%r12
+ adcxq %rbp,%rcx
+
+.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00
+.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00
+
+
+ mulxq %rdx,%rax,%rdx
+ xorq %rbx,%rbx
+ adoxq %r13,%r13
+
+ adcxq %rcx,%rax
+ adoxq %rbp,%rbx
+ adcxq %r13,%rax
+ adcxq %rdx,%rbx
+
+.byte 102,72,15,126,199
+.byte 102,72,15,126,205
+
+ movq 128(%rsp),%rdx
+ movq (%rsp),%r8
+ movq 8(%rsp),%r9
+ movq 16(%rsp),%r10
+ movq 24(%rsp),%r11
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%r13
+ movq 48(%rsp),%r14
+ movq 56(%rsp),%r15
+
+ movq %rax,112(%rsp)
+ movq %rbx,120(%rsp)
+
+ call __rsaz_512_reducex
+
+ addq 64(%rsp),%r8
+ adcq 72(%rsp),%r9
+ adcq 80(%rsp),%r10
+ adcq 88(%rsp),%r11
+ adcq 96(%rsp),%r12
+ adcq 104(%rsp),%r13
+ adcq 112(%rsp),%r14
+ adcq 120(%rsp),%r15
+ sbbq %rcx,%rcx
+
+ call __rsaz_512_subtract
+
+ movq %r8,%rdx
+ movq %r9,%rax
+ movl 128+8(%rsp),%r8d
+ movq %rdi,%rsi
+
+ decl %r8d
+ jnz .Loop_sqrx
+
+.Lsqr_tail:
leaq 128+24+48(%rsp),%rax
.cfi_def_cfa %rax,8
@@ -456,6 +736,10 @@ rsaz_512_mul:
.byte 102,72,15,110,199
.byte 102,72,15,110,201
movq %r8,128(%rsp)
+ movl $0x80100,%r11d
+ andl OPENSSL_ia32cap_P+8(%rip),%r11d
+ cmpl $0x80100,%r11d
+ je .Lmulx
movq (%rdx),%rbx
movq %rdx,%rbp
call __rsaz_512_mul
@@ -473,6 +757,29 @@ rsaz_512_mul:
movq 56(%rsp),%r15
call __rsaz_512_reduce
+ jmp .Lmul_tail
+
+.align 32
+.Lmulx:
+ movq %rdx,%rbp
+ movq (%rdx),%rdx
+ call __rsaz_512_mulx
+
+.byte 102,72,15,126,199
+.byte 102,72,15,126,205
+
+ movq 128(%rsp),%rdx
+ movq (%rsp),%r8
+ movq 8(%rsp),%r9
+ movq 16(%rsp),%r10
+ movq 24(%rsp),%r11
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%r13
+ movq 48(%rsp),%r14
+ movq 56(%rsp),%r15
+
+ call __rsaz_512_reducex
+.Lmul_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@@ -586,6 +893,10 @@ rsaz_512_mul_gather4:
por %xmm9,%xmm8
pshufd $0x4e,%xmm8,%xmm9
por %xmm9,%xmm8
+ movl $0x80100,%r11d
+ andl OPENSSL_ia32cap_P+8(%rip),%r11d
+ cmpl $0x80100,%r11d
+ je .Lmulx_gather
.byte 102,76,15,126,195
movq %r8,128(%rsp)
@@ -766,6 +1077,142 @@ rsaz_512_mul_gather4:
movq 56(%rsp),%r15
call __rsaz_512_reduce
+ jmp .Lmul_gather_tail
+
+.align 32
+.Lmulx_gather:
+.byte 102,76,15,126,194
+
+ movq %r8,128(%rsp)
+ movq %rdi,128+8(%rsp)
+ movq %rcx,128+16(%rsp)
+
+ mulxq (%rsi),%rbx,%r8
+ movq %rbx,(%rsp)
+ xorl %edi,%edi
+
+ mulxq 8(%rsi),%rax,%r9
+
+ mulxq 16(%rsi),%rbx,%r10
+ adcxq %rax,%r8
+
+ mulxq 24(%rsi),%rax,%r11
+ adcxq %rbx,%r9
+
+ mulxq 32(%rsi),%rbx,%r12
+ adcxq %rax,%r10
+
+ mulxq 40(%rsi),%rax,%r13
+ adcxq %rbx,%r11
+
+ mulxq 48(%rsi),%rbx,%r14
+ adcxq %rax,%r12
+
+ mulxq 56(%rsi),%rax,%r15
+ adcxq %rbx,%r13
+ adcxq %rax,%r14
+.byte 0x67
+ movq %r8,%rbx
+ adcxq %rdi,%r15
+
+ movq $-7,%rcx
+ jmp .Loop_mulx_gather
+
+.align 32
+.Loop_mulx_gather:
+ movdqa 0(%rbp),%xmm8
+ movdqa 16(%rbp),%xmm9
+ movdqa 32(%rbp),%xmm10
+ movdqa 48(%rbp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rbp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rbp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rbp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rbp),%xmm15
+ leaq 128(%rbp),%rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+.byte 102,76,15,126,194
+
+.byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00
+ adcxq %rax,%rbx
+ adoxq %r9,%r8
+
+ mulxq 8(%rsi),%rax,%r9
+ adcxq %rax,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rsi),%rax,%r10
+ adcxq %rax,%r9
+ adoxq %r11,%r10
+
+.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+
+ mulxq 32(%rsi),%rax,%r12
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+
+ mulxq 40(%rsi),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
+ adcxq %rax,%r13
+.byte 0x67
+ adoxq %r15,%r14
+
+ mulxq 56(%rsi),%rax,%r15
+ movq %rbx,64(%rsp,%rcx,8)
+ adcxq %rax,%r14
+ adoxq %rdi,%r15
+ movq %r8,%rbx
+ adcxq %rdi,%r15
+
+ incq %rcx
+ jnz .Loop_mulx_gather
+
+ movq %r8,64(%rsp)
+ movq %r9,64+8(%rsp)
+ movq %r10,64+16(%rsp)
+ movq %r11,64+24(%rsp)
+ movq %r12,64+32(%rsp)
+ movq %r13,64+40(%rsp)
+ movq %r14,64+48(%rsp)
+ movq %r15,64+56(%rsp)
+
+ movq 128(%rsp),%rdx
+ movq 128+8(%rsp),%rdi
+ movq 128+16(%rsp),%rbp
+
+ movq (%rsp),%r8
+ movq 8(%rsp),%r9
+ movq 16(%rsp),%r10
+ movq 24(%rsp),%r11
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%r13
+ movq 48(%rsp),%r14
+ movq 56(%rsp),%r15
+
+ call __rsaz_512_reducex
+
+.Lmul_gather_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@@ -833,6 +1280,10 @@ rsaz_512_mul_scatter4:
movq %rcx,128(%rsp)
movq %rdi,%rbp
+ movl $0x80100,%r11d
+ andl OPENSSL_ia32cap_P+8(%rip),%r11d
+ cmpl $0x80100,%r11d
+ je .Lmulx_scatter
movq (%rdi),%rbx
call __rsaz_512_mul
@@ -849,6 +1300,29 @@ rsaz_512_mul_scatter4:
movq 56(%rsp),%r15
call __rsaz_512_reduce
+ jmp .Lmul_scatter_tail
+
+.align 32
+.Lmulx_scatter:
+ movq (%rdi),%rdx
+ call __rsaz_512_mulx
+
+.byte 102,72,15,126,199
+.byte 102,72,15,126,205
+
+ movq 128(%rsp),%rdx
+ movq (%rsp),%r8
+ movq 8(%rsp),%r9
+ movq 16(%rsp),%r10
+ movq 24(%rsp),%r11
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%r13
+ movq 48(%rsp),%r14
+ movq 56(%rsp),%r15
+
+ call __rsaz_512_reducex
+
+.Lmul_scatter_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@@ -918,6 +1392,7 @@ rsaz_512_mul_by_one:
subq $128+24,%rsp
.cfi_adjust_cfa_offset 128+24
.Lmul_by_one_body:
+ movl OPENSSL_ia32cap_P+8(%rip),%eax
movq %rdx,%rbp
movq %rcx,128(%rsp)
@@ -938,7 +1413,16 @@ rsaz_512_mul_by_one:
movdqa %xmm0,64(%rsp)
movdqa %xmm0,80(%rsp)
movdqa %xmm0,96(%rsp)
+ andl $0x80100,%eax
+ cmpl $0x80100,%eax
+ je .Lby_one_callx
call __rsaz_512_reduce
+ jmp .Lby_one_tail
+.align 32
+.Lby_one_callx:
+ movq 128(%rsp),%rdx
+ call __rsaz_512_reducex
+.Lby_one_tail:
movq %r8,(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
@@ -1053,6 +1537,64 @@ __rsaz_512_reduce:
.byte 0xf3,0xc3
.cfi_endproc
.size __rsaz_512_reduce,.-__rsaz_512_reduce
+.type __rsaz_512_reducex,@function
+.align 32
+__rsaz_512_reducex:
+.cfi_startproc
+
+ imulq %r8,%rdx
+ xorq %rsi,%rsi
+ movl $8,%ecx
+ jmp .Lreduction_loopx
+
+.align 32
+.Lreduction_loopx:
+ movq %r8,%rbx
+ mulxq 0(%rbp),%rax,%r8
+ adcxq %rbx,%rax
+ adoxq %r9,%r8
+
+ mulxq 8(%rbp),%rax,%r9
+ adcxq %rax,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rbp),%rbx,%r10
+ adcxq %rbx,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rbp),%rbx,%r11
+ adcxq %rbx,%r10
+ adoxq %r12,%r11
+
+.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
+ movq %rdx,%rax
+ movq %r8,%rdx
+ adcxq %rbx,%r11
+ adoxq %r13,%r12
+
+ mulxq 128+8(%rsp),%rbx,%rdx
+ movq %rax,%rdx
+
+ mulxq 40(%rbp),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+ mulxq 56(%rbp),%rax,%r15
+ movq %rbx,%rdx
+ adcxq %rax,%r14
+ adoxq %rsi,%r15
+ adcxq %rsi,%r15
+
+ decl %ecx
+ jne .Lreduction_loopx
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __rsaz_512_reducex,.-__rsaz_512_reducex
.type __rsaz_512_subtract,@function
.align 32
__rsaz_512_subtract:
@@ -1256,6 +1798,128 @@ __rsaz_512_mul:
.byte 0xf3,0xc3
.cfi_endproc
.size __rsaz_512_mul,.-__rsaz_512_mul
+.type __rsaz_512_mulx,@function
+.align 32
+__rsaz_512_mulx:
+.cfi_startproc
+ mulxq (%rsi),%rbx,%r8
+ movq $-6,%rcx
+
+ mulxq 8(%rsi),%rax,%r9
+ movq %rbx,8(%rsp)
+
+ mulxq 16(%rsi),%rbx,%r10
+ adcq %rax,%r8
+
+ mulxq 24(%rsi),%rax,%r11
+ adcq %rbx,%r9
+
+ mulxq 32(%rsi),%rbx,%r12
+ adcq %rax,%r10
+
+ mulxq 40(%rsi),%rax,%r13
+ adcq %rbx,%r11
+
+ mulxq 48(%rsi),%rbx,%r14
+ adcq %rax,%r12
+
+ mulxq 56(%rsi),%rax,%r15
+ movq 8(%rbp),%rdx
+ adcq %rbx,%r13
+ adcq %rax,%r14
+ adcq $0,%r15
+
+ xorq %rdi,%rdi
+ jmp .Loop_mulx
+
+.align 32
+.Loop_mulx:
+ movq %r8,%rbx
+ mulxq (%rsi),%rax,%r8
+ adcxq %rax,%rbx
+ adoxq %r9,%r8
+
+ mulxq 8(%rsi),%rax,%r9
+ adcxq %rax,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rsi),%rax,%r10
+ adcxq %rax,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rsi),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+
+.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+
+ mulxq 40(%rsi),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+ mulxq 48(%rsi),%rax,%r14
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+ mulxq 56(%rsi),%rax,%r15
+ movq 64(%rbp,%rcx,8),%rdx
+ movq %rbx,8+64-8(%rsp,%rcx,8)
+ adcxq %rax,%r14
+ adoxq %rdi,%r15
+ adcxq %rdi,%r15
+
+ incq %rcx
+ jnz .Loop_mulx
+
+ movq %r8,%rbx
+ mulxq (%rsi),%rax,%r8
+ adcxq %rax,%rbx
+ adoxq %r9,%r8
+
+.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00
+ adcxq %rax,%r8
+ adoxq %r10,%r9
+
+.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00
+ adcxq %rax,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rsi),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+
+ mulxq 32(%rsi),%rax,%r12
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+
+ mulxq 40(%rsi),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
+ adcxq %rax,%r14
+ adoxq %rdi,%r15
+ adcxq %rdi,%r15
+
+ movq %rbx,8+64-8(%rsp)
+ movq %r8,8+64(%rsp)
+ movq %r9,8+64+8(%rsp)
+ movq %r10,8+64+16(%rsp)
+ movq %r11,8+64+24(%rsp)
+ movq %r12,8+64+32(%rsp)
+ movq %r13,8+64+40(%rsp)
+ movq %r14,8+64+48(%rsp)
+ movq %r15,8+64+56(%rsp)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __rsaz_512_mulx,.-__rsaz_512_mulx
.globl rsaz_512_scatter4
.type rsaz_512_scatter4,@function
.align 16
diff --git a/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S b/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S
index 488e554c247e..0090e020c573 100644
--- a/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S
+++ b/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S
@@ -12,6 +12,8 @@ sha1_multi_block:
movq OPENSSL_ia32cap_P+4(%rip),%rcx
btq $61,%rcx
jc _shaext_shortcut
+ testl $268435456,%ecx
+ jnz _avx_shortcut
movq %rsp,%rax
.cfi_def_cfa_register %rax
pushq %rbx
@@ -2937,6 +2939,4319 @@ _shaext_shortcut:
.byte 0xf3,0xc3
.cfi_endproc
.size sha1_multi_block_shaext,.-sha1_multi_block_shaext
+.type sha1_multi_block_avx,@function
+.align 32
+sha1_multi_block_avx:
+.cfi_startproc
+_avx_shortcut:
+ shrq $32,%rcx
+ cmpl $2,%edx
+ jb .Lavx
+ testl $32,%ecx
+ jnz _avx2_shortcut
+ jmp .Lavx
+.align 32
+.Lavx:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ subq $288,%rsp
+ andq $-256,%rsp
+ movq %rax,272(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08
+.Lbody_avx:
+ leaq K_XX_XX(%rip),%rbp
+ leaq 256(%rsp),%rbx
+
+ vzeroupper
+.Loop_grande_avx:
+ movl %edx,280(%rsp)
+ xorl %edx,%edx
+ movq 0(%rsi),%r8
+ movl 8(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,0(%rbx)
+ cmovleq %rbp,%r8
+ movq 16(%rsi),%r9
+ movl 24(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,4(%rbx)
+ cmovleq %rbp,%r9
+ movq 32(%rsi),%r10
+ movl 40(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,8(%rbx)
+ cmovleq %rbp,%r10
+ movq 48(%rsi),%r11
+ movl 56(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,12(%rbx)
+ cmovleq %rbp,%r11
+ testl %edx,%edx
+ jz .Ldone_avx
+
+ vmovdqu 0(%rdi),%xmm10
+ leaq 128(%rsp),%rax
+ vmovdqu 32(%rdi),%xmm11
+ vmovdqu 64(%rdi),%xmm12
+ vmovdqu 96(%rdi),%xmm13
+ vmovdqu 128(%rdi),%xmm14
+ vmovdqu 96(%rbp),%xmm5
+ jmp .Loop_avx
+
+.align 32
+.Loop_avx:
+ vmovdqa -32(%rbp),%xmm15
+ vmovd (%r8),%xmm0
+ leaq 64(%r8),%r8
+ vmovd (%r9),%xmm2
+ leaq 64(%r9),%r9
+ vpinsrd $1,(%r10),%xmm0,%xmm0
+ leaq 64(%r10),%r10
+ vpinsrd $1,(%r11),%xmm2,%xmm2
+ leaq 64(%r11),%r11
+ vmovd -60(%r8),%xmm1
+ vpunpckldq %xmm2,%xmm0,%xmm0
+ vmovd -60(%r9),%xmm9
+ vpshufb %xmm5,%xmm0,%xmm0
+ vpinsrd $1,-60(%r10),%xmm1,%xmm1
+ vpinsrd $1,-60(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpandn %xmm13,%xmm11,%xmm7
+ vpand %xmm12,%xmm11,%xmm6
+
+ vmovdqa %xmm0,0-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpunpckldq %xmm9,%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -56(%r8),%xmm2
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -56(%r9),%xmm9
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpshufb %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpinsrd $1,-56(%r10),%xmm2,%xmm2
+ vpinsrd $1,-56(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpandn %xmm12,%xmm10,%xmm7
+ vpand %xmm11,%xmm10,%xmm6
+
+ vmovdqa %xmm1,16-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpunpckldq %xmm9,%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -52(%r8),%xmm3
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -52(%r9),%xmm9
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpshufb %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpinsrd $1,-52(%r10),%xmm3,%xmm3
+ vpinsrd $1,-52(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpandn %xmm11,%xmm14,%xmm7
+ vpand %xmm10,%xmm14,%xmm6
+
+ vmovdqa %xmm2,32-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpunpckldq %xmm9,%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -48(%r8),%xmm4
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -48(%r9),%xmm9
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpshufb %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpinsrd $1,-48(%r10),%xmm4,%xmm4
+ vpinsrd $1,-48(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpandn %xmm10,%xmm13,%xmm7
+ vpand %xmm14,%xmm13,%xmm6
+
+ vmovdqa %xmm3,48-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpunpckldq %xmm9,%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -44(%r8),%xmm0
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -44(%r9),%xmm9
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpshufb %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpinsrd $1,-44(%r10),%xmm0,%xmm0
+ vpinsrd $1,-44(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpandn %xmm14,%xmm12,%xmm7
+ vpand %xmm13,%xmm12,%xmm6
+
+ vmovdqa %xmm4,64-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpunpckldq %xmm9,%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -40(%r8),%xmm1
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -40(%r9),%xmm9
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpshufb %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpinsrd $1,-40(%r10),%xmm1,%xmm1
+ vpinsrd $1,-40(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpandn %xmm13,%xmm11,%xmm7
+ vpand %xmm12,%xmm11,%xmm6
+
+ vmovdqa %xmm0,80-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpunpckldq %xmm9,%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -36(%r8),%xmm2
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -36(%r9),%xmm9
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpshufb %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpinsrd $1,-36(%r10),%xmm2,%xmm2
+ vpinsrd $1,-36(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpandn %xmm12,%xmm10,%xmm7
+ vpand %xmm11,%xmm10,%xmm6
+
+ vmovdqa %xmm1,96-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpunpckldq %xmm9,%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -32(%r8),%xmm3
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -32(%r9),%xmm9
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpshufb %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpinsrd $1,-32(%r10),%xmm3,%xmm3
+ vpinsrd $1,-32(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpandn %xmm11,%xmm14,%xmm7
+ vpand %xmm10,%xmm14,%xmm6
+
+ vmovdqa %xmm2,112-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpunpckldq %xmm9,%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -28(%r8),%xmm4
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -28(%r9),%xmm9
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpshufb %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpinsrd $1,-28(%r10),%xmm4,%xmm4
+ vpinsrd $1,-28(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpandn %xmm10,%xmm13,%xmm7
+ vpand %xmm14,%xmm13,%xmm6
+
+ vmovdqa %xmm3,128-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpunpckldq %xmm9,%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -24(%r8),%xmm0
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -24(%r9),%xmm9
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpshufb %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpinsrd $1,-24(%r10),%xmm0,%xmm0
+ vpinsrd $1,-24(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpandn %xmm14,%xmm12,%xmm7
+ vpand %xmm13,%xmm12,%xmm6
+
+ vmovdqa %xmm4,144-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpunpckldq %xmm9,%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -20(%r8),%xmm1
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -20(%r9),%xmm9
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpshufb %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpinsrd $1,-20(%r10),%xmm1,%xmm1
+ vpinsrd $1,-20(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpandn %xmm13,%xmm11,%xmm7
+ vpand %xmm12,%xmm11,%xmm6
+
+ vmovdqa %xmm0,160-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpunpckldq %xmm9,%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -16(%r8),%xmm2
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -16(%r9),%xmm9
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpshufb %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpinsrd $1,-16(%r10),%xmm2,%xmm2
+ vpinsrd $1,-16(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpandn %xmm12,%xmm10,%xmm7
+ vpand %xmm11,%xmm10,%xmm6
+
+ vmovdqa %xmm1,176-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpunpckldq %xmm9,%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -12(%r8),%xmm3
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -12(%r9),%xmm9
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpshufb %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpinsrd $1,-12(%r10),%xmm3,%xmm3
+ vpinsrd $1,-12(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpandn %xmm11,%xmm14,%xmm7
+ vpand %xmm10,%xmm14,%xmm6
+
+ vmovdqa %xmm2,192-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpunpckldq %xmm9,%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -8(%r8),%xmm4
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -8(%r9),%xmm9
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpshufb %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpinsrd $1,-8(%r10),%xmm4,%xmm4
+ vpinsrd $1,-8(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpandn %xmm10,%xmm13,%xmm7
+ vpand %xmm14,%xmm13,%xmm6
+
+ vmovdqa %xmm3,208-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpunpckldq %xmm9,%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -4(%r8),%xmm0
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -4(%r9),%xmm9
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpshufb %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vmovdqa 0-128(%rax),%xmm1
+ vpinsrd $1,-4(%r10),%xmm0,%xmm0
+ vpinsrd $1,-4(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm10,%xmm10
+ prefetcht0 63(%r8)
+ vpslld $5,%xmm11,%xmm8
+ vpandn %xmm14,%xmm12,%xmm7
+ vpand %xmm13,%xmm12,%xmm6
+
+ vmovdqa %xmm4,224-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpunpckldq %xmm9,%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ prefetcht0 63(%r9)
+ vpxor %xmm7,%xmm6,%xmm6
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ prefetcht0 63(%r10)
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ prefetcht0 63(%r11)
+ vpshufb %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vmovdqa 16-128(%rax),%xmm2
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 32-128(%rax),%xmm3
+
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpandn %xmm13,%xmm11,%xmm7
+
+ vpand %xmm12,%xmm11,%xmm6
+
+ vmovdqa %xmm0,240-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 128-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 48-128(%rax),%xmm4
+
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpandn %xmm12,%xmm10,%xmm7
+
+ vpand %xmm11,%xmm10,%xmm6
+
+ vmovdqa %xmm1,0-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 144-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 64-128(%rax),%xmm0
+
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpandn %xmm11,%xmm14,%xmm7
+
+ vpand %xmm10,%xmm14,%xmm6
+
+ vmovdqa %xmm2,16-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 160-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 80-128(%rax),%xmm1
+
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpandn %xmm10,%xmm13,%xmm7
+
+ vpand %xmm14,%xmm13,%xmm6
+
+ vmovdqa %xmm3,32-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 176-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 96-128(%rax),%xmm2
+
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpandn %xmm14,%xmm12,%xmm7
+
+ vpand %xmm13,%xmm12,%xmm6
+
+ vmovdqa %xmm4,48-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 192-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vmovdqa 0(%rbp),%xmm15
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 112-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,64-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 208-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 128-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,80-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 224-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 144-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vmovdqa %xmm2,96-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 240-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 160-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vmovdqa %xmm3,112-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 0-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 176-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vmovdqa %xmm4,128-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 16-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 192-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,144-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 32-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 208-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,160-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 48-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 224-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vmovdqa %xmm2,176-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 64-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 240-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vmovdqa %xmm3,192-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 80-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 0-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vmovdqa %xmm4,208-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 96-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 16-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,224-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 112-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 32-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,240-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 128-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 48-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vmovdqa %xmm2,0-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 144-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 64-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vmovdqa %xmm3,16-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 160-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 80-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vmovdqa %xmm4,32-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 176-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 96-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,48-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 192-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 112-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,64-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 208-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 128-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vmovdqa %xmm2,80-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 224-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 144-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vmovdqa %xmm3,96-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 240-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 160-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vmovdqa %xmm4,112-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 0-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vmovdqa 32(%rbp),%xmm15
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 176-128(%rax),%xmm3
+
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpand %xmm12,%xmm13,%xmm7
+ vpxor 16-128(%rax),%xmm1,%xmm1
+
+ vpaddd %xmm7,%xmm14,%xmm14
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm13,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vmovdqu %xmm0,128-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm1,%xmm5
+ vpand %xmm11,%xmm6,%xmm6
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 192-128(%rax),%xmm4
+
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpand %xmm11,%xmm12,%xmm7
+ vpxor 32-128(%rax),%xmm2,%xmm2
+
+ vpaddd %xmm7,%xmm13,%xmm13
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm12,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vmovdqu %xmm1,144-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm2,%xmm5
+ vpand %xmm10,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 208-128(%rax),%xmm0
+
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpand %xmm10,%xmm11,%xmm7
+ vpxor 48-128(%rax),%xmm3,%xmm3
+
+ vpaddd %xmm7,%xmm12,%xmm12
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm11,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vmovdqu %xmm2,160-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm3,%xmm5
+ vpand %xmm14,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 224-128(%rax),%xmm1
+
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpand %xmm14,%xmm10,%xmm7
+ vpxor 64-128(%rax),%xmm4,%xmm4
+
+ vpaddd %xmm7,%xmm11,%xmm11
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm10,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vmovdqu %xmm3,176-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm4,%xmm5
+ vpand %xmm13,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 240-128(%rax),%xmm2
+
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpand %xmm13,%xmm14,%xmm7
+ vpxor 80-128(%rax),%xmm0,%xmm0
+
+ vpaddd %xmm7,%xmm10,%xmm10
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm14,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vmovdqu %xmm4,192-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm0,%xmm5
+ vpand %xmm12,%xmm6,%xmm6
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 0-128(%rax),%xmm3
+
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpand %xmm12,%xmm13,%xmm7
+ vpxor 96-128(%rax),%xmm1,%xmm1
+
+ vpaddd %xmm7,%xmm14,%xmm14
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm13,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vmovdqu %xmm0,208-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm1,%xmm5
+ vpand %xmm11,%xmm6,%xmm6
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 16-128(%rax),%xmm4
+
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpand %xmm11,%xmm12,%xmm7
+ vpxor 112-128(%rax),%xmm2,%xmm2
+
+ vpaddd %xmm7,%xmm13,%xmm13
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm12,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vmovdqu %xmm1,224-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm2,%xmm5
+ vpand %xmm10,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 32-128(%rax),%xmm0
+
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpand %xmm10,%xmm11,%xmm7
+ vpxor 128-128(%rax),%xmm3,%xmm3
+
+ vpaddd %xmm7,%xmm12,%xmm12
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm11,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vmovdqu %xmm2,240-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm3,%xmm5
+ vpand %xmm14,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 48-128(%rax),%xmm1
+
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpand %xmm14,%xmm10,%xmm7
+ vpxor 144-128(%rax),%xmm4,%xmm4
+
+ vpaddd %xmm7,%xmm11,%xmm11
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm10,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vmovdqu %xmm3,0-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm4,%xmm5
+ vpand %xmm13,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 64-128(%rax),%xmm2
+
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpand %xmm13,%xmm14,%xmm7
+ vpxor 160-128(%rax),%xmm0,%xmm0
+
+ vpaddd %xmm7,%xmm10,%xmm10
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm14,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vmovdqu %xmm4,16-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm0,%xmm5
+ vpand %xmm12,%xmm6,%xmm6
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 80-128(%rax),%xmm3
+
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpand %xmm12,%xmm13,%xmm7
+ vpxor 176-128(%rax),%xmm1,%xmm1
+
+ vpaddd %xmm7,%xmm14,%xmm14
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm13,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vmovdqu %xmm0,32-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm1,%xmm5
+ vpand %xmm11,%xmm6,%xmm6
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 96-128(%rax),%xmm4
+
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpand %xmm11,%xmm12,%xmm7
+ vpxor 192-128(%rax),%xmm2,%xmm2
+
+ vpaddd %xmm7,%xmm13,%xmm13
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm12,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vmovdqu %xmm1,48-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm2,%xmm5
+ vpand %xmm10,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 112-128(%rax),%xmm0
+
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpand %xmm10,%xmm11,%xmm7
+ vpxor 208-128(%rax),%xmm3,%xmm3
+
+ vpaddd %xmm7,%xmm12,%xmm12
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm11,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vmovdqu %xmm2,64-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm3,%xmm5
+ vpand %xmm14,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 128-128(%rax),%xmm1
+
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpand %xmm14,%xmm10,%xmm7
+ vpxor 224-128(%rax),%xmm4,%xmm4
+
+ vpaddd %xmm7,%xmm11,%xmm11
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm10,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vmovdqu %xmm3,80-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm4,%xmm5
+ vpand %xmm13,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 144-128(%rax),%xmm2
+
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpand %xmm13,%xmm14,%xmm7
+ vpxor 240-128(%rax),%xmm0,%xmm0
+
+ vpaddd %xmm7,%xmm10,%xmm10
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm14,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vmovdqu %xmm4,96-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm0,%xmm5
+ vpand %xmm12,%xmm6,%xmm6
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 160-128(%rax),%xmm3
+
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpand %xmm12,%xmm13,%xmm7
+ vpxor 0-128(%rax),%xmm1,%xmm1
+
+ vpaddd %xmm7,%xmm14,%xmm14
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm13,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vmovdqu %xmm0,112-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm1,%xmm5
+ vpand %xmm11,%xmm6,%xmm6
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 176-128(%rax),%xmm4
+
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpand %xmm11,%xmm12,%xmm7
+ vpxor 16-128(%rax),%xmm2,%xmm2
+
+ vpaddd %xmm7,%xmm13,%xmm13
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm12,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vmovdqu %xmm1,128-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm2,%xmm5
+ vpand %xmm10,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 192-128(%rax),%xmm0
+
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpand %xmm10,%xmm11,%xmm7
+ vpxor 32-128(%rax),%xmm3,%xmm3
+
+ vpaddd %xmm7,%xmm12,%xmm12
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm11,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vmovdqu %xmm2,144-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm3,%xmm5
+ vpand %xmm14,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 208-128(%rax),%xmm1
+
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpand %xmm14,%xmm10,%xmm7
+ vpxor 48-128(%rax),%xmm4,%xmm4
+
+ vpaddd %xmm7,%xmm11,%xmm11
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm10,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vmovdqu %xmm3,160-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm4,%xmm5
+ vpand %xmm13,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 224-128(%rax),%xmm2
+
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpand %xmm13,%xmm14,%xmm7
+ vpxor 64-128(%rax),%xmm0,%xmm0
+
+ vpaddd %xmm7,%xmm10,%xmm10
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm14,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vmovdqu %xmm4,176-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm0,%xmm5
+ vpand %xmm12,%xmm6,%xmm6
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vmovdqa 64(%rbp),%xmm15
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 240-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,192-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 80-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 0-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,208-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 96-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 16-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vmovdqa %xmm2,224-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 112-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 32-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vmovdqa %xmm3,240-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 128-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 48-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vmovdqa %xmm4,0-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 144-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 64-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,16-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 160-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 80-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,32-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 176-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 96-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vmovdqa %xmm2,48-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 192-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 112-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vmovdqa %xmm3,64-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 208-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 128-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vmovdqa %xmm4,80-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 224-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 144-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,96-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 240-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 160-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,112-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 0-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 176-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 16-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 192-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 32-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 208-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 48-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 224-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 64-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 240-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 80-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 0-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 96-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 16-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 112-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+
+ vpsrld $27,%xmm11,%xmm9
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor %xmm13,%xmm6,%xmm6
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm7,%xmm12,%xmm12
+ movl $1,%ecx
+ cmpl 0(%rbx),%ecx
+ cmovgeq %rbp,%r8
+ cmpl 4(%rbx),%ecx
+ cmovgeq %rbp,%r9
+ cmpl 8(%rbx),%ecx
+ cmovgeq %rbp,%r10
+ cmpl 12(%rbx),%ecx
+ cmovgeq %rbp,%r11
+ vmovdqu (%rbx),%xmm6
+ vpxor %xmm8,%xmm8,%xmm8
+ vmovdqa %xmm6,%xmm7
+ vpcmpgtd %xmm8,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm6,%xmm6
+
+ vpand %xmm7,%xmm10,%xmm10
+ vpand %xmm7,%xmm11,%xmm11
+ vpaddd 0(%rdi),%xmm10,%xmm10
+ vpand %xmm7,%xmm12,%xmm12
+ vpaddd 32(%rdi),%xmm11,%xmm11
+ vpand %xmm7,%xmm13,%xmm13
+ vpaddd 64(%rdi),%xmm12,%xmm12
+ vpand %xmm7,%xmm14,%xmm14
+ vpaddd 96(%rdi),%xmm13,%xmm13
+ vpaddd 128(%rdi),%xmm14,%xmm14
+ vmovdqu %xmm10,0(%rdi)
+ vmovdqu %xmm11,32(%rdi)
+ vmovdqu %xmm12,64(%rdi)
+ vmovdqu %xmm13,96(%rdi)
+ vmovdqu %xmm14,128(%rdi)
+
+ vmovdqu %xmm6,(%rbx)
+ vmovdqu 96(%rbp),%xmm5
+ decl %edx
+ jnz .Loop_avx
+
+ movl 280(%rsp),%edx
+ leaq 16(%rdi),%rdi
+ leaq 64(%rsi),%rsi
+ decl %edx
+ jnz .Loop_grande_avx
+
+.Ldone_avx:
+ movq 272(%rsp),%rax
+.cfi_def_cfa %rax,8
+ vzeroupper
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha1_multi_block_avx,.-sha1_multi_block_avx
+.type sha1_multi_block_avx2,@function
+.align 32
+sha1_multi_block_avx2:
+.cfi_startproc
+_avx2_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ subq $576,%rsp
+ andq $-256,%rsp
+ movq %rax,544(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xa0,0x04,0x06,0x23,0x08
+.Lbody_avx2:
+ leaq K_XX_XX(%rip),%rbp
+ shrl $1,%edx
+
+ vzeroupper
+.Loop_grande_avx2:
+ movl %edx,552(%rsp)
+ xorl %edx,%edx
+ leaq 512(%rsp),%rbx
+ movq 0(%rsi),%r12
+ movl 8(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,0(%rbx)
+ cmovleq %rbp,%r12
+ movq 16(%rsi),%r13
+ movl 24(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,4(%rbx)
+ cmovleq %rbp,%r13
+ movq 32(%rsi),%r14
+ movl 40(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,8(%rbx)
+ cmovleq %rbp,%r14
+ movq 48(%rsi),%r15
+ movl 56(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,12(%rbx)
+ cmovleq %rbp,%r15
+ movq 64(%rsi),%r8
+ movl 72(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,16(%rbx)
+ cmovleq %rbp,%r8
+ movq 80(%rsi),%r9
+ movl 88(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,20(%rbx)
+ cmovleq %rbp,%r9
+ movq 96(%rsi),%r10
+ movl 104(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,24(%rbx)
+ cmovleq %rbp,%r10
+ movq 112(%rsi),%r11
+ movl 120(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,28(%rbx)
+ cmovleq %rbp,%r11
+ vmovdqu 0(%rdi),%ymm0
+ leaq 128(%rsp),%rax
+ vmovdqu 32(%rdi),%ymm1
+ leaq 256+128(%rsp),%rbx
+ vmovdqu 64(%rdi),%ymm2
+ vmovdqu 96(%rdi),%ymm3
+ vmovdqu 128(%rdi),%ymm4
+ vmovdqu 96(%rbp),%ymm9
+ jmp .Loop_avx2
+
+.align 32
+.Loop_avx2:
+ vmovdqa -32(%rbp),%ymm15
+ vmovd (%r12),%xmm10
+ leaq 64(%r12),%r12
+ vmovd (%r8),%xmm12
+ leaq 64(%r8),%r8
+ vmovd (%r13),%xmm7
+ leaq 64(%r13),%r13
+ vmovd (%r9),%xmm6
+ leaq 64(%r9),%r9
+ vpinsrd $1,(%r14),%xmm10,%xmm10
+ leaq 64(%r14),%r14
+ vpinsrd $1,(%r10),%xmm12,%xmm12
+ leaq 64(%r10),%r10
+ vpinsrd $1,(%r15),%xmm7,%xmm7
+ leaq 64(%r15),%r15
+ vpunpckldq %ymm7,%ymm10,%ymm10
+ vpinsrd $1,(%r11),%xmm6,%xmm6
+ leaq 64(%r11),%r11
+ vpunpckldq %ymm6,%ymm12,%ymm12
+ vmovd -60(%r12),%xmm11
+ vinserti128 $1,%xmm12,%ymm10,%ymm10
+ vmovd -60(%r8),%xmm8
+ vpshufb %ymm9,%ymm10,%ymm10
+ vmovd -60(%r13),%xmm7
+ vmovd -60(%r9),%xmm6
+ vpinsrd $1,-60(%r14),%xmm11,%xmm11
+ vpinsrd $1,-60(%r10),%xmm8,%xmm8
+ vpinsrd $1,-60(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm11,%ymm11
+ vpinsrd $1,-60(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpandn %ymm3,%ymm1,%ymm6
+ vpand %ymm2,%ymm1,%ymm5
+
+ vmovdqa %ymm10,0-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vinserti128 $1,%xmm8,%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -56(%r12),%xmm12
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -56(%r8),%xmm8
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpshufb %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vmovd -56(%r13),%xmm7
+ vmovd -56(%r9),%xmm6
+ vpinsrd $1,-56(%r14),%xmm12,%xmm12
+ vpinsrd $1,-56(%r10),%xmm8,%xmm8
+ vpinsrd $1,-56(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm12,%ymm12
+ vpinsrd $1,-56(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpandn %ymm2,%ymm0,%ymm6
+ vpand %ymm1,%ymm0,%ymm5
+
+ vmovdqa %ymm11,32-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vinserti128 $1,%xmm8,%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -52(%r12),%xmm13
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -52(%r8),%xmm8
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vmovd -52(%r13),%xmm7
+ vmovd -52(%r9),%xmm6
+ vpinsrd $1,-52(%r14),%xmm13,%xmm13
+ vpinsrd $1,-52(%r10),%xmm8,%xmm8
+ vpinsrd $1,-52(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm13,%ymm13
+ vpinsrd $1,-52(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpandn %ymm1,%ymm4,%ymm6
+ vpand %ymm0,%ymm4,%ymm5
+
+ vmovdqa %ymm12,64-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vinserti128 $1,%xmm8,%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -48(%r12),%xmm14
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -48(%r8),%xmm8
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpshufb %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vmovd -48(%r13),%xmm7
+ vmovd -48(%r9),%xmm6
+ vpinsrd $1,-48(%r14),%xmm14,%xmm14
+ vpinsrd $1,-48(%r10),%xmm8,%xmm8
+ vpinsrd $1,-48(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm14,%ymm14
+ vpinsrd $1,-48(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpandn %ymm0,%ymm3,%ymm6
+ vpand %ymm4,%ymm3,%ymm5
+
+ vmovdqa %ymm13,96-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vinserti128 $1,%xmm8,%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -44(%r12),%xmm10
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -44(%r8),%xmm8
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpshufb %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vmovd -44(%r13),%xmm7
+ vmovd -44(%r9),%xmm6
+ vpinsrd $1,-44(%r14),%xmm10,%xmm10
+ vpinsrd $1,-44(%r10),%xmm8,%xmm8
+ vpinsrd $1,-44(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm10,%ymm10
+ vpinsrd $1,-44(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpandn %ymm4,%ymm2,%ymm6
+ vpand %ymm3,%ymm2,%ymm5
+
+ vmovdqa %ymm14,128-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vinserti128 $1,%xmm8,%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -40(%r12),%xmm11
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -40(%r8),%xmm8
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpshufb %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vmovd -40(%r13),%xmm7
+ vmovd -40(%r9),%xmm6
+ vpinsrd $1,-40(%r14),%xmm11,%xmm11
+ vpinsrd $1,-40(%r10),%xmm8,%xmm8
+ vpinsrd $1,-40(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm11,%ymm11
+ vpinsrd $1,-40(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpandn %ymm3,%ymm1,%ymm6
+ vpand %ymm2,%ymm1,%ymm5
+
+ vmovdqa %ymm10,160-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vinserti128 $1,%xmm8,%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -36(%r12),%xmm12
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -36(%r8),%xmm8
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpshufb %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vmovd -36(%r13),%xmm7
+ vmovd -36(%r9),%xmm6
+ vpinsrd $1,-36(%r14),%xmm12,%xmm12
+ vpinsrd $1,-36(%r10),%xmm8,%xmm8
+ vpinsrd $1,-36(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm12,%ymm12
+ vpinsrd $1,-36(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpandn %ymm2,%ymm0,%ymm6
+ vpand %ymm1,%ymm0,%ymm5
+
+ vmovdqa %ymm11,192-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vinserti128 $1,%xmm8,%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -32(%r12),%xmm13
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -32(%r8),%xmm8
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vmovd -32(%r13),%xmm7
+ vmovd -32(%r9),%xmm6
+ vpinsrd $1,-32(%r14),%xmm13,%xmm13
+ vpinsrd $1,-32(%r10),%xmm8,%xmm8
+ vpinsrd $1,-32(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm13,%ymm13
+ vpinsrd $1,-32(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpandn %ymm1,%ymm4,%ymm6
+ vpand %ymm0,%ymm4,%ymm5
+
+ vmovdqa %ymm12,224-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vinserti128 $1,%xmm8,%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -28(%r12),%xmm14
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -28(%r8),%xmm8
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpshufb %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vmovd -28(%r13),%xmm7
+ vmovd -28(%r9),%xmm6
+ vpinsrd $1,-28(%r14),%xmm14,%xmm14
+ vpinsrd $1,-28(%r10),%xmm8,%xmm8
+ vpinsrd $1,-28(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm14,%ymm14
+ vpinsrd $1,-28(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpandn %ymm0,%ymm3,%ymm6
+ vpand %ymm4,%ymm3,%ymm5
+
+ vmovdqa %ymm13,256-256-128(%rbx)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vinserti128 $1,%xmm8,%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -24(%r12),%xmm10
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -24(%r8),%xmm8
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpshufb %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vmovd -24(%r13),%xmm7
+ vmovd -24(%r9),%xmm6
+ vpinsrd $1,-24(%r14),%xmm10,%xmm10
+ vpinsrd $1,-24(%r10),%xmm8,%xmm8
+ vpinsrd $1,-24(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm10,%ymm10
+ vpinsrd $1,-24(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpandn %ymm4,%ymm2,%ymm6
+ vpand %ymm3,%ymm2,%ymm5
+
+ vmovdqa %ymm14,288-256-128(%rbx)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vinserti128 $1,%xmm8,%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -20(%r12),%xmm11
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -20(%r8),%xmm8
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpshufb %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vmovd -20(%r13),%xmm7
+ vmovd -20(%r9),%xmm6
+ vpinsrd $1,-20(%r14),%xmm11,%xmm11
+ vpinsrd $1,-20(%r10),%xmm8,%xmm8
+ vpinsrd $1,-20(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm11,%ymm11
+ vpinsrd $1,-20(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpandn %ymm3,%ymm1,%ymm6
+ vpand %ymm2,%ymm1,%ymm5
+
+ vmovdqa %ymm10,320-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vinserti128 $1,%xmm8,%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -16(%r12),%xmm12
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -16(%r8),%xmm8
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpshufb %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vmovd -16(%r13),%xmm7
+ vmovd -16(%r9),%xmm6
+ vpinsrd $1,-16(%r14),%xmm12,%xmm12
+ vpinsrd $1,-16(%r10),%xmm8,%xmm8
+ vpinsrd $1,-16(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm12,%ymm12
+ vpinsrd $1,-16(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpandn %ymm2,%ymm0,%ymm6
+ vpand %ymm1,%ymm0,%ymm5
+
+ vmovdqa %ymm11,352-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vinserti128 $1,%xmm8,%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -12(%r12),%xmm13
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -12(%r8),%xmm8
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vmovd -12(%r13),%xmm7
+ vmovd -12(%r9),%xmm6
+ vpinsrd $1,-12(%r14),%xmm13,%xmm13
+ vpinsrd $1,-12(%r10),%xmm8,%xmm8
+ vpinsrd $1,-12(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm13,%ymm13
+ vpinsrd $1,-12(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpandn %ymm1,%ymm4,%ymm6
+ vpand %ymm0,%ymm4,%ymm5
+
+ vmovdqa %ymm12,384-256-128(%rbx)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vinserti128 $1,%xmm8,%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -8(%r12),%xmm14
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -8(%r8),%xmm8
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpshufb %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vmovd -8(%r13),%xmm7
+ vmovd -8(%r9),%xmm6
+ vpinsrd $1,-8(%r14),%xmm14,%xmm14
+ vpinsrd $1,-8(%r10),%xmm8,%xmm8
+ vpinsrd $1,-8(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm14,%ymm14
+ vpinsrd $1,-8(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpandn %ymm0,%ymm3,%ymm6
+ vpand %ymm4,%ymm3,%ymm5
+
+ vmovdqa %ymm13,416-256-128(%rbx)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vinserti128 $1,%xmm8,%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -4(%r12),%xmm10
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -4(%r8),%xmm8
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpshufb %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vmovdqa 0-128(%rax),%ymm11
+ vmovd -4(%r13),%xmm7
+ vmovd -4(%r9),%xmm6
+ vpinsrd $1,-4(%r14),%xmm10,%xmm10
+ vpinsrd $1,-4(%r10),%xmm8,%xmm8
+ vpinsrd $1,-4(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm10,%ymm10
+ vpinsrd $1,-4(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm0,%ymm0
+ prefetcht0 63(%r12)
+ vpslld $5,%ymm1,%ymm7
+ vpandn %ymm4,%ymm2,%ymm6
+ vpand %ymm3,%ymm2,%ymm5
+
+ vmovdqa %ymm14,448-256-128(%rbx)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vinserti128 $1,%xmm8,%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ prefetcht0 63(%r13)
+ vpxor %ymm6,%ymm5,%ymm5
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ prefetcht0 63(%r14)
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ prefetcht0 63(%r15)
+ vpshufb %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vmovdqa 32-128(%rax),%ymm12
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 64-128(%rax),%ymm13
+
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpandn %ymm3,%ymm1,%ymm6
+ prefetcht0 63(%r8)
+ vpand %ymm2,%ymm1,%ymm5
+
+ vmovdqa %ymm10,480-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 256-256-128(%rbx),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+ prefetcht0 63(%r9)
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ prefetcht0 63(%r10)
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ prefetcht0 63(%r11)
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 96-128(%rax),%ymm14
+
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpandn %ymm2,%ymm0,%ymm6
+
+ vpand %ymm1,%ymm0,%ymm5
+
+ vmovdqa %ymm11,0-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 288-256-128(%rbx),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 128-128(%rax),%ymm10
+
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpandn %ymm1,%ymm4,%ymm6
+
+ vpand %ymm0,%ymm4,%ymm5
+
+ vmovdqa %ymm12,32-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 320-256-128(%rbx),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 160-128(%rax),%ymm11
+
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpandn %ymm0,%ymm3,%ymm6
+
+ vpand %ymm4,%ymm3,%ymm5
+
+ vmovdqa %ymm13,64-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 352-256-128(%rbx),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 192-128(%rax),%ymm12
+
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpandn %ymm4,%ymm2,%ymm6
+
+ vpand %ymm3,%ymm2,%ymm5
+
+ vmovdqa %ymm14,96-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 384-256-128(%rbx),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vmovdqa 0(%rbp),%ymm15
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 224-128(%rax),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,128-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 416-256-128(%rbx),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 256-256-128(%rbx),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,160-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 448-256-128(%rbx),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 288-256-128(%rbx),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vmovdqa %ymm12,192-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 480-256-128(%rbx),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 320-256-128(%rbx),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vmovdqa %ymm13,224-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 0-128(%rax),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 352-256-128(%rbx),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vmovdqa %ymm14,256-256-128(%rbx)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 32-128(%rax),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 384-256-128(%rbx),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,288-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 64-128(%rax),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 416-256-128(%rbx),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,320-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 96-128(%rax),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 448-256-128(%rbx),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vmovdqa %ymm12,352-256-128(%rbx)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 128-128(%rax),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 480-256-128(%rbx),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vmovdqa %ymm13,384-256-128(%rbx)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 160-128(%rax),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 0-128(%rax),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vmovdqa %ymm14,416-256-128(%rbx)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 192-128(%rax),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 32-128(%rax),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,448-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 224-128(%rax),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 64-128(%rax),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,480-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 256-256-128(%rbx),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 96-128(%rax),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vmovdqa %ymm12,0-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 288-256-128(%rbx),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 128-128(%rax),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vmovdqa %ymm13,32-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 320-256-128(%rbx),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 160-128(%rax),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vmovdqa %ymm14,64-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 352-256-128(%rbx),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 192-128(%rax),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,96-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 384-256-128(%rbx),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 224-128(%rax),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,128-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 416-256-128(%rbx),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 256-256-128(%rbx),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vmovdqa %ymm12,160-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 448-256-128(%rbx),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 288-256-128(%rbx),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vmovdqa %ymm13,192-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 480-256-128(%rbx),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 320-256-128(%rbx),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vmovdqa %ymm14,224-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 0-128(%rax),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vmovdqa 32(%rbp),%ymm15
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 352-256-128(%rbx),%ymm13
+
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpand %ymm2,%ymm3,%ymm6
+ vpxor 32-128(%rax),%ymm11,%ymm11
+
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm3,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vmovdqu %ymm10,256-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm11,%ymm9
+ vpand %ymm1,%ymm5,%ymm5
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 384-256-128(%rbx),%ymm14
+
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpand %ymm1,%ymm2,%ymm6
+ vpxor 64-128(%rax),%ymm12,%ymm12
+
+ vpaddd %ymm6,%ymm3,%ymm3
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm2,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vmovdqu %ymm11,288-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm12,%ymm9
+ vpand %ymm0,%ymm5,%ymm5
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 416-256-128(%rbx),%ymm10
+
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpand %ymm0,%ymm1,%ymm6
+ vpxor 96-128(%rax),%ymm13,%ymm13
+
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm1,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vmovdqu %ymm12,320-256-128(%rbx)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm13,%ymm9
+ vpand %ymm4,%ymm5,%ymm5
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 448-256-128(%rbx),%ymm11
+
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpand %ymm4,%ymm0,%ymm6
+ vpxor 128-128(%rax),%ymm14,%ymm14
+
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm0,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vmovdqu %ymm13,352-256-128(%rbx)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm14,%ymm9
+ vpand %ymm3,%ymm5,%ymm5
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 480-256-128(%rbx),%ymm12
+
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpand %ymm3,%ymm4,%ymm6
+ vpxor 160-128(%rax),%ymm10,%ymm10
+
+ vpaddd %ymm6,%ymm0,%ymm0
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm4,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vmovdqu %ymm14,384-256-128(%rbx)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm10,%ymm9
+ vpand %ymm2,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 0-128(%rax),%ymm13
+
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpand %ymm2,%ymm3,%ymm6
+ vpxor 192-128(%rax),%ymm11,%ymm11
+
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm3,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vmovdqu %ymm10,416-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm11,%ymm9
+ vpand %ymm1,%ymm5,%ymm5
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 32-128(%rax),%ymm14
+
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpand %ymm1,%ymm2,%ymm6
+ vpxor 224-128(%rax),%ymm12,%ymm12
+
+ vpaddd %ymm6,%ymm3,%ymm3
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm2,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vmovdqu %ymm11,448-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm12,%ymm9
+ vpand %ymm0,%ymm5,%ymm5
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 64-128(%rax),%ymm10
+
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpand %ymm0,%ymm1,%ymm6
+ vpxor 256-256-128(%rbx),%ymm13,%ymm13
+
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm1,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vmovdqu %ymm12,480-256-128(%rbx)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm13,%ymm9
+ vpand %ymm4,%ymm5,%ymm5
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 96-128(%rax),%ymm11
+
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpand %ymm4,%ymm0,%ymm6
+ vpxor 288-256-128(%rbx),%ymm14,%ymm14
+
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm0,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vmovdqu %ymm13,0-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm14,%ymm9
+ vpand %ymm3,%ymm5,%ymm5
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 128-128(%rax),%ymm12
+
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpand %ymm3,%ymm4,%ymm6
+ vpxor 320-256-128(%rbx),%ymm10,%ymm10
+
+ vpaddd %ymm6,%ymm0,%ymm0
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm4,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vmovdqu %ymm14,32-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm10,%ymm9
+ vpand %ymm2,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 160-128(%rax),%ymm13
+
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpand %ymm2,%ymm3,%ymm6
+ vpxor 352-256-128(%rbx),%ymm11,%ymm11
+
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm3,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vmovdqu %ymm10,64-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm11,%ymm9
+ vpand %ymm1,%ymm5,%ymm5
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 192-128(%rax),%ymm14
+
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpand %ymm1,%ymm2,%ymm6
+ vpxor 384-256-128(%rbx),%ymm12,%ymm12
+
+ vpaddd %ymm6,%ymm3,%ymm3
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm2,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vmovdqu %ymm11,96-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm12,%ymm9
+ vpand %ymm0,%ymm5,%ymm5
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 224-128(%rax),%ymm10
+
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpand %ymm0,%ymm1,%ymm6
+ vpxor 416-256-128(%rbx),%ymm13,%ymm13
+
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm1,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vmovdqu %ymm12,128-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm13,%ymm9
+ vpand %ymm4,%ymm5,%ymm5
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 256-256-128(%rbx),%ymm11
+
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpand %ymm4,%ymm0,%ymm6
+ vpxor 448-256-128(%rbx),%ymm14,%ymm14
+
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm0,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vmovdqu %ymm13,160-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm14,%ymm9
+ vpand %ymm3,%ymm5,%ymm5
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 288-256-128(%rbx),%ymm12
+
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpand %ymm3,%ymm4,%ymm6
+ vpxor 480-256-128(%rbx),%ymm10,%ymm10
+
+ vpaddd %ymm6,%ymm0,%ymm0
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm4,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vmovdqu %ymm14,192-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm10,%ymm9
+ vpand %ymm2,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 320-256-128(%rbx),%ymm13
+
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpand %ymm2,%ymm3,%ymm6
+ vpxor 0-128(%rax),%ymm11,%ymm11
+
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm3,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vmovdqu %ymm10,224-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm11,%ymm9
+ vpand %ymm1,%ymm5,%ymm5
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 352-256-128(%rbx),%ymm14
+
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpand %ymm1,%ymm2,%ymm6
+ vpxor 32-128(%rax),%ymm12,%ymm12
+
+ vpaddd %ymm6,%ymm3,%ymm3
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm2,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vmovdqu %ymm11,256-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm12,%ymm9
+ vpand %ymm0,%ymm5,%ymm5
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 384-256-128(%rbx),%ymm10
+
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpand %ymm0,%ymm1,%ymm6
+ vpxor 64-128(%rax),%ymm13,%ymm13
+
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm1,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vmovdqu %ymm12,288-256-128(%rbx)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm13,%ymm9
+ vpand %ymm4,%ymm5,%ymm5
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 416-256-128(%rbx),%ymm11
+
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpand %ymm4,%ymm0,%ymm6
+ vpxor 96-128(%rax),%ymm14,%ymm14
+
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm0,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vmovdqu %ymm13,320-256-128(%rbx)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm14,%ymm9
+ vpand %ymm3,%ymm5,%ymm5
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 448-256-128(%rbx),%ymm12
+
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpand %ymm3,%ymm4,%ymm6
+ vpxor 128-128(%rax),%ymm10,%ymm10
+
+ vpaddd %ymm6,%ymm0,%ymm0
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm4,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vmovdqu %ymm14,352-256-128(%rbx)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm10,%ymm9
+ vpand %ymm2,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vmovdqa 64(%rbp),%ymm15
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 480-256-128(%rbx),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,384-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 160-128(%rax),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 0-128(%rax),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,416-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 192-128(%rax),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 32-128(%rax),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vmovdqa %ymm12,448-256-128(%rbx)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 224-128(%rax),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 64-128(%rax),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vmovdqa %ymm13,480-256-128(%rbx)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 256-256-128(%rbx),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 96-128(%rax),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vmovdqa %ymm14,0-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 288-256-128(%rbx),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 128-128(%rax),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,32-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 320-256-128(%rbx),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 160-128(%rax),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,64-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 352-256-128(%rbx),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 192-128(%rax),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vmovdqa %ymm12,96-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 384-256-128(%rbx),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 224-128(%rax),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vmovdqa %ymm13,128-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 416-256-128(%rbx),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 256-256-128(%rbx),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vmovdqa %ymm14,160-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 448-256-128(%rbx),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 288-256-128(%rbx),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,192-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 480-256-128(%rbx),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 320-256-128(%rbx),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,224-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 0-128(%rax),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 352-256-128(%rbx),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 32-128(%rax),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 384-256-128(%rbx),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 64-128(%rax),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 416-256-128(%rbx),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 96-128(%rax),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 448-256-128(%rbx),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 128-128(%rax),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 480-256-128(%rbx),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 160-128(%rax),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 0-128(%rax),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 192-128(%rax),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 32-128(%rax),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 224-128(%rax),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+
+ vpsrld $27,%ymm1,%ymm8
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor %ymm3,%ymm5,%ymm5
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm6,%ymm2,%ymm2
+ movl $1,%ecx
+ leaq 512(%rsp),%rbx
+ cmpl 0(%rbx),%ecx
+ cmovgeq %rbp,%r12
+ cmpl 4(%rbx),%ecx
+ cmovgeq %rbp,%r13
+ cmpl 8(%rbx),%ecx
+ cmovgeq %rbp,%r14
+ cmpl 12(%rbx),%ecx
+ cmovgeq %rbp,%r15
+ cmpl 16(%rbx),%ecx
+ cmovgeq %rbp,%r8
+ cmpl 20(%rbx),%ecx
+ cmovgeq %rbp,%r9
+ cmpl 24(%rbx),%ecx
+ cmovgeq %rbp,%r10
+ cmpl 28(%rbx),%ecx
+ cmovgeq %rbp,%r11
+ vmovdqu (%rbx),%ymm5
+ vpxor %ymm7,%ymm7,%ymm7
+ vmovdqa %ymm5,%ymm6
+ vpcmpgtd %ymm7,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm5,%ymm5
+
+ vpand %ymm6,%ymm0,%ymm0
+ vpand %ymm6,%ymm1,%ymm1
+ vpaddd 0(%rdi),%ymm0,%ymm0
+ vpand %ymm6,%ymm2,%ymm2
+ vpaddd 32(%rdi),%ymm1,%ymm1
+ vpand %ymm6,%ymm3,%ymm3
+ vpaddd 64(%rdi),%ymm2,%ymm2
+ vpand %ymm6,%ymm4,%ymm4
+ vpaddd 96(%rdi),%ymm3,%ymm3
+ vpaddd 128(%rdi),%ymm4,%ymm4
+ vmovdqu %ymm0,0(%rdi)
+ vmovdqu %ymm1,32(%rdi)
+ vmovdqu %ymm2,64(%rdi)
+ vmovdqu %ymm3,96(%rdi)
+ vmovdqu %ymm4,128(%rdi)
+
+ vmovdqu %ymm5,(%rbx)
+ leaq 256+128(%rsp),%rbx
+ vmovdqu 96(%rbp),%ymm9
+ decl %edx
+ jnz .Loop_avx2
+
+
+
+
+
+
+
+.Ldone_avx2:
+ movq 544(%rsp),%rax
+.cfi_def_cfa %rax,8
+ vzeroupper
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha1_multi_block_avx2,.-sha1_multi_block_avx2
.align 256
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
diff --git a/secure/lib/libcrypto/amd64/sha1-x86_64.S b/secure/lib/libcrypto/amd64/sha1-x86_64.S
index cf36e17d3121..342db5203d16 100644
--- a/secure/lib/libcrypto/amd64/sha1-x86_64.S
+++ b/secure/lib/libcrypto/amd64/sha1-x86_64.S
@@ -15,6 +15,14 @@ sha1_block_data_order:
jz .Lialu
testl $536870912,%r10d
jnz _shaext_shortcut
+ andl $296,%r10d
+ cmpl $296,%r10d
+ je _avx2_shortcut
+ andl $268435456,%r8d
+ andl $1073741824,%r9d
+ orl %r9d,%r8d
+ cmpl $1342177280,%r8d
+ je _avx_shortcut
jmp _ssse3_shortcut
.align 16
@@ -2606,6 +2614,2827 @@ _ssse3_shortcut:
.byte 0xf3,0xc3
.cfi_endproc
.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
+.type sha1_block_data_order_avx,@function
+.align 16
+sha1_block_data_order_avx:
+_avx_shortcut:
+.cfi_startproc
+ movq %rsp,%r11
+.cfi_def_cfa_register %r11
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ leaq -64(%rsp),%rsp
+ vzeroupper
+ andq $-64,%rsp
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ shlq $6,%r10
+ addq %r9,%r10
+ leaq K_XX_XX+64(%rip),%r14
+
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl %ebx,%esi
+ movl 16(%r8),%ebp
+ movl %ecx,%edi
+ xorl %edx,%edi
+ andl %edi,%esi
+
+ vmovdqa 64(%r14),%xmm6
+ vmovdqa -64(%r14),%xmm11
+ vmovdqu 0(%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r9
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm11,%xmm0,%xmm4
+ vpaddd %xmm11,%xmm1,%xmm5
+ vpaddd %xmm11,%xmm2,%xmm6
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ jmp .Loop_avx
+.align 16
+.Loop_avx:
+ shrdl $2,%ebx,%ebx
+ xorl %edx,%esi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ movl %eax,%edi
+ addl 0(%rsp),%ebp
+ vpaddd %xmm3,%xmm11,%xmm9
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrldq $4,%xmm3,%xmm8
+ addl %esi,%ebp
+ andl %ebx,%edi
+ vpxor %xmm0,%xmm4,%xmm4
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm2,%xmm8,%xmm8
+ shrdl $7,%eax,%eax
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 4(%rsp),%edx
+ vpxor %xmm8,%xmm4,%xmm4
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vmovdqa %xmm9,48(%rsp)
+ addl %edi,%edx
+ andl %eax,%esi
+ vpsrld $31,%xmm4,%xmm8
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%esi
+ vpslldq $12,%xmm4,%xmm10
+ vpaddd %xmm4,%xmm4,%xmm4
+ movl %edx,%edi
+ addl 8(%rsp),%ecx
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm4,%xmm4
+ addl %esi,%ecx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm4,%xmm4
+ shrdl $7,%edx,%edx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 12(%rsp),%ebx
+ vpxor %xmm10,%xmm4,%xmm4
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ andl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%esi
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ movl %ebx,%edi
+ addl 16(%rsp),%eax
+ vpaddd %xmm4,%xmm11,%xmm9
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrldq $4,%xmm4,%xmm8
+ addl %esi,%eax
+ andl %ecx,%edi
+ vpxor %xmm1,%xmm5,%xmm5
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm3,%xmm8,%xmm8
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 20(%rsp),%ebp
+ vpxor %xmm8,%xmm5,%xmm5
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%ebp
+ andl %ebx,%esi
+ vpsrld $31,%xmm5,%xmm8
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ vpslldq $12,%xmm5,%xmm10
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %ebp,%edi
+ addl 24(%rsp),%edx
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm5,%xmm5
+ addl %esi,%edx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm5,%xmm5
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%edi
+ movl %edx,%esi
+ addl 28(%rsp),%ecx
+ vpxor %xmm10,%xmm5,%xmm5
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vmovdqa -32(%r14),%xmm11
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ movl %ecx,%edi
+ addl 32(%rsp),%ebx
+ vpaddd %xmm5,%xmm11,%xmm9
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ vpsrldq $4,%xmm5,%xmm8
+ addl %esi,%ebx
+ andl %edx,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ vpxor %xmm4,%xmm8,%xmm8
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ addl 36(%rsp),%eax
+ vpxor %xmm8,%xmm6,%xmm6
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vmovdqa %xmm9,16(%rsp)
+ addl %edi,%eax
+ andl %ecx,%esi
+ vpsrld $31,%xmm6,%xmm8
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%esi
+ vpslldq $12,%xmm6,%xmm10
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%edi
+ addl 40(%rsp),%ebp
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm6,%xmm6
+ addl %esi,%ebp
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm6,%xmm6
+ shrdl $7,%eax,%eax
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 44(%rsp),%edx
+ vpxor %xmm10,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%esi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ movl %edx,%edi
+ addl 48(%rsp),%ecx
+ vpaddd %xmm6,%xmm11,%xmm9
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpsrldq $4,%xmm6,%xmm8
+ addl %esi,%ecx
+ andl %ebp,%edi
+ vpxor %xmm3,%xmm7,%xmm7
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpxor %xmm5,%xmm8,%xmm8
+ shrdl $7,%edx,%edx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 52(%rsp),%ebx
+ vpxor %xmm8,%xmm7,%xmm7
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%ebx
+ andl %edx,%esi
+ vpsrld $31,%xmm7,%xmm8
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%esi
+ vpslldq $12,%xmm7,%xmm10
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%edi
+ addl 56(%rsp),%eax
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm7,%xmm7
+ addl %esi,%eax
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm7,%xmm7
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 60(%rsp),%ebp
+ vpxor %xmm10,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ addl 0(%rsp),%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm7,%xmm11,%xmm9
+ addl %esi,%edx
+ andl %eax,%edi
+ vpxor %xmm8,%xmm0,%xmm0
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%edi
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ movl %edx,%esi
+ addl 4(%rsp),%ecx
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpslld $2,%xmm0,%xmm0
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ addl 8(%rsp),%ebx
+ vpor %xmm8,%xmm0,%xmm0
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ vpaddd %xmm0,%xmm11,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm8,%xmm1,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm1,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpor %xmm8,%xmm1,%xmm1
+ addl 28(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ addl %esi,%eax
+ xorl %edx,%edi
+ vpaddd %xmm1,%xmm11,%xmm9
+ vmovdqa 0(%r14),%xmm11
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm8,%xmm2,%xmm2
+ addl 36(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpor %xmm8,%xmm2,%xmm2
+ addl 44(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ vpaddd %xmm2,%xmm11,%xmm9
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ vpxor %xmm0,%xmm4,%xmm4
+ addl 0(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %esi,%ecx
+ xorl %eax,%edi
+ vpaddd %xmm3,%xmm11,%xmm9
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpxor %xmm8,%xmm4,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpor %xmm8,%xmm4,%xmm4
+ addl 12(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpxor %xmm6,%xmm5,%xmm5
+ addl %esi,%edx
+ xorl %ebx,%edi
+ vpaddd %xmm4,%xmm11,%xmm9
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpxor %xmm8,%xmm5,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpor %xmm8,%xmm5,%xmm5
+ addl 28(%rsp),%eax
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ vpxor %xmm2,%xmm6,%xmm6
+ addl 32(%rsp),%ebp
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %eax,%edi
+ xorl %ecx,%esi
+ vpaddd %xmm5,%xmm11,%xmm9
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ vpxor %xmm8,%xmm6,%xmm6
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 36(%rsp),%edx
+ vpsrld $30,%xmm6,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%esi
+ vpslld $2,%xmm6,%xmm6
+ xorl %ebx,%edi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 40(%rsp),%ecx
+ andl %eax,%esi
+ vpor %xmm8,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%edi
+ xorl %eax,%esi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 44(%rsp),%ebx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ xorl %ebp,%edi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ vpxor %xmm3,%xmm7,%xmm7
+ addl 48(%rsp),%eax
+ andl %edx,%esi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ vpxor %xmm0,%xmm7,%xmm7
+ movl %ebx,%edi
+ xorl %edx,%esi
+ vpaddd %xmm6,%xmm11,%xmm9
+ vmovdqa 32(%r14),%xmm11
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vpxor %xmm8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%rsp),%ebp
+ vpsrld $30,%xmm7,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ vpslld $2,%xmm7,%xmm7
+ xorl %ecx,%edi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 56(%rsp),%edx
+ andl %ebx,%esi
+ vpor %xmm8,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 60(%rsp),%ecx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%esi
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ addl 0(%rsp),%ebx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ vpaddd %xmm7,%xmm11,%xmm9
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ vpxor %xmm8,%xmm0,%xmm0
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 4(%rsp),%eax
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ andl %edx,%edi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ vpslld $2,%xmm0,%xmm0
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%rsp),%ebp
+ andl %ecx,%esi
+ vpor %xmm8,%xmm0,%xmm0
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%edi
+ xorl %ecx,%esi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 12(%rsp),%edx
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%esi
+ xorl %ebx,%edi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ vpxor %xmm2,%xmm1,%xmm1
+ movl %edx,%edi
+ xorl %eax,%esi
+ vpaddd %xmm0,%xmm11,%xmm9
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vpxor %xmm8,%xmm1,%xmm1
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 20(%rsp),%ebx
+ vpsrld $30,%xmm1,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ vpslld $2,%xmm1,%xmm1
+ xorl %ebp,%edi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 24(%rsp),%eax
+ andl %edx,%esi
+ vpor %xmm8,%xmm1,%xmm1
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%edi
+ xorl %edx,%esi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%rsp),%ebp
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ xorl %ecx,%edi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vpxor %xmm3,%xmm2,%xmm2
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ vpaddd %xmm1,%xmm11,%xmm9
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ vpxor %xmm8,%xmm2,%xmm2
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 36(%rsp),%ecx
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ andl %eax,%edi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%esi
+ vpslld $2,%xmm2,%xmm2
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 40(%rsp),%ebx
+ andl %ebp,%esi
+ vpor %xmm8,%xmm2,%xmm2
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 44(%rsp),%eax
+ andl %edx,%edi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ vpaddd %xmm2,%xmm11,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 0(%rsp),%eax
+ vpaddd %xmm3,%xmm11,%xmm9
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vmovdqa %xmm9,48(%rsp)
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 4(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 8(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 12(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ cmpq %r10,%r9
+ je .Ldone_avx
+ vmovdqa 64(%r14),%xmm6
+ vmovdqa -64(%r14),%xmm11
+ vmovdqu 0(%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r9
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ vpshufb %xmm6,%xmm1,%xmm1
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm11,%xmm0,%xmm4
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vmovdqa %xmm4,0(%rsp)
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ vpshufb %xmm6,%xmm2,%xmm2
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpaddd %xmm11,%xmm1,%xmm5
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vmovdqa %xmm5,16(%rsp)
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ vpshufb %xmm6,%xmm3,%xmm3
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm11,%xmm2,%xmm6
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vmovdqa %xmm6,32(%rsp)
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ addl 12(%r8),%edx
+ movl %eax,0(%r8)
+ addl 16(%r8),%ebp
+ movl %esi,4(%r8)
+ movl %esi,%ebx
+ movl %ecx,8(%r8)
+ movl %ecx,%edi
+ movl %edx,12(%r8)
+ xorl %edx,%edi
+ movl %ebp,16(%r8)
+ andl %edi,%esi
+ jmp .Loop_avx
+
+.align 16
+.Ldone_avx:
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vzeroupper
+
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ movl %eax,0(%r8)
+ addl 12(%r8),%edx
+ movl %esi,4(%r8)
+ addl 16(%r8),%ebp
+ movl %ecx,8(%r8)
+ movl %edx,12(%r8)
+ movl %ebp,16(%r8)
+ movq -40(%r11),%r14
+.cfi_restore %r14
+ movq -32(%r11),%r13
+.cfi_restore %r13
+ movq -24(%r11),%r12
+.cfi_restore %r12
+ movq -16(%r11),%rbp
+.cfi_restore %rbp
+ movq -8(%r11),%rbx
+.cfi_restore %rbx
+ leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
+.type sha1_block_data_order_avx2,@function
+.align 16
+sha1_block_data_order_avx2:
+_avx2_shortcut:
+.cfi_startproc
+ movq %rsp,%r11
+.cfi_def_cfa_register %r11
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ vzeroupper
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ leaq -640(%rsp),%rsp
+ shlq $6,%r10
+ leaq 64(%r9),%r13
+ andq $-128,%rsp
+ addq %r9,%r10
+ leaq K_XX_XX+64(%rip),%r14
+
+ movl 0(%r8),%eax
+ cmpq %r10,%r13
+ cmovaeq %r9,%r13
+ movl 4(%r8),%ebp
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl 16(%r8),%esi
+ vmovdqu 64(%r14),%ymm6
+
+ vmovdqu (%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ leaq 64(%r9),%r9
+ vinserti128 $1,(%r13),%ymm0,%ymm0
+ vinserti128 $1,16(%r13),%ymm1,%ymm1
+ vpshufb %ymm6,%ymm0,%ymm0
+ vinserti128 $1,32(%r13),%ymm2,%ymm2
+ vpshufb %ymm6,%ymm1,%ymm1
+ vinserti128 $1,48(%r13),%ymm3,%ymm3
+ vpshufb %ymm6,%ymm2,%ymm2
+ vmovdqu -64(%r14),%ymm11
+ vpshufb %ymm6,%ymm3,%ymm3
+
+ vpaddd %ymm11,%ymm0,%ymm4
+ vpaddd %ymm11,%ymm1,%ymm5
+ vmovdqu %ymm4,0(%rsp)
+ vpaddd %ymm11,%ymm2,%ymm6
+ vmovdqu %ymm5,32(%rsp)
+ vpaddd %ymm11,%ymm3,%ymm7
+ vmovdqu %ymm6,64(%rsp)
+ vmovdqu %ymm7,96(%rsp)
+ vpalignr $8,%ymm0,%ymm1,%ymm4
+ vpsrldq $4,%ymm3,%ymm8
+ vpxor %ymm0,%ymm4,%ymm4
+ vpxor %ymm2,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $31,%ymm4,%ymm8
+ vpslldq $12,%ymm4,%ymm10
+ vpaddd %ymm4,%ymm4,%ymm4
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm4,%ymm4
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm4,%ymm4
+ vpxor %ymm10,%ymm4,%ymm4
+ vpaddd %ymm11,%ymm4,%ymm9
+ vmovdqu %ymm9,128(%rsp)
+ vpalignr $8,%ymm1,%ymm2,%ymm5
+ vpsrldq $4,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm3,%ymm8,%ymm8
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $31,%ymm5,%ymm8
+ vmovdqu -32(%r14),%ymm11
+ vpslldq $12,%ymm5,%ymm10
+ vpaddd %ymm5,%ymm5,%ymm5
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm5,%ymm5
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm10,%ymm5,%ymm5
+ vpaddd %ymm11,%ymm5,%ymm9
+ vmovdqu %ymm9,160(%rsp)
+ vpalignr $8,%ymm2,%ymm3,%ymm6
+ vpsrldq $4,%ymm5,%ymm8
+ vpxor %ymm2,%ymm6,%ymm6
+ vpxor %ymm4,%ymm8,%ymm8
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $31,%ymm6,%ymm8
+ vpslldq $12,%ymm6,%ymm10
+ vpaddd %ymm6,%ymm6,%ymm6
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm6,%ymm6
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm6,%ymm6
+ vpxor %ymm10,%ymm6,%ymm6
+ vpaddd %ymm11,%ymm6,%ymm9
+ vmovdqu %ymm9,192(%rsp)
+ vpalignr $8,%ymm3,%ymm4,%ymm7
+ vpsrldq $4,%ymm6,%ymm8
+ vpxor %ymm3,%ymm7,%ymm7
+ vpxor %ymm5,%ymm8,%ymm8
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm7,%ymm8
+ vpslldq $12,%ymm7,%ymm10
+ vpaddd %ymm7,%ymm7,%ymm7
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm7,%ymm7
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm7,%ymm7
+ vpxor %ymm10,%ymm7,%ymm7
+ vpaddd %ymm11,%ymm7,%ymm9
+ vmovdqu %ymm9,224(%rsp)
+ leaq 128(%rsp),%r13
+ jmp .Loop_avx2
+.align 32
+.Loop_avx2:
+ rorxl $2,%ebp,%ebx
+ andnl %edx,%ebp,%edi
+ andl %ecx,%ebp
+ xorl %edi,%ebp
+ jmp .Lalign32_1
+.align 32
+.Lalign32_1:
+ vpalignr $8,%ymm6,%ymm7,%ymm8
+ vpxor %ymm4,%ymm0,%ymm0
+ addl -128(%r13),%esi
+ andnl %ecx,%eax,%edi
+ vpxor %ymm1,%ymm0,%ymm0
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpxor %ymm8,%ymm0,%ymm0
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ vpsrld $30,%ymm0,%ymm8
+ vpslld $2,%ymm0,%ymm0
+ addl -124(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ vpor %ymm8,%ymm0,%ymm0
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -120(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ vpaddd %ymm11,%ymm0,%ymm9
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ vmovdqu %ymm9,256(%rsp)
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -116(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -96(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ vpalignr $8,%ymm7,%ymm0,%ymm8
+ vpxor %ymm5,%ymm1,%ymm1
+ addl -92(%r13),%eax
+ andnl %edx,%ebp,%edi
+ vpxor %ymm2,%ymm1,%ymm1
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ vpxor %ymm8,%ymm1,%ymm1
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ vpsrld $30,%ymm1,%ymm8
+ vpslld $2,%ymm1,%ymm1
+ addl -88(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ vpor %ymm8,%ymm1,%ymm1
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -84(%r13),%edx
+ andnl %ebx,%esi,%edi
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ vmovdqu %ymm9,288(%rsp)
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -64(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -60(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ vpalignr $8,%ymm0,%ymm1,%ymm8
+ vpxor %ymm6,%ymm2,%ymm2
+ addl -56(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ vpxor %ymm3,%ymm2,%ymm2
+ vmovdqu 0(%r14),%ymm11
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpxor %ymm8,%ymm2,%ymm2
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ vpsrld $30,%ymm2,%ymm8
+ vpslld $2,%ymm2,%ymm2
+ addl -52(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ vpor %ymm8,%ymm2,%ymm2
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -32(%r13),%esi
+ andnl %ecx,%eax,%edi
+ vpaddd %ymm11,%ymm2,%ymm9
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ vmovdqu %ymm9,320(%rsp)
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -28(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -24(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ vpalignr $8,%ymm1,%ymm2,%ymm8
+ vpxor %ymm7,%ymm3,%ymm3
+ addl -20(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ vpxor %ymm4,%ymm3,%ymm3
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpxor %ymm8,%ymm3,%ymm3
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ vpsrld $30,%ymm3,%ymm8
+ vpslld $2,%ymm3,%ymm3
+ addl 0(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ vpor %ymm8,%ymm3,%ymm3
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl 4(%r13),%eax
+ andnl %edx,%ebp,%edi
+ vpaddd %ymm11,%ymm3,%ymm9
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ vmovdqu %ymm9,352(%rsp)
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl 8(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl 12(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vpalignr $8,%ymm2,%ymm3,%ymm8
+ vpxor %ymm0,%ymm4,%ymm4
+ addl 32(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpxor %ymm8,%ymm4,%ymm4
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 36(%r13),%ebx
+ vpsrld $30,%ymm4,%ymm8
+ vpslld $2,%ymm4,%ymm4
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vpor %ymm8,%ymm4,%ymm4
+ addl 40(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpaddd %ymm11,%ymm4,%ymm9
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 44(%r13),%eax
+ vmovdqu %ymm9,384(%rsp)
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpalignr $8,%ymm3,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ addl 68(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm6,%ymm5,%ymm5
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ vpxor %ymm8,%ymm5,%ymm5
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 72(%r13),%ecx
+ vpsrld $30,%ymm5,%ymm8
+ vpslld $2,%ymm5,%ymm5
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ vpor %ymm8,%ymm5,%ymm5
+ addl 76(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpaddd %ymm11,%ymm5,%ymm9
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 96(%r13),%ebp
+ vmovdqu %ymm9,416(%rsp)
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 100(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpalignr $8,%ymm4,%ymm5,%ymm8
+ vpxor %ymm2,%ymm6,%ymm6
+ addl 104(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpxor %ymm7,%ymm6,%ymm6
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ vpxor %ymm8,%ymm6,%ymm6
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 108(%r13),%edx
+ leaq 256(%r13),%r13
+ vpsrld $30,%ymm6,%ymm8
+ vpslld $2,%ymm6,%ymm6
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vpor %ymm8,%ymm6,%ymm6
+ addl -128(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpaddd %ymm11,%ymm6,%ymm9
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -124(%r13),%ebx
+ vmovdqu %ymm9,448(%rsp)
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -120(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpalignr $8,%ymm5,%ymm6,%ymm8
+ vpxor %ymm3,%ymm7,%ymm7
+ addl -116(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ vpxor %ymm0,%ymm7,%ymm7
+ vmovdqu 32(%r14),%ymm11
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ vpxor %ymm8,%ymm7,%ymm7
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -96(%r13),%esi
+ vpsrld $30,%ymm7,%ymm8
+ vpslld $2,%ymm7,%ymm7
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpor %ymm8,%ymm7,%ymm7
+ addl -92(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpaddd %ymm11,%ymm7,%ymm9
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -88(%r13),%ecx
+ vmovdqu %ymm9,480(%rsp)
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -84(%r13),%ebx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ jmp .Lalign32_2
+.align 32
+.Lalign32_2:
+ vpalignr $8,%ymm6,%ymm7,%ymm8
+ vpxor %ymm4,%ymm0,%ymm0
+ addl -64(%r13),%ebp
+ xorl %esi,%ecx
+ vpxor %ymm1,%ymm0,%ymm0
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ vpxor %ymm8,%ymm0,%ymm0
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ vpsrld $30,%ymm0,%ymm8
+ vpslld $2,%ymm0,%ymm0
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -60(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ vpor %ymm8,%ymm0,%ymm0
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ vpaddd %ymm11,%ymm0,%ymm9
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl -56(%r13),%esi
+ xorl %ecx,%ebp
+ vmovdqu %ymm9,512(%rsp)
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl -52(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl -32(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ vpalignr $8,%ymm7,%ymm0,%ymm8
+ vpxor %ymm5,%ymm1,%ymm1
+ addl -28(%r13),%ebx
+ xorl %eax,%edx
+ vpxor %ymm2,%ymm1,%ymm1
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ vpxor %ymm8,%ymm1,%ymm1
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vpsrld $30,%ymm1,%ymm8
+ vpslld $2,%ymm1,%ymm1
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl -24(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ vpor %ymm8,%ymm1,%ymm1
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -20(%r13),%eax
+ xorl %edx,%ebx
+ vmovdqu %ymm9,544(%rsp)
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 0(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl 4(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ vpalignr $8,%ymm0,%ymm1,%ymm8
+ vpxor %ymm6,%ymm2,%ymm2
+ addl 8(%r13),%ecx
+ xorl %ebp,%esi
+ vpxor %ymm3,%ymm2,%ymm2
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ vpxor %ymm8,%ymm2,%ymm2
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpsrld $30,%ymm2,%ymm8
+ vpslld $2,%ymm2,%ymm2
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 12(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ vpor %ymm8,%ymm2,%ymm2
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vpaddd %ymm11,%ymm2,%ymm9
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 32(%r13),%ebp
+ xorl %esi,%ecx
+ vmovdqu %ymm9,576(%rsp)
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 36(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 40(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ vpalignr $8,%ymm1,%ymm2,%ymm8
+ vpxor %ymm7,%ymm3,%ymm3
+ addl 44(%r13),%edx
+ xorl %ebx,%eax
+ vpxor %ymm4,%ymm3,%ymm3
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm8,%ymm3,%ymm3
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ vpsrld $30,%ymm3,%ymm8
+ vpslld $2,%ymm3,%ymm3
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl 64(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ vpor %ymm8,%ymm3,%ymm3
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpaddd %ymm11,%ymm3,%ymm9
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 68(%r13),%ebx
+ xorl %eax,%edx
+ vmovdqu %ymm9,608(%rsp)
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 72(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 76(%r13),%eax
+ xorl %edx,%ebx
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 100(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 104(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 108(%r13),%ebx
+ leaq 256(%r13),%r13
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -128(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -124(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -120(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -116(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -96(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -92(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -88(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -84(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -60(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -56(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -52(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -32(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -28(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -24(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -20(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ addl %r12d,%edx
+ leaq 128(%r9),%r13
+ leaq 128(%r9),%rdi
+ cmpq %r10,%r13
+ cmovaeq %r9,%r13
+
+
+ addl 0(%r8),%edx
+ addl 4(%r8),%esi
+ addl 8(%r8),%ebp
+ movl %edx,0(%r8)
+ addl 12(%r8),%ebx
+ movl %esi,4(%r8)
+ movl %edx,%eax
+ addl 16(%r8),%ecx
+ movl %ebp,%r12d
+ movl %ebp,8(%r8)
+ movl %ebx,%edx
+
+ movl %ebx,12(%r8)
+ movl %esi,%ebp
+ movl %ecx,16(%r8)
+
+ movl %ecx,%esi
+ movl %r12d,%ecx
+
+
+ cmpq %r10,%r9
+ je .Ldone_avx2
+ vmovdqu 64(%r14),%ymm6
+ cmpq %r10,%rdi
+ ja .Last_avx2
+
+ vmovdqu -64(%rdi),%xmm0
+ vmovdqu -48(%rdi),%xmm1
+ vmovdqu -32(%rdi),%xmm2
+ vmovdqu -16(%rdi),%xmm3
+ vinserti128 $1,0(%r13),%ymm0,%ymm0
+ vinserti128 $1,16(%r13),%ymm1,%ymm1
+ vinserti128 $1,32(%r13),%ymm2,%ymm2
+ vinserti128 $1,48(%r13),%ymm3,%ymm3
+ jmp .Last_avx2
+
+.align 32
+.Last_avx2:
+ leaq 128+16(%rsp),%r13
+ rorxl $2,%ebp,%ebx
+ andnl %edx,%ebp,%edi
+ andl %ecx,%ebp
+ xorl %edi,%ebp
+ subq $-128,%r9
+ addl -128(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -124(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -120(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -116(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -96(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl -92(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -88(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -84(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -64(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -60(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -56(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl -52(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -32(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -28(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -24(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -20(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl 0(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl 4(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl 8(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl 12(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 32(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 36(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 40(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 44(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vmovdqu -64(%r14),%ymm11
+ vpshufb %ymm6,%ymm0,%ymm0
+ addl 68(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 72(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 76(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 96(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 100(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpshufb %ymm6,%ymm1,%ymm1
+ vpaddd %ymm11,%ymm0,%ymm8
+ addl 104(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 108(%r13),%edx
+ leaq 256(%r13),%r13
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -128(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -124(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -120(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vmovdqu %ymm8,0(%rsp)
+ vpshufb %ymm6,%ymm2,%ymm2
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl -116(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -92(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -88(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -84(%r13),%ebx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ vmovdqu %ymm9,32(%rsp)
+ vpshufb %ymm6,%ymm3,%ymm3
+ vpaddd %ymm11,%ymm2,%ymm6
+ addl -64(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -60(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl -56(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl -52(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl -32(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ jmp .Lalign32_3
+.align 32
+.Lalign32_3:
+ vmovdqu %ymm6,64(%rsp)
+ vpaddd %ymm11,%ymm3,%ymm7
+ addl -28(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl -24(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -20(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 0(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl 4(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ vmovdqu %ymm7,96(%rsp)
+ addl 8(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 12(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 32(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 36(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 40(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ vpalignr $8,%ymm0,%ymm1,%ymm4
+ addl 44(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ vpsrldq $4,%ymm3,%ymm8
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpxor %ymm0,%ymm4,%ymm4
+ vpxor %ymm2,%ymm8,%ymm8
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpxor %ymm8,%ymm4,%ymm4
+ andl %edi,%esi
+ addl 64(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ vpsrld $31,%ymm4,%ymm8
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ vpslldq $12,%ymm4,%ymm10
+ vpaddd %ymm4,%ymm4,%ymm4
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm4,%ymm4
+ addl %r12d,%ecx
+ andl %edi,%edx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm4,%ymm4
+ addl 68(%r13),%ebx
+ xorl %eax,%edx
+ vpxor %ymm10,%ymm4,%ymm4
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ vpaddd %ymm11,%ymm4,%ymm9
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vmovdqu %ymm9,128(%rsp)
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 72(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 76(%r13),%eax
+ xorl %edx,%ebx
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpalignr $8,%ymm1,%ymm2,%ymm5
+ addl 96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpsrldq $4,%ymm4,%ymm8
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm3,%ymm8,%ymm8
+ addl 100(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm8,%ymm5,%ymm5
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpsrld $31,%ymm5,%ymm8
+ vmovdqu -32(%r14),%ymm11
+ xorl %ebx,%esi
+ addl 104(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ vpslldq $12,%ymm5,%ymm10
+ vpaddd %ymm5,%ymm5,%ymm5
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm5,%ymm5
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm5,%ymm5
+ xorl %ebp,%edx
+ addl 108(%r13),%ebx
+ leaq 256(%r13),%r13
+ vpxor %ymm10,%ymm5,%ymm5
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpaddd %ymm11,%ymm5,%ymm9
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vmovdqu %ymm9,160(%rsp)
+ addl -128(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpalignr $8,%ymm2,%ymm3,%ymm6
+ addl -124(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ vpsrldq $4,%ymm5,%ymm8
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpxor %ymm2,%ymm6,%ymm6
+ vpxor %ymm4,%ymm8,%ymm8
+ addl -120(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpxor %ymm8,%ymm6,%ymm6
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ vpsrld $31,%ymm6,%ymm8
+ xorl %ecx,%eax
+ addl -116(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpslldq $12,%ymm6,%ymm10
+ vpaddd %ymm6,%ymm6,%ymm6
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm6,%ymm6
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm6,%ymm6
+ xorl %ebx,%esi
+ addl -96(%r13),%ecx
+ vpxor %ymm10,%ymm6,%ymm6
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpaddd %ymm11,%ymm6,%ymm9
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ vmovdqu %ymm9,192(%rsp)
+ addl -92(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vpalignr $8,%ymm3,%ymm4,%ymm7
+ addl -88(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpsrldq $4,%ymm6,%ymm8
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpxor %ymm3,%ymm7,%ymm7
+ vpxor %ymm5,%ymm8,%ymm8
+ addl -84(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ vpxor %ymm8,%ymm7,%ymm7
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ vpsrld $31,%ymm7,%ymm8
+ xorl %edx,%ebp
+ addl -64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpslldq $12,%ymm7,%ymm10
+ vpaddd %ymm7,%ymm7,%ymm7
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm7,%ymm7
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm7,%ymm7
+ xorl %ecx,%eax
+ addl -60(%r13),%edx
+ vpxor %ymm10,%ymm7,%ymm7
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpaddd %ymm11,%ymm7,%ymm9
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vmovdqu %ymm9,224(%rsp)
+ addl -56(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -52(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -32(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -28(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -24(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -20(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ addl %r12d,%edx
+ leaq 128(%rsp),%r13
+
+
+ addl 0(%r8),%edx
+ addl 4(%r8),%esi
+ addl 8(%r8),%ebp
+ movl %edx,0(%r8)
+ addl 12(%r8),%ebx
+ movl %esi,4(%r8)
+ movl %edx,%eax
+ addl 16(%r8),%ecx
+ movl %ebp,%r12d
+ movl %ebp,8(%r8)
+ movl %ebx,%edx
+
+ movl %ebx,12(%r8)
+ movl %esi,%ebp
+ movl %ecx,16(%r8)
+
+ movl %ecx,%esi
+ movl %r12d,%ecx
+
+
+ cmpq %r10,%r9
+ jbe .Loop_avx2
+
+.Ldone_avx2:
+ vzeroupper
+ movq -40(%r11),%r14
+.cfi_restore %r14
+ movq -32(%r11),%r13
+.cfi_restore %r13
+ movq -24(%r11),%r12
+.cfi_restore %r12
+ movq -16(%r11),%rbp
+.cfi_restore %rbp
+ movq -8(%r11),%rbx
+.cfi_restore %rbx
+ leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
.align 64
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
diff --git a/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S b/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S
index 63dca42029ea..1c77e3d13a8b 100644
--- a/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S
+++ b/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S
@@ -12,6 +12,8 @@ sha256_multi_block:
movq OPENSSL_ia32cap_P+4(%rip),%rcx
btq $61,%rcx
jc _shaext_shortcut
+ testl $268435456,%ecx
+ jnz _avx_shortcut
movq %rsp,%rax
.cfi_def_cfa_register %rax
pushq %rbx
@@ -3125,6 +3127,4676 @@ _shaext_shortcut:
.byte 0xf3,0xc3
.cfi_endproc
.size sha256_multi_block_shaext,.-sha256_multi_block_shaext
+.type sha256_multi_block_avx,@function
+.align 32
+sha256_multi_block_avx:
+.cfi_startproc
+_avx_shortcut:
+ shrq $32,%rcx
+ cmpl $2,%edx
+ jb .Lavx
+ testl $32,%ecx
+ jnz _avx2_shortcut
+ jmp .Lavx
+.align 32
+.Lavx:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ subq $288,%rsp
+ andq $-256,%rsp
+ movq %rax,272(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08
+.Lbody_avx:
+ leaq K256+128(%rip),%rbp
+ leaq 256(%rsp),%rbx
+ leaq 128(%rdi),%rdi
+
+.Loop_grande_avx:
+ movl %edx,280(%rsp)
+ xorl %edx,%edx
+ movq 0(%rsi),%r8
+ movl 8(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,0(%rbx)
+ cmovleq %rbp,%r8
+ movq 16(%rsi),%r9
+ movl 24(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,4(%rbx)
+ cmovleq %rbp,%r9
+ movq 32(%rsi),%r10
+ movl 40(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,8(%rbx)
+ cmovleq %rbp,%r10
+ movq 48(%rsi),%r11
+ movl 56(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,12(%rbx)
+ cmovleq %rbp,%r11
+ testl %edx,%edx
+ jz .Ldone_avx
+
+ vmovdqu 0-128(%rdi),%xmm8
+ leaq 128(%rsp),%rax
+ vmovdqu 32-128(%rdi),%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ vmovdqu 96-128(%rdi),%xmm11
+ vmovdqu 128-128(%rdi),%xmm12
+ vmovdqu 160-128(%rdi),%xmm13
+ vmovdqu 192-128(%rdi),%xmm14
+ vmovdqu 224-128(%rdi),%xmm15
+ vmovdqu .Lpbswap(%rip),%xmm6
+ jmp .Loop_avx
+
+.align 32
+.Loop_avx:
+ vpxor %xmm9,%xmm10,%xmm4
+ vmovd 0(%r8),%xmm5
+ vmovd 0(%r9),%xmm0
+ vpinsrd $1,0(%r10),%xmm5,%xmm5
+ vpinsrd $1,0(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm12,%xmm7
+ vpslld $26,%xmm12,%xmm2
+ vmovdqu %xmm5,0-128(%rax)
+ vpaddd %xmm15,%xmm5,%xmm5
+
+ vpsrld $11,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm12,%xmm2
+ vpaddd -128(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm12,%xmm2
+ vpandn %xmm14,%xmm12,%xmm0
+ vpand %xmm13,%xmm12,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm8,%xmm15
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm8,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm8,%xmm9,%xmm3
+
+ vpxor %xmm1,%xmm15,%xmm15
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm8,%xmm1
+
+ vpslld $19,%xmm8,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm15,%xmm7
+
+ vpsrld $22,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm8,%xmm2
+ vpxor %xmm4,%xmm9,%xmm15
+ vpaddd %xmm5,%xmm11,%xmm11
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm15,%xmm15
+ vpaddd %xmm7,%xmm15,%xmm15
+ vmovd 4(%r8),%xmm5
+ vmovd 4(%r9),%xmm0
+ vpinsrd $1,4(%r10),%xmm5,%xmm5
+ vpinsrd $1,4(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm11,%xmm7
+ vpslld $26,%xmm11,%xmm2
+ vmovdqu %xmm5,16-128(%rax)
+ vpaddd %xmm14,%xmm5,%xmm5
+
+ vpsrld $11,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm11,%xmm2
+ vpaddd -96(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm11,%xmm2
+ vpandn %xmm13,%xmm11,%xmm0
+ vpand %xmm12,%xmm11,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm15,%xmm14
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm15,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm15,%xmm8,%xmm4
+
+ vpxor %xmm1,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm15,%xmm1
+
+ vpslld $19,%xmm15,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm14,%xmm7
+
+ vpsrld $22,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm15,%xmm2
+ vpxor %xmm3,%xmm8,%xmm14
+ vpaddd %xmm5,%xmm10,%xmm10
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm14,%xmm14
+ vmovd 8(%r8),%xmm5
+ vmovd 8(%r9),%xmm0
+ vpinsrd $1,8(%r10),%xmm5,%xmm5
+ vpinsrd $1,8(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm10,%xmm7
+ vpslld $26,%xmm10,%xmm2
+ vmovdqu %xmm5,32-128(%rax)
+ vpaddd %xmm13,%xmm5,%xmm5
+
+ vpsrld $11,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm10,%xmm2
+ vpaddd -64(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm10,%xmm2
+ vpandn %xmm12,%xmm10,%xmm0
+ vpand %xmm11,%xmm10,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm14,%xmm13
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm14,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm14,%xmm15,%xmm3
+
+ vpxor %xmm1,%xmm13,%xmm13
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm14,%xmm1
+
+ vpslld $19,%xmm14,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm13,%xmm7
+
+ vpsrld $22,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm14,%xmm2
+ vpxor %xmm4,%xmm15,%xmm13
+ vpaddd %xmm5,%xmm9,%xmm9
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm13,%xmm13
+ vpaddd %xmm7,%xmm13,%xmm13
+ vmovd 12(%r8),%xmm5
+ vmovd 12(%r9),%xmm0
+ vpinsrd $1,12(%r10),%xmm5,%xmm5
+ vpinsrd $1,12(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm9,%xmm7
+ vpslld $26,%xmm9,%xmm2
+ vmovdqu %xmm5,48-128(%rax)
+ vpaddd %xmm12,%xmm5,%xmm5
+
+ vpsrld $11,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm9,%xmm2
+ vpaddd -32(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm9,%xmm2
+ vpandn %xmm11,%xmm9,%xmm0
+ vpand %xmm10,%xmm9,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm13,%xmm12
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm13,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm13,%xmm14,%xmm4
+
+ vpxor %xmm1,%xmm12,%xmm12
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm13,%xmm1
+
+ vpslld $19,%xmm13,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm12,%xmm7
+