aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jung-uk Kim <jkim@FreeBSD.org> 2020-08-26 16:56:44 +0000
committer Jung-uk Kim <jkim@FreeBSD.org> 2020-08-26 16:56:44 +0000
commit 3971092e119dd117e9e40f6b5955f54a2762dcf3 (patch)
tree 0bba9eb1e9bd17761c4e9bec210a13af4cbcca35
parent 63c1bb51629b1bdb150885c72bd297ff7d7f228a (diff)
download src-3971092e119dd117e9e40f6b5955f54a2762dcf3.tar.gz
src-3971092e119dd117e9e40f6b5955f54a2762dcf3.zip
Regen X86 assembly files after r364822.
Notes
Notes: svn path=/head/; revision=364823
-rw-r--r-- secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S 784
-rw-r--r-- secure/lib/libcrypto/amd64/aesni-mb-x86_64.S 965
-rw-r--r-- secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S 1350
-rw-r--r-- secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S 4376
-rw-r--r-- secure/lib/libcrypto/amd64/chacha-x86_64.S 1026
-rw-r--r-- secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S 2055
-rw-r--r-- secure/lib/libcrypto/amd64/ghash-x86_64.S 475
-rw-r--r-- secure/lib/libcrypto/amd64/poly1305-x86_64.S 1785
-rw-r--r-- secure/lib/libcrypto/amd64/rsaz-avx2.S 1749
-rw-r--r-- secure/lib/libcrypto/amd64/rsaz-x86_64.S 664
-rw-r--r-- secure/lib/libcrypto/amd64/sha1-mb-x86_64.S 4315
-rw-r--r-- secure/lib/libcrypto/amd64/sha1-x86_64.S 2829
-rw-r--r-- secure/lib/libcrypto/amd64/sha256-mb-x86_64.S 4672
-rw-r--r-- secure/lib/libcrypto/amd64/sha256-x86_64.S 2369
-rw-r--r-- secure/lib/libcrypto/amd64/sha512-x86_64.S 3660
-rw-r--r-- secure/lib/libcrypto/amd64/x25519-x86_64.S 390
-rw-r--r-- secure/lib/libcrypto/amd64/x86_64-mont.S 380
-rw-r--r-- secure/lib/libcrypto/amd64/x86_64-mont5.S 1365
-rw-r--r-- secure/lib/libcrypto/i386/chacha-x86.S 960
-rw-r--r-- secure/lib/libcrypto/i386/poly1305-x86.S 1110
-rw-r--r-- secure/lib/libcrypto/i386/sha1-586.S 2350
-rw-r--r-- secure/lib/libcrypto/i386/sha256-586.S 4496
22 files changed, 44039 insertions, 86 deletions
diff --git a/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S b/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S
index 1cdcc86043b2..26e49f9b2979 100644
--- a/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S
+++ b/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S
@@ -2,20 +2,790 @@
/* Do not modify. This file is auto-generated from aesni-gcm-x86_64.pl. */
.text
-.globl aesni_gcm_encrypt
-.type aesni_gcm_encrypt,@function
-aesni_gcm_encrypt:
+.type _aesni_ctr32_ghash_6x,@function
+.align 32
+_aesni_ctr32_ghash_6x: /* file-local helper (no .globl); entered via "call" from aesni_gcm_decrypt and aesni_gcm_encrypt; processes six 16-byte blocks per .Loop6x pass */
.cfi_startproc
- xorl %eax,%eax
+ vmovdqu 32(%r11),%xmm2 /* callers point %r11 at .Lbswap_mask, so +32 is .Lone_msb */
+ subq $6,%rdx /* callers shift %rdx right by 4 before the call; take six of those units */
+ vpxor %xmm4,%xmm4,%xmm4 /* xmm4 = 0 */
+ vmovdqu 0-128(%rcx),%xmm15 /* callers added 128 to %rcx, so this is the first 16 bytes at their original %rcx (presumably round key 0 - confirm against the key layout) */
+ vpaddb %xmm2,%xmm1,%xmm10 /* xmm10..xmm14 = xmm1 plus 1..5 times xmm2, byte-wise */
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm15,%xmm1,%xmm9 /* block 0 = xmm1 ^ xmm15 */
+ vmovdqu %xmm4,16+8(%rsp) /* zero the slot the caller sees as 16(%rsp); +8 skips the return address pushed by call */
+ jmp .Loop6x
+
+.align 32
+.Loop6x:
+ addl $100663296,%ebx /* 100663296 = 6<<24 (0x06000000); %ebx was loaded by the callers from 12(%r8) */
+ jc .Lhandle_ctr32 /* carry out of the 32-bit add: take the vpaddd-based path instead of vpaddb */
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddb %xmm2,%xmm14,%xmm1 /* xmm1 = xmm14 + xmm2: base value for the following pass */
+ vpxor %xmm15,%xmm10,%xmm10
+ vpxor %xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32:
+ vmovdqu %xmm1,(%r8) /* write the next base counter value back to (%r8); reloaded in .Lenc_tail */
+ vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
+ vpxor %xmm15,%xmm12,%xmm12
+ vmovups 16-128(%rcx),%xmm2
+ vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
+ xorq %r12,%r12
+ cmpq %r14,%r15 /* CF = (%r15 < %r14), unsigned; consumed by setnc below (the vector ops in between leave flags alone) */
+
+ vaesenc %xmm2,%xmm9,%xmm9
+ vmovdqu 48+8(%rsp),%xmm0
+ vpxor %xmm15,%xmm13,%xmm13
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
+ vaesenc %xmm2,%xmm10,%xmm10
+ vpxor %xmm15,%xmm14,%xmm14
+ setnc %r12b /* r12 = 1 when %r15 >= %r14 */
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vmovdqu 16-32(%r9),%xmm3
+ negq %r12 /* 1 -> all ones, 0 stays 0 */
+ vaesenc %xmm2,%xmm12,%xmm12
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
+ vpxor %xmm4,%xmm8,%xmm8
+ vaesenc %xmm2,%xmm13,%xmm13
+ vpxor %xmm5,%xmm1,%xmm4
+ andq $0x60,%r12 /* r12 = 0x60 (96) or 0 */
+ vmovups 32-128(%rcx),%xmm15
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
+ vaesenc %xmm2,%xmm14,%xmm14
+
+ vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
+ leaq (%r14,%r12,1),%r14 /* advance %r14 by 96 bytes only while it has not passed %r15 */
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
+ vmovdqu 64+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 88(%r14),%r13 /* movbe = byte-reversed 64-bit load; each pair below copies 16 bytes from (%r14) to the stack fully byte-reversed */
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 80(%r14),%r12
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,32+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,40+8(%rsp)
+ vmovdqu 48-32(%r9),%xmm5
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 48-128(%rcx),%xmm15
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
+ vmovdqu 80+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqu 64-32(%r9),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 64-128(%rcx),%xmm15
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 72(%r14),%r13
+ vpxor %xmm5,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 64(%r14),%r12
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
+ vmovdqu 96+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,48+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,56+8(%rsp)
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 96-32(%r9),%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 80-128(%rcx),%xmm15
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 56(%r14),%r13
+ vpxor %xmm1,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
+ vpxor 112+8(%rsp),%xmm8,%xmm8
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 48(%r14),%r12
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,64+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,72+8(%rsp)
+ vpxor %xmm3,%xmm4,%xmm4
+ vmovdqu 112-32(%r9),%xmm3
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 96-128(%rcx),%xmm15
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 40(%r14),%r13
+ vpxor %xmm2,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 32(%r14),%r12
+ vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,80+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,88+8(%rsp)
+ vpxor %xmm5,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor %xmm1,%xmm6,%xmm6
+
+ vmovups 112-128(%rcx),%xmm15
+ vpslldq $8,%xmm6,%xmm5
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 16(%r11),%xmm3 /* 16(%r11) is .Lpoly when %r11 = .Lbswap_mask */
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm8,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm5,%xmm4,%xmm4
+ movbeq 24(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 16(%r14),%r12
+ vpalignr $8,%xmm4,%xmm4,%xmm0 /* xmm0 = xmm4 with its two 64-bit halves swapped */
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 /* carry-less multiply against the .Lpoly constant */
+ movq %r13,96+8(%rsp)
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r12,104+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ vmovups 128-128(%rcx),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 144-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm10,%xmm10
+ vpsrldq $8,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm11,%xmm11
+ vpxor %xmm6,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm12,%xmm12
+ vpxor %xmm0,%xmm4,%xmm4
+ movbeq 8(%r14),%r13 /* %r13/%r12 from 8(%r14)/0(%r14) are stored in .Lenc_tail */
+ vaesenc %xmm1,%xmm13,%xmm13
+ movbeq 0(%r14),%r12
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 160-128(%rcx),%xmm1
+ cmpl $11,%ebp /* %ebp was loaded by the callers from offset 240 of their original %rcx (presumably the AES round-count field - confirm) */
+ jb .Lenc_tail /* %ebp < 11: no extra rounds */
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 176-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 192-128(%rcx),%xmm1
+ je .Lenc_tail /* still the flags of "cmpl $11,%ebp": AVX/AES instructions do not modify EFLAGS */
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 208-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 224-128(%rcx),%xmm1
+ jmp .Lenc_tail
+
+.align 32
+.Lhandle_ctr32: /* reached when "addl $100663296,%ebx" carries: rebuild the six counter blocks with 32-bit adds on byte-reversed values */
+ vmovdqu (%r11),%xmm0 /* xmm0 = .Lbswap_mask */
+ vpshufb %xmm0,%xmm1,%xmm6 /* byte-reverse the base block */
+ vmovdqu 48(%r11),%xmm5 /* 48(%r11) = .Ltwo_lsb */
+ vpaddd 64(%r11),%xmm6,%xmm10 /* 64(%r11) = .Lone_lsb: base + 1 */
+ vpaddd %xmm5,%xmm6,%xmm11 /* base + 2 */
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddd %xmm5,%xmm10,%xmm12 /* base + 3 */
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13 /* base + 4 */
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm15,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14 /* base + 5 */
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm15,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1 /* base + 6: base for the following pass */
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpshufb %xmm0,%xmm1,%xmm1
+ jmp .Lresume_ctr32
+
+.align 32
+.Lenc_tail:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vmovdqu %xmm7,16+8(%rsp)
+ vpalignr $8,%xmm4,%xmm4,%xmm8
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ vpxor 0(%rdi),%xmm1,%xmm2 /* xmm1 ^ input block; fed to vaesenclast below so the XOR with the data at (%rdi) is folded into the final AES round */
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 16(%rdi),%xmm1,%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 32(%rdi),%xmm1,%xmm5
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 48(%rdi),%xmm1,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 64(%rdi),%xmm1,%xmm7
+ vpxor 80(%rdi),%xmm1,%xmm3
+ vmovdqu (%r8),%xmm1 /* reload the base counter value saved at .Lresume_ctr32 */
+
+ vaesenclast %xmm2,%xmm9,%xmm9
+ vmovdqu 32(%r11),%xmm2 /* .Lone_msb again (xmm2 was reused above) */
+ vaesenclast %xmm0,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm1,%xmm0 /* xmm0, xmm5, xmm6, xmm7, xmm3 = next pass's blocks 1..5 */
+ movq %r13,112+8(%rsp)
+ leaq 96(%rdi),%rdi /* input pointer += 96 */
+ vaesenclast %xmm5,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm0,%xmm5
+ movq %r12,120+8(%rsp)
+ leaq 96(%rsi),%rsi /* output pointer += 96 */
+ vmovdqu 0-128(%rcx),%xmm15
+ vaesenclast %xmm6,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm5,%xmm6
+ vaesenclast %xmm7,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm6,%xmm7
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vpaddb %xmm2,%xmm7,%xmm3
+
+ addq $0x60,%r10 /* %r10 += 96; the callers return %r10 in %rax */
+ subq $0x6,%rdx
+ jc .L6x_done /* borrow: fewer than six units were left, stop looping */
+
+ vmovups %xmm9,-96(%rsi) /* store the six finished blocks just below the already-advanced %rsi and move the next counters into xmm9..xmm14 */
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovups %xmm10,-80(%rsi)
+ vmovdqa %xmm0,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vmovdqa %xmm5,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vmovdqa %xmm6,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vmovdqa %xmm7,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vmovdqa %xmm3,%xmm14
+ vmovdqu 32+8(%rsp),%xmm7 /* reload the 16 bytes written by the first movbeq/movq pair */
+ jmp .Loop6x
+
+.L6x_done: /* the last six blocks stay in xmm9..xmm14; both callers store them after the call returns */
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpxor %xmm4,%xmm8,%xmm8
+
.byte 0xf3,0xc3
.cfi_endproc
-.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
-
+.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
.globl aesni_gcm_decrypt
.type aesni_gcm_decrypt,@function
+.align 32
aesni_gcm_decrypt:
.cfi_startproc
- xorl %eax,%eax
+ xorq %r10,%r10 /* %r10 accumulates the return value (copied to %rax at .Lgcm_dec_abort) */
+ cmpq $0x60,%rdx
+ jb .Lgcm_dec_abort /* %rdx < 96: return 0 without touching anything */
+
+ leaq (%rsp),%rax /* keep the entry %rsp in %rax; the epilogue restores registers and %rsp relative to it */
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1 /* xmm1 = 16 bytes at (%r8) (counter block used by the helper) */
+ addq $-128,%rsp /* reserve 128 bytes of scratch */
+ movl 12(%r8),%ebx /* %ebx = last dword of that block; the helper adds 6<<24 to it per pass */
+ leaq .Lbswap_mask(%rip),%r11 /* %r11 = base of the constant table at the end of this file */
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ vmovdqu (%r9),%xmm8 /* xmm8 = 16 bytes at (%r9); written back below after the helper updates it */
+ andq $-128,%rsp /* align the scratch area down to 128 bytes */
+ vmovdqu (%r11),%xmm0 /* xmm0 = .Lbswap_mask */
+ leaq 128(%rcx),%rcx /* bias %rcx by +128; later "N-128(%rcx)" operands address offset N of the original pointer */
+ leaq 32+32(%r9),%r9 /* bias %r9 by +64; "-64(%r9)" below is the original (%r9) */
+ movl 240-128(%rcx),%ebp /* %ebp = dword at offset 240 of the original %rcx */
+ vpshufb %xmm0,%xmm8,%xmm8 /* byte-reverse xmm8 */
+
+ andq %r15,%r14 /* compare bits 0xf80 of (%rcx-128) and of %rsp ... */
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Ldec_no_key_aliasing /* ... stack bits below the key bits: leave %rsp alone */
+ cmpq $768,%r15
+ jnc .Ldec_no_key_aliasing /* ... difference >= 768: leave %rsp alone */
+ subq %r15,%rsp /* otherwise lower %rsp by the difference (per the label name, avoids aliasing between the stack frame and the key schedule) */
+.Ldec_no_key_aliasing:
+
+ vmovdqu 80(%rdi),%xmm7 /* load the first six 16-byte input blocks, last one first */
+ leaq (%rdi),%r14 /* %r14 = input pointer; the helper reads through it with movbe */
+ vmovdqu 64(%rdi),%xmm4
+ leaq -192(%rdi,%rdx,1),%r15 /* %r15 = %rdi + %rdx - 192: limit the helper compares %r14 against */
+ vmovdqu 48(%rdi),%xmm5
+ shrq $4,%rdx /* byte length -> count of 16-byte units */
+ xorq %r10,%r10
+ vmovdqu 32(%rdi),%xmm6
+ vpshufb %xmm0,%xmm7,%xmm7 /* byte-reverse each block; block at 80(%rdi) stays in xmm7, the rest go to the stack */
+ vmovdqu 16(%rdi),%xmm2
+ vpshufb %xmm0,%xmm4,%xmm4
+ vmovdqu (%rdi),%xmm3
+ vpshufb %xmm0,%xmm5,%xmm5
+ vmovdqu %xmm4,48(%rsp) /* the helper reads these slots as N+8(%rsp) because its return address sits on the stack */
+ vpshufb %xmm0,%xmm6,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm2,%xmm2
+ vmovdqu %xmm6,80(%rsp)
+ vpshufb %xmm0,%xmm3,%xmm3
+ vmovdqu %xmm2,96(%rsp)
+ vmovdqu %xmm3,112(%rsp)
+
+ call _aesni_ctr32_ghash_6x
+
+ vmovups %xmm9,-96(%rsi) /* the helper leaves its last six result blocks in xmm9..xmm14 with %rsi already advanced */
+ vmovups %xmm10,-80(%rsi)
+ vmovups %xmm11,-64(%rsi)
+ vmovups %xmm12,-48(%rsi)
+ vmovups %xmm13,-32(%rsi)
+ vmovups %xmm14,-16(%rsi)
+
+ vpshufb (%r11),%xmm8,%xmm8 /* byte-reverse xmm8 back ... */
+ vmovdqu %xmm8,-64(%r9) /* ... and store it to the original (%r9) */
+
+ vzeroupper
+ movq -48(%rax),%r15 /* reload callee-saved registers from where the pushes above put them */
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp /* %rsp = entry %rsp */
+.cfi_def_cfa_register %rsp
+.Lgcm_dec_abort:
+ movq %r10,%rax /* return %r10: 0 on the early exit, otherwise 96 per helper pass */
.byte 0xf3,0xc3
.cfi_endproc
.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
+.type _aesni_ctr32_6x,@function
+.align 32
+_aesni_ctr32_6x: /* file-local helper (no .globl); called twice from aesni_gcm_encrypt; runs six counter blocks through the AES rounds and XORs them with 96 bytes at (%rdi) into (%rsi) */
+.cfi_startproc
+ vmovdqu 0-128(%rcx),%xmm4 /* caller biased %rcx by +128, so this is the first 16 bytes at its original %rcx */
+ vmovdqu 32(%r11),%xmm2 /* 32(%r11) = .Lone_msb when %r11 = .Lbswap_mask */
+ leaq -1(%rbp),%r13 /* loop counter for .Loop_ctr32 = %rbp - 1 */
+ vmovups 16-128(%rcx),%xmm15
+ leaq 32-128(%rcx),%r12 /* %r12 walks the following 16-byte entries */
+ vpxor %xmm4,%xmm1,%xmm9 /* block 0 = xmm1 ^ xmm4 */
+ addl $100663296,%ebx /* 100663296 = 6<<24 (0x06000000) */
+ jc .Lhandle_ctr32_2 /* carry out of the 32-bit add: use the vpaddd path */
+ vpaddb %xmm2,%xmm1,%xmm10 /* blocks 1..5 = previous + xmm2 (byte-wise), each XORed with xmm4 */
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm14,%xmm1 /* xmm1 = base for the next call */
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+
+.align 16
+.Loop_ctr32: /* one vaesenc round over all six blocks per pass */
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+ vmovups (%r12),%xmm15 /* fetch the next 16-byte entry */
+ leaq 16(%r12),%r12
+ decl %r13d
+ jnz .Loop_ctr32
+
+ vmovdqu (%r12),%xmm3 /* xmm3 = the entry after the loop's last one; XORed with the input and used for vaesenclast */
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 0(%rdi),%xmm3,%xmm4 /* xmm3 ^ input block: folds the XOR with the data at (%rdi) into vaesenclast */
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor 16(%rdi),%xmm3,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 32(%rdi),%xmm3,%xmm6
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 48(%rdi),%xmm3,%xmm8
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 64(%rdi),%xmm3,%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 80(%rdi),%xmm3,%xmm3
+ leaq 96(%rdi),%rdi /* input pointer += 96 */
+
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vaesenclast %xmm5,%xmm10,%xmm10
+ vaesenclast %xmm6,%xmm11,%xmm11
+ vaesenclast %xmm8,%xmm12,%xmm12
+ vaesenclast %xmm2,%xmm13,%xmm13
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vmovups %xmm9,0(%rsi) /* write the six result blocks; they also stay in xmm9..xmm14 for the caller */
+ vmovups %xmm10,16(%rsi)
+ vmovups %xmm11,32(%rsi)
+ vmovups %xmm12,48(%rsi)
+ vmovups %xmm13,64(%rsi)
+ vmovups %xmm14,80(%rsi)
+ leaq 96(%rsi),%rsi /* output pointer += 96 */
+
+ .byte 0xf3,0xc3 /* bytes f3 c3 = "rep ret" */
+.align 32
+.Lhandle_ctr32_2: /* carry path: relies on xmm0 = .Lbswap_mask, which the caller loaded before the call */
+ vpshufb %xmm0,%xmm1,%xmm6 /* byte-reverse the base block */
+ vmovdqu 48(%r11),%xmm5 /* 48(%r11) = .Ltwo_lsb */
+ vpaddd 64(%r11),%xmm6,%xmm10 /* 64(%r11) = .Lone_lsb: base + 1 */
+ vpaddd %xmm5,%xmm6,%xmm11 /* base + 2 */
+ vpaddd %xmm5,%xmm10,%xmm12 /* base + 3 */
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13 /* base + 4 */
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14 /* base + 5 */
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1 /* base + 6: base for the next call */
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+.cfi_endproc
+.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl aesni_gcm_encrypt
+.type aesni_gcm_encrypt,@function
+.align 32
+aesni_gcm_encrypt:
+.cfi_startproc
+ xorq %r10,%r10 /* %r10 accumulates the return value (copied to %rax at .Lgcm_enc_abort) */
+ cmpq $288,%rdx
+ jb .Lgcm_enc_abort /* %rdx < 288 (three 96-byte groups): return 0 without touching anything */
+
+ leaq (%rsp),%rax /* keep the entry %rsp in %rax; the epilogue restores registers and %rsp relative to it */
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1 /* xmm1 = 16 bytes at (%r8) (counter block used by the helpers) */
+ addq $-128,%rsp /* reserve 128 bytes of scratch */
+ movl 12(%r8),%ebx /* %ebx = last dword of that block; the helpers add 6<<24 to it per group */
+ leaq .Lbswap_mask(%rip),%r11 /* %r11 = base of the constant table at the end of this file */
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ leaq 128(%rcx),%rcx /* bias %rcx by +128; "N-128(%rcx)" operands address offset N of the original pointer */
+ vmovdqu (%r11),%xmm0 /* xmm0 = .Lbswap_mask */
+ andq $-128,%rsp /* align the scratch area down to 128 bytes */
+ movl 240-128(%rcx),%ebp /* %ebp = dword at offset 240 of the original %rcx */
+
+ andq %r15,%r14 /* compare bits 0xf80 of (%rcx-128) and of %rsp ... */
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Lenc_no_key_aliasing /* ... stack bits below the key bits: leave %rsp alone */
+ cmpq $768,%r15
+ jnc .Lenc_no_key_aliasing /* ... difference >= 768: leave %rsp alone */
+ subq %r15,%rsp /* otherwise lower %rsp by the difference (per the label name, avoids aliasing between the stack frame and the key schedule) */
+.Lenc_no_key_aliasing:
+
+ leaq (%rsi),%r14 /* %r14 = output pointer; _aesni_ctr32_ghash_6x reads through it with movbe */
+ leaq -192(%rsi,%rdx,1),%r15 /* %r15 = %rsi + %rdx - 192: limit that helper compares %r14 against */
+ shrq $4,%rdx /* byte length -> count of 16-byte units */
+
+ call _aesni_ctr32_6x /* first 96 bytes; results are stored to (%rsi) and also left in xmm9..xmm14 */
+ vpshufb %xmm0,%xmm9,%xmm8 /* byte-reverse those six blocks; five go to the stack, the last stays in xmm7 */
+ vpshufb %xmm0,%xmm10,%xmm2
+ vmovdqu %xmm8,112(%rsp)
+ vpshufb %xmm0,%xmm11,%xmm4
+ vmovdqu %xmm2,96(%rsp)
+ vpshufb %xmm0,%xmm12,%xmm5
+ vmovdqu %xmm4,80(%rsp)
+ vpshufb %xmm0,%xmm13,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm14,%xmm7
+ vmovdqu %xmm6,48(%rsp)
+
+ call _aesni_ctr32_6x /* second 96 bytes */
+
+ vmovdqu (%r9),%xmm8 /* xmm8 = 16 bytes at (%r9); written back at the end of this function */
+ leaq 32+32(%r9),%r9 /* bias %r9 by +64; "N-32(%r9)" operands address offset N+32 of the original pointer */
+ subq $12,%rdx /* the two calls above consumed 12 units */
+ movq $192,%r10 /* ... i.e. 192 bytes so far */
+ vpshufb %xmm0,%xmm8,%xmm8 /* byte-reverse xmm8 */
+
+ call _aesni_ctr32_ghash_6x /* main loop; leaves its last six result blocks in xmm9..xmm14 */
+ vmovdqu 32(%rsp),%xmm7 /* same slot the helper wrote as 32+8(%rsp) (its frame includes the return address) */
+ vmovdqu (%r11),%xmm0 /* xmm0 = .Lbswap_mask */
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm7,%xmm7,%xmm1
+ vmovdqu 32-32(%r9),%xmm15
+ vmovups %xmm9,-96(%rsi) /* store the helper's last six blocks and keep byte-reversed copies in xmm9..xmm14 */
+ vpshufb %xmm0,%xmm9,%xmm9
+ vpxor %xmm7,%xmm1,%xmm1 /* high half ^ low half of xmm7, operand for the "middle" carry-less product */
+ vmovups %xmm10,-80(%rsi)
+ vpshufb %xmm0,%xmm10,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vpshufb %xmm0,%xmm11,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vpshufb %xmm0,%xmm12,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vpshufb %xmm0,%xmm13,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vpshufb %xmm0,%xmm14,%xmm14
+ vmovdqu %xmm9,16(%rsp)
+ vmovdqu 48(%rsp),%xmm6 /* from here on: carry-less multiplies of the stacked blocks (48..112(%rsp)) by the table at %r9, accumulated with XOR */
+ vmovdqu 16-32(%r9),%xmm0
+ vpunpckhqdq %xmm6,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
+ vpxor %xmm6,%xmm2,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+
+ vmovdqu 64(%rsp),%xmm9
+ vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm9,%xmm9,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
+ vpxor %xmm9,%xmm5,%xmm5
+ vpxor %xmm7,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vmovdqu 80(%rsp),%xmm1
+ vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm4,%xmm7,%xmm7
+ vpunpckhqdq %xmm1,%xmm1,%xmm4
+ vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpxor %xmm6,%xmm9,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 96(%rsp),%xmm2
+ vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm7,%xmm6,%xmm6
+ vpunpckhqdq %xmm2,%xmm2,%xmm7
+ vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpxor %xmm9,%xmm1,%xmm1
+ vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm5,%xmm4,%xmm4
+
+ vpxor 112(%rsp),%xmm8,%xmm8 /* fold the running value in xmm8 into the block stacked at 112(%rsp) */
+ vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
+ vmovdqu 112-32(%r9),%xmm0
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm1,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
+ vpxor %xmm4,%xmm7,%xmm4
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
+ vmovdqu 0-32(%r9),%xmm3 /* restart at the first table entry for the six blocks kept in xmm14..xmm10 and 16(%rsp) */
+ vpunpckhqdq %xmm14,%xmm14,%xmm1
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
+ vpxor %xmm14,%xmm1,%xmm1
+ vpxor %xmm5,%xmm6,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
+ vmovdqu 32-32(%r9),%xmm15
+ vpxor %xmm2,%xmm8,%xmm7
+ vpxor %xmm4,%xmm9,%xmm6
+
+ vmovdqu 16-32(%r9),%xmm0
+ vpxor %xmm5,%xmm7,%xmm9
+ vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
+ vpxor %xmm9,%xmm6,%xmm6
+ vpunpckhqdq %xmm13,%xmm13,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
+ vpxor %xmm13,%xmm2,%xmm2
+ vpslldq $8,%xmm6,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+ vpxor %xmm9,%xmm5,%xmm8
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm12,%xmm12,%xmm9
+ vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
+ vpxor %xmm12,%xmm9,%xmm9
+ vpxor %xmm14,%xmm13,%xmm13
+ vpalignr $8,%xmm8,%xmm8,%xmm14 /* xmm14 = xmm8 with its 64-bit halves swapped */
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm11,%xmm11,%xmm1
+ vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
+ vpxor %xmm11,%xmm1,%xmm1
+ vpxor %xmm13,%xmm12,%xmm12
+ vxorps 16(%rsp),%xmm7,%xmm7 /* XOR in the byte-reversed xmm9 saved at 16(%rsp) above */
+ vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm9,%xmm9
+
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 /* 16(%r11) = .Lpoly: first of two multiply-by-.Lpoly steps on xmm8 */
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm10,%xmm10,%xmm2
+ vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
+ vpxor %xmm10,%xmm2,%xmm2
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpxor %xmm12,%xmm11,%xmm11
+ vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm9,%xmm1,%xmm1
+
+ vxorps %xmm7,%xmm14,%xmm14
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 /* second multiply-by-.Lpoly step */
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
+ vmovdqu 112-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm11,%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
+ vpxor %xmm4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
+ vpxor %xmm10,%xmm7,%xmm7
+ vpxor %xmm2,%xmm6,%xmm6
+
+ vpxor %xmm5,%xmm7,%xmm4 /* combine low (xmm5), high (xmm7) and middle (xmm6) partial products */
+ vpxor %xmm4,%xmm6,%xmm6
+ vpslldq $8,%xmm6,%xmm1
+ vmovdqu 16(%r11),%xmm3 /* xmm3 = .Lpoly */
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm1,%xmm5,%xmm8
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2 /* final two multiply-by-.Lpoly steps, then XOR with the high part in xmm7 */
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm2,%xmm8,%xmm8
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm7,%xmm2,%xmm2
+ vpxor %xmm2,%xmm8,%xmm8
+ vpshufb (%r11),%xmm8,%xmm8 /* byte-reverse xmm8 back ... */
+ vmovdqu %xmm8,-64(%r9) /* ... and store it to the original (%r9) */
+
+ vzeroupper
+ movq -48(%rax),%r15 /* reload callee-saved registers from where the pushes above put them */
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp /* %rsp = entry %rsp */
+.cfi_def_cfa_register %rsp
+.Lgcm_enc_abort:
+ movq %r10,%rax /* return %r10: 0 on the early exit, otherwise 192 plus 96 per _aesni_ctr32_ghash_6x pass */
+ .byte 0xf3,0xc3 /* bytes f3 c3 = "rep ret" */
+.cfi_endproc
+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+.align 64
+.Lbswap_mask: /* table base loaded into %r11; vpshufb with this mask reverses the 16 bytes of a register */
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly: /* at .Lbswap_mask+16; used as the vpclmulqdq operand 16(%r11) */
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb: /* at .Lbswap_mask+32; added with vpaddb to step a counter block by one in its last byte */
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb: /* at .Lbswap_mask+48; vpaddd increment of 2 on byte-reversed counters */
+.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb: /* at .Lbswap_mask+64; vpaddd increment of 1 on byte-reversed counters */
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 /* ASCII: "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated */
+.align 64
diff --git a/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S b/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S
index de4bac9488f7..706c5c59d38d 100644
--- a/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S
+++ b/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S
@@ -9,6 +9,14 @@
.align 32
aesni_multi_cbc_encrypt:
.cfi_startproc
+ cmpl $2,%edx
+ jb .Lenc_non_avx
+ movl OPENSSL_ia32cap_P+4(%rip),%ecx
+ testl $268435456,%ecx
+ jnz _avx_cbc_enc_shortcut
+ jmp .Lenc_non_avx
+.align 16
+.Lenc_non_avx:
movq %rsp,%rax
.cfi_def_cfa_register %rax
pushq %rbx
@@ -283,6 +291,14 @@ aesni_multi_cbc_encrypt:
.align 32
aesni_multi_cbc_decrypt:
.cfi_startproc
+ cmpl $2,%edx
+ jb .Ldec_non_avx
+ movl OPENSSL_ia32cap_P+4(%rip),%ecx
+ testl $268435456,%ecx
+ jnz _avx_cbc_dec_shortcut
+ jmp .Ldec_non_avx
+.align 16
+.Ldec_non_avx:
movq %rsp,%rax
.cfi_def_cfa_register %rax
pushq %rbx
@@ -542,3 +558,952 @@ aesni_multi_cbc_decrypt:
.byte 0xf3,0xc3
.cfi_endproc
.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
+.type aesni_multi_cbc_encrypt_avx,@function
+.align 32
+aesni_multi_cbc_encrypt_avx: /* processes eight lanes in parallel with VEX-encoded AES instructions; lane descriptors are read from %rdi at a 40-byte stride */
+.cfi_startproc
+_avx_cbc_enc_shortcut: /* target of the "jnz _avx_cbc_enc_shortcut" added at the top of aesni_multi_cbc_encrypt */
+ movq %rsp,%rax /* keep the entry %rsp in %rax; saved registers are reloaded relative to it in the epilogue */
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+
+
+
+
+
+
+
+
+ subq $192,%rsp /* frame: 192 bytes, then aligned down to 128 */
+ andq $-128,%rsp
+ movq %rax,16(%rsp) /* entry %rsp parked at 16(%rsp) because %eax is reused below */
+.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08
+
+.Lenc8x_body:
+ vzeroupper
+ vmovdqu (%rsi),%xmm15 /* xmm15 = first 16 bytes at %rsi (presumably round key 0 - confirm) */
+ leaq 120(%rsi),%rsi /* bias %rsi by +120; "N-120(%rsi)" operands address offset N of the original pointer */
+ leaq 160(%rdi),%rdi /* bias %rdi by +160; lane 0 starts at -160(%rdi) */
+ shrl $1,%edx
+
+.Lenc8x_loop_grande:
+
+ xorl %edx,%edx /* %edx = 0, then raised to the largest per-lane count below (overwrites the shifted value) */
+ movl -144(%rdi),%ecx /* lane 0: %ecx = count (dword at +16 of the descriptor) */
+ movq -160(%rdi),%r8 /* first pointer (+0); presumably the input - confirm */
+ cmpl %edx,%ecx
+ movq -152(%rdi),%rbx /* second pointer (+8); presumably the output - confirm */
+ cmovgl %ecx,%edx /* %edx = max(%edx, count), signed */
+ testl %ecx,%ecx
+ vmovdqu -136(%rdi),%xmm2 /* 16 bytes at +24: initial chaining value for this lane */
+ movl %ecx,32(%rsp) /* per-lane counts live at 32..60(%rsp) */
+ cmovleq %rsp,%r8 /* count <= 0: point the lane at the stack so its loads stay harmless */
+ subq %r8,%rbx /* keep (second pointer - first pointer) instead of the second pointer */
+ movq %rbx,64(%rsp) /* per-lane deltas live at 64..120(%rsp) */
+ movl -104(%rdi),%ecx /* lane 1: same pattern with %r9, delta via %rbp */
+ movq -120(%rdi),%r9
+ cmpl %edx,%ecx
+ movq -112(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -96(%rdi),%xmm3
+ movl %ecx,36(%rsp)
+ cmovleq %rsp,%r9
+ subq %r9,%rbp
+ movq %rbp,72(%rsp)
+ movl -64(%rdi),%ecx /* lane 2 (%r10) */
+ movq -80(%rdi),%r10
+ cmpl %edx,%ecx
+ movq -72(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -56(%rdi),%xmm4
+ movl %ecx,40(%rsp)
+ cmovleq %rsp,%r10
+ subq %r10,%rbp
+ movq %rbp,80(%rsp)
+ movl -24(%rdi),%ecx /* lane 3 (%r11) */
+ movq -40(%rdi),%r11
+ cmpl %edx,%ecx
+ movq -32(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -16(%rdi),%xmm5
+ movl %ecx,44(%rsp)
+ cmovleq %rsp,%r11
+ subq %r11,%rbp
+ movq %rbp,88(%rsp)
+ movl 16(%rdi),%ecx /* lane 4 (%r12) */
+ movq 0(%rdi),%r12
+ cmpl %edx,%ecx
+ movq 8(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 24(%rdi),%xmm6
+ movl %ecx,48(%rsp)
+ cmovleq %rsp,%r12
+ subq %r12,%rbp
+ movq %rbp,96(%rsp)
+ movl 56(%rdi),%ecx /* lane 5 (%r13) */
+ movq 40(%rdi),%r13
+ cmpl %edx,%ecx
+ movq 48(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 64(%rdi),%xmm7
+ movl %ecx,52(%rsp)
+ cmovleq %rsp,%r13
+ subq %r13,%rbp
+ movq %rbp,104(%rsp)
+ movl 96(%rdi),%ecx /* lane 6 (%r14) */
+ movq 80(%rdi),%r14
+ cmpl %edx,%ecx
+ movq 88(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 104(%rdi),%xmm8
+ movl %ecx,56(%rsp)
+ cmovleq %rsp,%r14
+ subq %r14,%rbp
+ movq %rbp,112(%rsp)
+ movl 136(%rdi),%ecx /* lane 7 (%r15) */
+ movq 120(%rdi),%r15
+ cmpl %edx,%ecx
+ movq 128(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 144(%rdi),%xmm9
+ movl %ecx,60(%rsp)
+ cmovleq %rsp,%r15
+ subq %r15,%rbp
+ movq %rbp,120(%rsp)
+ testl %edx,%edx
+ jz .Lenc8x_done /* no lane has a positive count; %rax still holds the entry %rsp on this path */
+
+ vmovups 16-120(%rsi),%xmm1 /* xmm1/xmm0 alternate as the current 16-byte key entry */
+ vmovups 32-120(%rsi),%xmm0
+ movl 240-120(%rsi),%eax /* %eax = dword at offset 240 of the original %rsi (presumably the round count - confirm); clobbers the saved %rsp copy */
+
+ vpxor (%r8),%xmm15,%xmm10 /* first input block of each lane ^ xmm15 ... */
+ leaq 128(%rsp),%rbp /* %rbp = 64-byte spill area for the next blocks of lanes 0..3 */
+ vpxor (%r9),%xmm15,%xmm11
+ vpxor (%r10),%xmm15,%xmm12
+ vpxor (%r11),%xmm15,%xmm13
+ vpxor %xmm10,%xmm2,%xmm2 /* ... XORed into that lane's chaining value */
+ vpxor (%r12),%xmm15,%xmm10
+ vpxor %xmm11,%xmm3,%xmm3
+ vpxor (%r13),%xmm15,%xmm11
+ vpxor %xmm12,%xmm4,%xmm4
+ vpxor (%r14),%xmm15,%xmm12
+ vpxor %xmm13,%xmm5,%xmm5
+ vpxor (%r15),%xmm15,%xmm13
+ vpxor %xmm10,%xmm6,%xmm6
+ movl $1,%ecx /* %ecx = 1: compared against each lane's remaining count inside the loop */
+ vpxor %xmm11,%xmm7,%xmm7
+ vpxor %xmm12,%xmm8,%xmm8
+ vpxor %xmm13,%xmm9,%xmm9
+ jmp .Loop_enc8x
+
+.align 32
+.Loop_enc8x: /* one block per lane per pass; each AES round is interleaved with pointer upkeep for one lane */
+ vaesenc %xmm1,%xmm2,%xmm2
+ cmpl 32+0(%rsp),%ecx /* lane 0: 1 vs remaining count; the flags feed both cmov instructions below */
+ vaesenc %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r8)
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm5,%xmm5
+ leaq (%r8,%rbx,1),%rbx /* %rbx = first pointer + delta */
+ cmovgeq %rsp,%r8 /* 1 >= count: redirect the lane's loads to the stack */
+ vaesenc %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx /* 1 > count: redirect the lane's stores to the stack as well */
+ vaesenc %xmm1,%xmm7,%xmm7
+ subq %r8,%rbx /* back to a delta relative to the (possibly redirected) %r8 */
+ vaesenc %xmm1,%xmm8,%xmm8
+ vpxor 16(%r8),%xmm15,%xmm10 /* next block of lane 0 ^ xmm15, consumed after this pass */
+ movq %rbx,64+0(%rsp)
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups -72(%rsi),%xmm1
+ leaq 16(%r8,%rbx,1),%r8 /* %r8 = 16 past this pass's store address; the tail stores at -16(%r8) */
+ vmovdqu %xmm10,0(%rbp) /* lanes 0..3 spill their next block to the %rbp area; xmm10..xmm13 are reused for lanes 4..7 */
+ vaesenc %xmm0,%xmm2,%xmm2
+ cmpl 32+4(%rsp),%ecx /* lane 1 */
+ movq 64+8(%rsp),%rbx
+ vaesenc %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r9)
+ vaesenc %xmm0,%xmm4,%xmm4
+ vaesenc %xmm0,%xmm5,%xmm5
+ leaq (%r9,%rbx,1),%rbx
+ cmovgeq %rsp,%r9
+ vaesenc %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm0,%xmm7,%xmm7
+ subq %r9,%rbx
+ vaesenc %xmm0,%xmm8,%xmm8
+ vpxor 16(%r9),%xmm15,%xmm11
+ movq %rbx,64+8(%rsp)
+ vaesenc %xmm0,%xmm9,%xmm9
+ vmovups -56(%rsi),%xmm0
+ leaq 16(%r9,%rbx,1),%r9
+ vmovdqu %xmm11,16(%rbp)
+ vaesenc %xmm1,%xmm2,%xmm2
+ cmpl 32+8(%rsp),%ecx /* lane 2 */
+ movq 64+16(%rsp),%rbx
+ vaesenc %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r10)
+ vaesenc %xmm1,%xmm4,%xmm4
+ prefetcht0 15(%r8)
+ vaesenc %xmm1,%xmm5,%xmm5
+ leaq (%r10,%rbx,1),%rbx
+ cmovgeq %rsp,%r10
+ vaesenc %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm1,%xmm7,%xmm7
+ subq %r10,%rbx
+ vaesenc %xmm1,%xmm8,%xmm8
+ vpxor 16(%r10),%xmm15,%xmm12
+ movq %rbx,64+16(%rsp)
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups -40(%rsi),%xmm1
+ leaq 16(%r10,%rbx,1),%r10
+ vmovdqu %xmm12,32(%rbp)
+ vaesenc %xmm0,%xmm2,%xmm2
+ cmpl 32+12(%rsp),%ecx /* lane 3 */
+ movq 64+24(%rsp),%rbx
+ vaesenc %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r11)
+ vaesenc %xmm0,%xmm4,%xmm4
+ prefetcht0 15(%r9)
+ vaesenc %xmm0,%xmm5,%xmm5
+ leaq (%r11,%rbx,1),%rbx
+ cmovgeq %rsp,%r11
+ vaesenc %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm0,%xmm7,%xmm7
+ subq %r11,%rbx
+ vaesenc %xmm0,%xmm8,%xmm8
+ vpxor 16(%r11),%xmm15,%xmm13
+ movq %rbx,64+24(%rsp)
+ vaesenc %xmm0,%xmm9,%xmm9
+ vmovups -24(%rsi),%xmm0
+ leaq 16(%r11,%rbx,1),%r11
+ vmovdqu %xmm13,48(%rbp)
+ vaesenc %xmm1,%xmm2,%xmm2
+ cmpl 32+16(%rsp),%ecx /* lane 4: from here the next block stays in xmm10..xmm13 (no spill) */
+ movq 64+32(%rsp),%rbx
+ vaesenc %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r12)
+ vaesenc %xmm1,%xmm4,%xmm4
+ prefetcht0 15(%r10)
+ vaesenc %xmm1,%xmm5,%xmm5
+ leaq (%r12,%rbx,1),%rbx
+ cmovgeq %rsp,%r12
+ vaesenc %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm1,%xmm7,%xmm7
+ subq %r12,%rbx
+ vaesenc %xmm1,%xmm8,%xmm8
+ vpxor 16(%r12),%xmm15,%xmm10
+ movq %rbx,64+32(%rsp)
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups -8(%rsi),%xmm1
+ leaq 16(%r12,%rbx,1),%r12
+ vaesenc %xmm0,%xmm2,%xmm2
+ cmpl 32+20(%rsp),%ecx /* lane 5 */
+ movq 64+40(%rsp),%rbx
+ vaesenc %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r13)
+ vaesenc %xmm0,%xmm4,%xmm4
+ prefetcht0 15(%r11)
+ vaesenc %xmm0,%xmm5,%xmm5
+ leaq (%rbx,%r13,1),%rbx /* same sum as the other lanes; only the base/index operands are swapped */
+ cmovgeq %rsp,%r13
+ vaesenc %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm0,%xmm7,%xmm7
+ subq %r13,%rbx
+ vaesenc %xmm0,%xmm8,%xmm8
+ vpxor 16(%r13),%xmm15,%xmm11
+ movq %rbx,64+40(%rsp)
+ vaesenc %xmm0,%xmm9,%xmm9
+ vmovups 8(%rsi),%xmm0
+ leaq 16(%r13,%rbx,1),%r13
+ vaesenc %xmm1,%xmm2,%xmm2
+ cmpl 32+24(%rsp),%ecx /* lane 6 */
+ movq 64+48(%rsp),%rbx
+ vaesenc %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r14)
+ vaesenc %xmm1,%xmm4,%xmm4
+ prefetcht0 15(%r12)
+ vaesenc %xmm1,%xmm5,%xmm5
+ leaq (%r14,%rbx,1),%rbx
+ cmovgeq %rsp,%r14
+ vaesenc %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm1,%xmm7,%xmm7
+ subq %r14,%rbx
+ vaesenc %xmm1,%xmm8,%xmm8
+ vpxor 16(%r14),%xmm15,%xmm12
+ movq %rbx,64+48(%rsp)
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 24(%rsi),%xmm1
+ leaq 16(%r14,%rbx,1),%r14
+ vaesenc %xmm0,%xmm2,%xmm2
+ cmpl 32+28(%rsp),%ecx /* lane 7 */
+ movq 64+56(%rsp),%rbx
+ vaesenc %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r15)
+ vaesenc %xmm0,%xmm4,%xmm4
+ prefetcht0 15(%r13)
+ vaesenc %xmm0,%xmm5,%xmm5
+ leaq (%r15,%rbx,1),%rbx
+ cmovgeq %rsp,%r15
+ vaesenc %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesenc %xmm0,%xmm7,%xmm7
+ subq %r15,%rbx
+ vaesenc %xmm0,%xmm8,%xmm8
+ vpxor 16(%r15),%xmm15,%xmm13
+ movq %rbx,64+56(%rsp)
+ vaesenc %xmm0,%xmm9,%xmm9
+ vmovups 40(%rsi),%xmm0
+ leaq 16(%r15,%rbx,1),%r15
+ vmovdqu 32(%rsp),%xmm14 /* xmm14 = counts of lanes 0..3, updated in the tail */
+ prefetcht0 15(%r14)
+ prefetcht0 15(%r15)
+ cmpl $11,%eax /* flags are also consumed by the "je" after the next round pair */
+ jb .Lenc8x_tail /* %eax < 11: no extra rounds */
+
+ vaesenc %xmm1,%xmm2,%xmm2
+ vaesenc %xmm1,%xmm3,%xmm3
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm5,%xmm5
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm8,%xmm8
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 176-120(%rsi),%xmm1
+
+ vaesenc %xmm0,%xmm2,%xmm2
+ vaesenc %xmm0,%xmm3,%xmm3
+ vaesenc %xmm0,%xmm4,%xmm4
+ vaesenc %xmm0,%xmm5,%xmm5
+ vaesenc %xmm0,%xmm6,%xmm6
+ vaesenc %xmm0,%xmm7,%xmm7
+ vaesenc %xmm0,%xmm8,%xmm8
+ vaesenc %xmm0,%xmm9,%xmm9
+ vmovups 192-120(%rsi),%xmm0
+ je .Lenc8x_tail /* %eax == 11: two extra rounds were enough */
+
+ vaesenc %xmm1,%xmm2,%xmm2
+ vaesenc %xmm1,%xmm3,%xmm3
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm5,%xmm5
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm8,%xmm8
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 208-120(%rsi),%xmm1
+
+ vaesenc %xmm0,%xmm2,%xmm2
+ vaesenc %xmm0,%xmm3,%xmm3
+ vaesenc %xmm0,%xmm4,%xmm4
+ vaesenc %xmm0,%xmm5,%xmm5
+ vaesenc %xmm0,%xmm6,%xmm6
+ vaesenc %xmm0,%xmm7,%xmm7
+ vaesenc %xmm0,%xmm8,%xmm8
+ vaesenc %xmm0,%xmm9,%xmm9
+ vmovups 224-120(%rsi),%xmm0
+
+.Lenc8x_tail: /* last two rounds, per-lane count decrement, store, and setup of the next pass */
+ vaesenc %xmm1,%xmm2,%xmm2
+ vpxor %xmm15,%xmm15,%xmm15 /* xmm15 = 0 (reloaded from -120(%rsi) further down) */
+ vaesenc %xmm1,%xmm3,%xmm3
+ vaesenc %xmm1,%xmm4,%xmm4
+ vpcmpgtd %xmm15,%xmm14,%xmm15 /* -1 in every dword whose count is > 0 */
+ vaesenc %xmm1,%xmm5,%xmm5
+ vaesenc %xmm1,%xmm6,%xmm6
+ vpaddd %xmm14,%xmm15,%xmm15 /* counts of lanes 0..3: decrement the positive ones */
+ vmovdqu 48(%rsp),%xmm14 /* xmm14 = counts of lanes 4..7 */
+ vaesenc %xmm1,%xmm7,%xmm7
+ movq 64(%rsp),%rbx /* %rbx = lane 0 delta again for the next pass */
+ vaesenc %xmm1,%xmm8,%xmm8
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 16-120(%rsi),%xmm1 /* rewind xmm1 to the entry at offset 16 for the next pass */
+
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vmovdqa %xmm15,32(%rsp) /* aligned store is safe: %rsp was aligned to 128 in the prologue */
+ vpxor %xmm15,%xmm15,%xmm15
+ vaesenclast %xmm0,%xmm3,%xmm3
+ vaesenclast %xmm0,%xmm4,%xmm4
+ vpcmpgtd %xmm15,%xmm14,%xmm15
+ vaesenclast %xmm0,%xmm5,%xmm5
+ vaesenclast %xmm0,%xmm6,%xmm6
+ vpaddd %xmm15,%xmm14,%xmm14 /* counts of lanes 4..7: decrement the positive ones */
+ vmovdqu -120(%rsi),%xmm15 /* restore xmm15 = first 16 bytes at the original %rsi */
+ vaesenclast %xmm0,%xmm7,%xmm7
+ vaesenclast %xmm0,%xmm8,%xmm8
+ vmovdqa %xmm14,48(%rsp)
+ vaesenclast %xmm0,%xmm9,%xmm9
+ vmovups 32-120(%rsi),%xmm0 /* rewind xmm0 to the entry at offset 32 */
+
+ vmovups %xmm2,-16(%r8) /* store lane 0's result block ... */
+ subq %rbx,%r8 /* ... undo the delta so %r8 is a load pointer again ... */
+ vpxor 0(%rbp),%xmm2,%xmm2 /* ... and chain: result ^ (next block ^ xmm15) becomes the next pass's state */
+ vmovups %xmm3,-16(%r9)
+ subq 72(%rsp),%r9
+ vpxor 16(%rbp),%xmm3,%xmm3
+ vmovups %xmm4,-16(%r10)
+ subq 80(%rsp),%r10
+ vpxor 32(%rbp),%xmm4,%xmm4
+ vmovups %xmm5,-16(%r11)
+ subq 88(%rsp),%r11
+ vpxor 48(%rbp),%xmm5,%xmm5
+ vmovups %xmm6,-16(%r12)
+ subq 96(%rsp),%r12
+ vpxor %xmm10,%xmm6,%xmm6 /* lanes 4..7 chain from xmm10..xmm13 instead of the spill area */
+ vmovups %xmm7,-16(%r13)
+ subq 104(%rsp),%r13
+ vpxor %xmm11,%xmm7,%xmm7
+ vmovups %xmm8,-16(%r14)
+ subq 112(%rsp),%r14
+ vpxor %xmm12,%xmm8,%xmm8
+ vmovups %xmm9,-16(%r15)
+ subq 120(%rsp),%r15
+ vpxor %xmm13,%xmm9,%xmm9
+
+ decl %edx /* %edx = largest lane count: one pass per block of the longest lane */
+ jnz .Loop_enc8x
+
+ movq 16(%rsp),%rax /* recover the entry %rsp parked in the prologue */
+.cfi_def_cfa %rax,8
+
+
+
+
+
+.Lenc8x_done:
+ vzeroupper
+ movq -48(%rax),%r15 /* reload callee-saved registers from where the pushes above put them */
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp /* %rsp = entry %rsp */
+.cfi_def_cfa_register %rsp
+.Lenc8x_epilogue:
+ .byte 0xf3,0xc3 /* bytes f3 c3 = "rep ret" */
+.cfi_endproc
+.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
+
+.type aesni_multi_cbc_decrypt_avx,@function
+.align 32
+aesni_multi_cbc_decrypt_avx: /* AES-CBC decryption of eight independent streams at once, one 16-byte block per stream per pass (xmm2..xmm9) */
+.cfi_startproc
+_avx_cbc_dec_shortcut: /* second label at the same address; its users are not visible in this chunk */
+ movq %rsp,%rax /* keep the entry %rsp; unwind info is based on %rax from here on */
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+/* Inputs as used below: */
+/* %rdi -> eight 40-byte lane records: input pointer (+0), output pointer (+8), */
+/* 32-bit block count (+16) and 16-byte IV (+24). */
+/* %rsi -> round keys at +0, +16, ...; 32-bit round count at +240. */
+/* Frame, as offsets from the realigned %rsp: */
+/* 16 saved entry %rsp; 32..63 eight 32-bit remaining-block counters; */
+/* 64..127 eight 64-bit (output - input) offsets; 128..191 next ciphertext */
+/* block for lanes 0-3; 192..319 and 320..447 two 128-byte chaining buffers */
+/* that %rbp alternates between (the first one initially holds the IVs). */
+ subq $256,%rsp
+ andq $-256,%rsp /* 256-byte alignment; the xorq $128 toggle of %rbp below relies on it */
+ subq $192,%rsp
+ movq %rax,16(%rsp) /* entry %rsp is reloaded from this slot before the epilogue */
+.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08 /* DW_CFA_def_cfa_expression: CFA = *(%rsp + 16) + 8 */
+
+.Ldec8x_body:
+ vzeroupper
+ vmovdqu (%rsi),%xmm15 /* round key 0 */
+ leaq 120(%rsi),%rsi /* all later key displacements are relative to key + 120 */
+ leaq 160(%rdi),%rdi /* all later record displacements are relative to records + 160 */
+ shrl $1,%edx /* NOTE(review): this result is overwritten by the xorl below before it is read */
+
+.Ldec8x_loop_grande:
+/* No branch in this function targets the label above; this setup runs once per call. */
+ xorl %edx,%edx /* %edx = running maximum of the lane block counts */
+ movl -144(%rdi),%ecx /* lane 0: block count */
+ movq -160(%rdi),%r8 /* lane 0: input pointer */
+ cmpl %edx,%ecx
+ movq -152(%rdi),%rbx /* lane 0: output pointer */
+ cmovgl %ecx,%edx /* max = count when count > max (signed compare) */
+ testl %ecx,%ecx
+ vmovdqu -136(%rdi),%xmm2 /* lane 0: IV */
+ movl %ecx,32(%rsp)
+ cmovleq %rsp,%r8 /* count <= 0: read from the stack instead of the caller buffer */
+ subq %r8,%rbx /* keep the output as an offset from the input pointer */
+ movq %rbx,64(%rsp)
+ vmovdqu %xmm2,192(%rsp)
+ movl -104(%rdi),%ecx /* lane 1: same steps as lane 0, with %rbp as the temporary */
+ movq -120(%rdi),%r9
+ cmpl %edx,%ecx
+ movq -112(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -96(%rdi),%xmm3
+ movl %ecx,36(%rsp)
+ cmovleq %rsp,%r9
+ subq %r9,%rbp
+ movq %rbp,72(%rsp)
+ vmovdqu %xmm3,208(%rsp)
+ movl -64(%rdi),%ecx /* lane 2 */
+ movq -80(%rdi),%r10
+ cmpl %edx,%ecx
+ movq -72(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -56(%rdi),%xmm4
+ movl %ecx,40(%rsp)
+ cmovleq %rsp,%r10
+ subq %r10,%rbp
+ movq %rbp,80(%rsp)
+ vmovdqu %xmm4,224(%rsp)
+ movl -24(%rdi),%ecx /* lane 3 */
+ movq -40(%rdi),%r11
+ cmpl %edx,%ecx
+ movq -32(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu -16(%rdi),%xmm5
+ movl %ecx,44(%rsp)
+ cmovleq %rsp,%r11
+ subq %r11,%rbp
+ movq %rbp,88(%rsp)
+ vmovdqu %xmm5,240(%rsp)
+ movl 16(%rdi),%ecx /* lane 4 */
+ movq 0(%rdi),%r12
+ cmpl %edx,%ecx
+ movq 8(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 24(%rdi),%xmm6
+ movl %ecx,48(%rsp)
+ cmovleq %rsp,%r12
+ subq %r12,%rbp
+ movq %rbp,96(%rsp)
+ vmovdqu %xmm6,256(%rsp)
+ movl 56(%rdi),%ecx /* lane 5 */
+ movq 40(%rdi),%r13
+ cmpl %edx,%ecx
+ movq 48(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 64(%rdi),%xmm7
+ movl %ecx,52(%rsp)
+ cmovleq %rsp,%r13
+ subq %r13,%rbp
+ movq %rbp,104(%rsp)
+ vmovdqu %xmm7,272(%rsp)
+ movl 96(%rdi),%ecx /* lane 6 */
+ movq 80(%rdi),%r14
+ cmpl %edx,%ecx
+ movq 88(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 104(%rdi),%xmm8
+ movl %ecx,56(%rsp)
+ cmovleq %rsp,%r14
+ subq %r14,%rbp
+ movq %rbp,112(%rsp)
+ vmovdqu %xmm8,288(%rsp)
+ movl 136(%rdi),%ecx /* lane 7 */
+ movq 120(%rdi),%r15
+ cmpl %edx,%ecx
+ movq 128(%rdi),%rbp
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ vmovdqu 144(%rdi),%xmm9
+ movl %ecx,60(%rsp)
+ cmovleq %rsp,%r15
+ subq %r15,%rbp
+ movq %rbp,120(%rsp)
+ vmovdqu %xmm9,304(%rsp)
+ testl %edx,%edx
+ jz .Ldec8x_done /* no lane has a positive count; %rax still holds the entry %rsp here */
+
+ vmovups 16-120(%rsi),%xmm1 /* round key 1 */
+ vmovups 32-120(%rsi),%xmm0 /* round key 2 */
+ movl 240-120(%rsi),%eax /* round count */
+ leaq 192+128(%rsp),%rbp /* second chaining buffer (frame offset 320) */
+
+ vmovdqu (%r8),%xmm2 /* first ciphertext block of every lane */
+ vmovdqu (%r9),%xmm3
+ vmovdqu (%r10),%xmm4
+ vmovdqu (%r11),%xmm5
+ vmovdqu (%r12),%xmm6
+ vmovdqu (%r13),%xmm7
+ vmovdqu (%r14),%xmm8
+ vmovdqu (%r15),%xmm9
+ vmovdqu %xmm2,0(%rbp) /* keep the raw ciphertext: it chains into the second pass */
+ vpxor %xmm15,%xmm2,%xmm2 /* XOR with round key 0 before the vaesdec rounds */
+ vmovdqu %xmm3,16(%rbp)
+ vpxor %xmm15,%xmm3,%xmm3
+ vmovdqu %xmm4,32(%rbp)
+ vpxor %xmm15,%xmm4,%xmm4
+ vmovdqu %xmm5,48(%rbp)
+ vpxor %xmm15,%xmm5,%xmm5
+ vmovdqu %xmm6,64(%rbp)
+ vpxor %xmm15,%xmm6,%xmm6
+ vmovdqu %xmm7,80(%rbp)
+ vpxor %xmm15,%xmm7,%xmm7
+ vmovdqu %xmm8,96(%rbp)
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu %xmm9,112(%rbp)
+ vpxor %xmm15,%xmm9,%xmm9
+ xorq $0x80,%rbp /* switch to the buffer at frame offset 192, which holds the IVs */
+ movl $1,%ecx /* constant 1, compared against each lane counter inside the loop */
+ jmp .Loop_dec8x
+
+.align 32
+.Loop_dec8x: /* one pass = one block per lane; pointer bookkeeping for one lane is interleaved with each AES round */
+ vaesdec %xmm1,%xmm2,%xmm2
+ cmpl 32+0(%rsp),%ecx /* lane 0: flags = 1 vs remaining blocks; consumed by the two cmov below */
+ vaesdec %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r8)
+ vaesdec %xmm1,%xmm4,%xmm4
+ vaesdec %xmm1,%xmm5,%xmm5
+ leaq (%r8,%rbx,1),%rbx /* %rbx = output pointer (input + offset); lea leaves the flags alone */
+ cmovgeq %rsp,%r8 /* remaining <= 1: take the next input block from the stack */
+ vaesdec %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx /* remaining < 1: send the output to the stack as well */
+ vaesdec %xmm1,%xmm7,%xmm7
+ subq %r8,%rbx /* back to an offset relative to the possibly redirected input */
+ vaesdec %xmm1,%xmm8,%xmm8
+ vmovdqu 16(%r8),%xmm10 /* load the next ciphertext block for this lane */
+ movq %rbx,64+0(%rsp)
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups -72(%rsi),%xmm1 /* round key 3 */
+ leaq 16(%r8,%rbx,1),%r8 /* %r8 = output pointer + 16 until the tail stores through it */
+ vmovdqu %xmm10,128(%rsp) /* spill: xmm10 is reused for lane 4 further down */
+ vaesdec %xmm0,%xmm2,%xmm2
+ cmpl 32+4(%rsp),%ecx /* lane 1 */
+ movq 64+8(%rsp),%rbx
+ vaesdec %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r9)
+ vaesdec %xmm0,%xmm4,%xmm4
+ vaesdec %xmm0,%xmm5,%xmm5
+ leaq (%r9,%rbx,1),%rbx
+ cmovgeq %rsp,%r9
+ vaesdec %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm0,%xmm7,%xmm7
+ subq %r9,%rbx
+ vaesdec %xmm0,%xmm8,%xmm8
+ vmovdqu 16(%r9),%xmm11
+ movq %rbx,64+8(%rsp)
+ vaesdec %xmm0,%xmm9,%xmm9
+ vmovups -56(%rsi),%xmm0 /* round key 4 */
+ leaq 16(%r9,%rbx,1),%r9
+ vmovdqu %xmm11,144(%rsp)
+ vaesdec %xmm1,%xmm2,%xmm2
+ cmpl 32+8(%rsp),%ecx /* lane 2 */
+ movq 64+16(%rsp),%rbx
+ vaesdec %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r10)
+ vaesdec %xmm1,%xmm4,%xmm4
+ prefetcht0 15(%r8)
+ vaesdec %xmm1,%xmm5,%xmm5
+ leaq (%r10,%rbx,1),%rbx
+ cmovgeq %rsp,%r10
+ vaesdec %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm1,%xmm7,%xmm7
+ subq %r10,%rbx
+ vaesdec %xmm1,%xmm8,%xmm8
+ vmovdqu 16(%r10),%xmm12
+ movq %rbx,64+16(%rsp)
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups -40(%rsi),%xmm1 /* round key 5 */
+ leaq 16(%r10,%rbx,1),%r10
+ vmovdqu %xmm12,160(%rsp)
+ vaesdec %xmm0,%xmm2,%xmm2
+ cmpl 32+12(%rsp),%ecx /* lane 3 */
+ movq 64+24(%rsp),%rbx
+ vaesdec %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r11)
+ vaesdec %xmm0,%xmm4,%xmm4
+ prefetcht0 15(%r9)
+ vaesdec %xmm0,%xmm5,%xmm5
+ leaq (%r11,%rbx,1),%rbx
+ cmovgeq %rsp,%r11
+ vaesdec %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm0,%xmm7,%xmm7
+ subq %r11,%rbx
+ vaesdec %xmm0,%xmm8,%xmm8
+ vmovdqu 16(%r11),%xmm13
+ movq %rbx,64+24(%rsp)
+ vaesdec %xmm0,%xmm9,%xmm9
+ vmovups -24(%rsi),%xmm0 /* round key 6 */
+ leaq 16(%r11,%rbx,1),%r11
+ vmovdqu %xmm13,176(%rsp)
+ vaesdec %xmm1,%xmm2,%xmm2
+ cmpl 32+16(%rsp),%ecx /* lane 4; from here the next block stays in xmm10..xmm13 (no spill) */
+ movq 64+32(%rsp),%rbx
+ vaesdec %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r12)
+ vaesdec %xmm1,%xmm4,%xmm4
+ prefetcht0 15(%r10)
+ vaesdec %xmm1,%xmm5,%xmm5
+ leaq (%r12,%rbx,1),%rbx
+ cmovgeq %rsp,%r12
+ vaesdec %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm1,%xmm7,%xmm7
+ subq %r12,%rbx
+ vaesdec %xmm1,%xmm8,%xmm8
+ vmovdqu 16(%r12),%xmm10
+ movq %rbx,64+32(%rsp)
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups -8(%rsi),%xmm1 /* round key 7 */
+ leaq 16(%r12,%rbx,1),%r12
+ vaesdec %xmm0,%xmm2,%xmm2
+ cmpl 32+20(%rsp),%ecx /* lane 5 */
+ movq 64+40(%rsp),%rbx
+ vaesdec %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r13)
+ vaesdec %xmm0,%xmm4,%xmm4
+ prefetcht0 15(%r11)
+ vaesdec %xmm0,%xmm5,%xmm5
+ leaq (%rbx,%r13,1),%rbx
+ cmovgeq %rsp,%r13
+ vaesdec %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm0,%xmm7,%xmm7
+ subq %r13,%rbx
+ vaesdec %xmm0,%xmm8,%xmm8
+ vmovdqu 16(%r13),%xmm11
+ movq %rbx,64+40(%rsp)
+ vaesdec %xmm0,%xmm9,%xmm9
+ vmovups 8(%rsi),%xmm0 /* round key 8 */
+ leaq 16(%r13,%rbx,1),%r13
+ vaesdec %xmm1,%xmm2,%xmm2
+ cmpl 32+24(%rsp),%ecx /* lane 6 */
+ movq 64+48(%rsp),%rbx
+ vaesdec %xmm1,%xmm3,%xmm3
+ prefetcht0 31(%r14)
+ vaesdec %xmm1,%xmm4,%xmm4
+ prefetcht0 15(%r12)
+ vaesdec %xmm1,%xmm5,%xmm5
+ leaq (%r14,%rbx,1),%rbx
+ cmovgeq %rsp,%r14
+ vaesdec %xmm1,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm1,%xmm7,%xmm7
+ subq %r14,%rbx
+ vaesdec %xmm1,%xmm8,%xmm8
+ vmovdqu 16(%r14),%xmm12
+ movq %rbx,64+48(%rsp)
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups 24(%rsi),%xmm1 /* round key 9 */
+ leaq 16(%r14,%rbx,1),%r14
+ vaesdec %xmm0,%xmm2,%xmm2
+ cmpl 32+28(%rsp),%ecx /* lane 7 */
+ movq 64+56(%rsp),%rbx
+ vaesdec %xmm0,%xmm3,%xmm3
+ prefetcht0 31(%r15)
+ vaesdec %xmm0,%xmm4,%xmm4
+ prefetcht0 15(%r13)
+ vaesdec %xmm0,%xmm5,%xmm5
+ leaq (%r15,%rbx,1),%rbx
+ cmovgeq %rsp,%r15
+ vaesdec %xmm0,%xmm6,%xmm6
+ cmovgq %rsp,%rbx
+ vaesdec %xmm0,%xmm7,%xmm7
+ subq %r15,%rbx
+ vaesdec %xmm0,%xmm8,%xmm8
+ vmovdqu 16(%r15),%xmm13
+ movq %rbx,64+56(%rsp)
+ vaesdec %xmm0,%xmm9,%xmm9
+ vmovups 40(%rsi),%xmm0 /* round key 10 */
+ leaq 16(%r15,%rbx,1),%r15
+ vmovdqu 32(%rsp),%xmm14 /* counters of lanes 0-3, updated in the tail */
+ prefetcht0 15(%r14)
+ prefetcht0 15(%r15)
+ cmpl $11,%eax /* these flags also feed the je further down; nothing in between writes flags */
+ jb .Ldec8x_tail /* round count < 11: only the two tail rounds remain */
+/* Two extra rounds (keys 9 and 10); presumably the longer-key cases - the round-count encoding is set by key setup, not visible here. */
+ vaesdec %xmm1,%xmm2,%xmm2
+ vaesdec %xmm1,%xmm3,%xmm3
+ vaesdec %xmm1,%xmm4,%xmm4
+ vaesdec %xmm1,%xmm5,%xmm5
+ vaesdec %xmm1,%xmm6,%xmm6
+ vaesdec %xmm1,%xmm7,%xmm7
+ vaesdec %xmm1,%xmm8,%xmm8
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups 176-120(%rsi),%xmm1 /* round key 11 */
+
+ vaesdec %xmm0,%xmm2,%xmm2
+ vaesdec %xmm0,%xmm3,%xmm3
+ vaesdec %xmm0,%xmm4,%xmm4
+ vaesdec %xmm0,%xmm5,%xmm5
+ vaesdec %xmm0,%xmm6,%xmm6
+ vaesdec %xmm0,%xmm7,%xmm7
+ vaesdec %xmm0,%xmm8,%xmm8
+ vaesdec %xmm0,%xmm9,%xmm9
+ vmovups 192-120(%rsi),%xmm0 /* round key 12 */
+ je .Ldec8x_tail /* round count == 11: finish with keys 11 and 12 */
+/* Otherwise run two more rounds (keys 11 and 12) and finish with keys 13 and 14. */
+ vaesdec %xmm1,%xmm2,%xmm2
+ vaesdec %xmm1,%xmm3,%xmm3
+ vaesdec %xmm1,%xmm4,%xmm4
+ vaesdec %xmm1,%xmm5,%xmm5
+ vaesdec %xmm1,%xmm6,%xmm6
+ vaesdec %xmm1,%xmm7,%xmm7
+ vaesdec %xmm1,%xmm8,%xmm8
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups 208-120(%rsi),%xmm1 /* round key 13 */
+
+ vaesdec %xmm0,%xmm2,%xmm2
+ vaesdec %xmm0,%xmm3,%xmm3
+ vaesdec %xmm0,%xmm4,%xmm4
+ vaesdec %xmm0,%xmm5,%xmm5
+ vaesdec %xmm0,%xmm6,%xmm6
+ vaesdec %xmm0,%xmm7,%xmm7
+ vaesdec %xmm0,%xmm8,%xmm8
+ vaesdec %xmm0,%xmm9,%xmm9
+ vmovups 224-120(%rsi),%xmm0 /* round key 14 */
+
+.Ldec8x_tail: /* last vaesdec + vaesdeclast, counter update, CBC XOR, store, and set-up of the next pass */
+ vaesdec %xmm1,%xmm2,%xmm2
+ vpxor %xmm15,%xmm15,%xmm15 /* zero; round key 0 is reloaded into xmm15 further down */
+ vaesdec %xmm1,%xmm3,%xmm3
+ vaesdec %xmm1,%xmm4,%xmm4
+ vpcmpgtd %xmm15,%xmm14,%xmm15 /* -1 in every dword whose counter is > 0 */
+ vaesdec %xmm1,%xmm5,%xmm5
+ vaesdec %xmm1,%xmm6,%xmm6
+ vpaddd %xmm14,%xmm15,%xmm15 /* decrement only the positive counters (lanes 0-3) */
+ vmovdqu 48(%rsp),%xmm14 /* counters of lanes 4-7 */
+ vaesdec %xmm1,%xmm7,%xmm7
+ movq 64(%rsp),%rbx /* lane 0 offset, used for the pointer fix-up after the store */
+ vaesdec %xmm1,%xmm8,%xmm8
+ vaesdec %xmm1,%xmm9,%xmm9
+ vmovups 16-120(%rsi),%xmm1 /* reload round key 1 for the next pass */
+
+ vaesdeclast %xmm0,%xmm2,%xmm2
+ vmovdqa %xmm15,32(%rsp)
+ vpxor %xmm15,%xmm15,%xmm15
+ vaesdeclast %xmm0,%xmm3,%xmm3
+ vpxor 0(%rbp),%xmm2,%xmm2 /* CBC: XOR with the previous ciphertext block (the IV on the first pass) */
+ vaesdeclast %xmm0,%xmm4,%xmm4
+ vpxor 16(%rbp),%xmm3,%xmm3
+ vpcmpgtd %xmm15,%xmm14,%xmm15
+ vaesdeclast %xmm0,%xmm5,%xmm5
+ vpxor 32(%rbp),%xmm4,%xmm4
+ vaesdeclast %xmm0,%xmm6,%xmm6
+ vpxor 48(%rbp),%xmm5,%xmm5
+ vpaddd %xmm15,%xmm14,%xmm14 /* same conditional decrement for lanes 4-7 */
+ vmovdqu -120(%rsi),%xmm15 /* reload round key 0 */
+ vaesdeclast %xmm0,%xmm7,%xmm7
+ vpxor 64(%rbp),%xmm6,%xmm6
+ vaesdeclast %xmm0,%xmm8,%xmm8
+ vpxor 80(%rbp),%xmm7,%xmm7
+ vmovdqa %xmm14,48(%rsp)
+ vaesdeclast %xmm0,%xmm9,%xmm9
+ vpxor 96(%rbp),%xmm8,%xmm8
+ vmovups 32-120(%rsi),%xmm0 /* reload round key 2 */
+
+ vmovups %xmm2,-16(%r8) /* store lane 0 plaintext (%r8 is output + 16 at this point) */
+ subq %rbx,%r8 /* %r8 becomes the input pointer again, advanced by one block */
+ vmovdqu 128+0(%rsp),%xmm2 /* lane 0: the block fetched during this pass */
+ vpxor 112(%rbp),%xmm9,%xmm9
+ vmovups %xmm3,-16(%r9)
+ subq 72(%rsp),%r9
+ vmovdqu %xmm2,0(%rbp) /* overwrite the chaining slot just consumed; it is read again two passes from now */
+ vpxor %xmm15,%xmm2,%xmm2 /* start the next decryption: XOR with round key 0 */
+ vmovdqu 128+16(%rsp),%xmm3
+ vmovups %xmm4,-16(%r10)
+ subq 80(%rsp),%r10
+ vmovdqu %xmm3,16(%rbp)
+ vpxor %xmm15,%xmm3,%xmm3
+ vmovdqu 128+32(%rsp),%xmm4
+ vmovups %xmm5,-16(%r11)
+ subq 88(%rsp),%r11
+ vmovdqu %xmm4,32(%rbp)
+ vpxor %xmm15,%xmm4,%xmm4
+ vmovdqu 128+48(%rsp),%xmm5
+ vmovups %xmm6,-16(%r12)
+ subq 96(%rsp),%r12
+ vmovdqu %xmm5,48(%rbp)
+ vpxor %xmm15,%xmm5,%xmm5
+ vmovdqu %xmm10,64(%rbp) /* lanes 4-7 still have their next block in xmm10..xmm13 */
+ vpxor %xmm10,%xmm15,%xmm6
+ vmovups %xmm7,-16(%r13)
+ subq 104(%rsp),%r13
+ vmovdqu %xmm11,80(%rbp)
+ vpxor %xmm11,%xmm15,%xmm7
+ vmovups %xmm8,-16(%r14)
+ subq 112(%rsp),%r14
+ vmovdqu %xmm12,96(%rbp)
+ vpxor %xmm12,%xmm15,%xmm8
+ vmovups %xmm9,-16(%r15)
+ subq 120(%rsp),%r15
+ vmovdqu %xmm13,112(%rbp)
+ vpxor %xmm13,%xmm15,%xmm9
+
+ xorq $128,%rbp /* flip to the other chaining buffer (frame offsets 192 and 320) */
+ decl %edx /* %edx started as the largest lane count */
+ jnz .Loop_dec8x
+
+ movq 16(%rsp),%rax /* entry %rsp saved in the prologue */
+.cfi_def_cfa %rax,8
+
+
+
+
+
+.Ldec8x_done:
+ vzeroupper
+ movq -48(%rax),%r15 /* callee-saved registers were pushed just below the entry %rsp */
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp /* drop the whole frame by restoring the entry %rsp */
+.cfi_def_cfa_register %rsp
+.Ldec8x_epilogue:
+ .byte 0xf3,0xc3 /* encoding of "rep ret" */
+.cfi_endproc
+.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
diff --git a/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S b/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S
index 294db310a06a..38f306142c82 100644
--- a/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S
+++ b/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S
@@ -13,6 +13,11 @@ aesni_cbc_sha1_enc:
movq OPENSSL_ia32cap_P+4(%rip),%r11
btq $61,%r11
jc aesni_cbc_sha1_enc_shaext
+ andl $268435456,%r11d
+ andl $1073741824,%r10d
+ orl %r11d,%r10d
+ cmpl $1342177280,%r10d
+ je aesni_cbc_sha1_enc_avx
jmp aesni_cbc_sha1_enc_ssse3
.byte 0xf3,0xc3
.cfi_endproc
@@ -1394,6 +1399,1327 @@ aesni_cbc_sha1_enc_ssse3:
.byte 0xf3,0xc3
.cfi_endproc
.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+.type aesni_cbc_sha1_enc_avx,@function
+.align 32
+aesni_cbc_sha1_enc_avx:
+.cfi_startproc
+ movq 8(%rsp),%r10
+
+
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ leaq -104(%rsp),%rsp
+.cfi_adjust_cfa_offset 104
+
+
+ vzeroall
+ movq %rdi,%r12
+ movq %rsi,%r13
+ movq %rdx,%r14
+ leaq 112(%rcx),%r15
+ vmovdqu (%r8),%xmm12
+ movq %r8,88(%rsp)
+ shlq $6,%r14
+ subq %r12,%r13
+ movl 240-112(%r15),%r8d
+ addq %r10,%r14
+
+ leaq K_XX_XX(%rip),%r11
+ movl 0(%r9),%eax
+ movl 4(%r9),%ebx
+ movl 8(%r9),%ecx
+ movl 12(%r9),%edx
+ movl %ebx,%esi
+ movl 16(%r9),%ebp
+ movl %ecx,%edi
+ xorl %edx,%edi
+ andl %edi,%esi
+
+ vmovdqa 64(%r11),%xmm6
+ vmovdqa 0(%r11),%xmm10
+ vmovdqu 0(%r10),%xmm0
+ vmovdqu 16(%r10),%xmm1
+ vmovdqu 32(%r10),%xmm2
+ vmovdqu 48(%r10),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r10
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm10,%xmm0,%xmm4
+ vpaddd %xmm10,%xmm1,%xmm5
+ vpaddd %xmm10,%xmm2,%xmm6
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ vmovups -112(%r15),%xmm15
+ vmovups 16-112(%r15),%xmm14
+ jmp .Loop_avx
+.align 32
+.Loop_avx:
+ shrdl $2,%ebx,%ebx
+ vmovdqu 0(%r12),%xmm13
+ vpxor %xmm15,%xmm13,%xmm13
+ vpxor %xmm13,%xmm12,%xmm12
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -80(%r15),%xmm15
+ xorl %edx,%esi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ movl %eax,%edi
+ addl 0(%rsp),%ebp
+ vpaddd %xmm3,%xmm10,%xmm9
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrldq $4,%xmm3,%xmm8
+ addl %esi,%ebp
+ andl %ebx,%edi
+ vpxor %xmm0,%xmm4,%xmm4
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm2,%xmm8,%xmm8
+ shrdl $7,%eax,%eax
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 4(%rsp),%edx
+ vpxor %xmm8,%xmm4,%xmm4
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vmovdqa %xmm9,48(%rsp)
+ addl %edi,%edx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups -64(%r15),%xmm14
+ andl %eax,%esi
+ vpsrld $31,%xmm4,%xmm8
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%esi
+ vpslldq $12,%xmm4,%xmm9
+ vpaddd %xmm4,%xmm4,%xmm4
+ movl %edx,%edi
+ addl 8(%rsp),%ecx
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpor %xmm8,%xmm4,%xmm4
+ vpsrld $30,%xmm9,%xmm8
+ addl %esi,%ecx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpslld $2,%xmm9,%xmm9
+ vpxor %xmm8,%xmm4,%xmm4
+ shrdl $7,%edx,%edx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 12(%rsp),%ebx
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -48(%r15),%xmm15
+ vpxor %xmm9,%xmm4,%xmm4
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ andl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%esi
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ movl %ebx,%edi
+ addl 16(%rsp),%eax
+ vpaddd %xmm4,%xmm10,%xmm9
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrldq $4,%xmm4,%xmm8
+ addl %esi,%eax
+ andl %ecx,%edi
+ vpxor %xmm1,%xmm5,%xmm5
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm3,%xmm8,%xmm8
+ shrdl $7,%ebx,%ebx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups -32(%r15),%xmm14
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 20(%rsp),%ebp
+ vpxor %xmm8,%xmm5,%xmm5
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%ebp
+ andl %ebx,%esi
+ vpsrld $31,%xmm5,%xmm8
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ vpslldq $12,%xmm5,%xmm9
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %ebp,%edi
+ addl 24(%rsp),%edx
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vpor %xmm8,%xmm5,%xmm5
+ vpsrld $30,%xmm9,%xmm8
+ addl %esi,%edx
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -16(%r15),%xmm15
+ andl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm9,%xmm9
+ vpxor %xmm8,%xmm5,%xmm5
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%edi
+ movl %edx,%esi
+ addl 28(%rsp),%ecx
+ vpxor %xmm9,%xmm5,%xmm5
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vmovdqa 16(%r11),%xmm10
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ movl %ecx,%edi
+ addl 32(%rsp),%ebx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 0(%r15),%xmm14
+ vpaddd %xmm5,%xmm10,%xmm9
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ vpsrldq $4,%xmm5,%xmm8
+ addl %esi,%ebx
+ andl %edx,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ vpxor %xmm4,%xmm8,%xmm8
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ addl 36(%rsp),%eax
+ vpxor %xmm8,%xmm6,%xmm6
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vmovdqa %xmm9,16(%rsp)
+ addl %edi,%eax
+ andl %ecx,%esi
+ vpsrld $31,%xmm6,%xmm8
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ shrdl $7,%ebx,%ebx
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 16(%r15),%xmm15
+ xorl %edx,%esi
+ vpslldq $12,%xmm6,%xmm9
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%edi
+ addl 40(%rsp),%ebp
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpor %xmm8,%xmm6,%xmm6
+ vpsrld $30,%xmm9,%xmm8
+ addl %esi,%ebp
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpslld $2,%xmm9,%xmm9
+ vpxor %xmm8,%xmm6,%xmm6
+ shrdl $7,%eax,%eax
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 44(%rsp),%edx
+ vpxor %xmm9,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 32(%r15),%xmm14
+ andl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%esi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ movl %edx,%edi
+ addl 48(%rsp),%ecx
+ vpaddd %xmm6,%xmm10,%xmm9
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpsrldq $4,%xmm6,%xmm8
+ addl %esi,%ecx
+ andl %ebp,%edi
+ vpxor %xmm3,%xmm7,%xmm7
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpxor %xmm5,%xmm8,%xmm8
+ shrdl $7,%edx,%edx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 52(%rsp),%ebx
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 48(%r15),%xmm15
+ vpxor %xmm8,%xmm7,%xmm7
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%ebx
+ andl %edx,%esi
+ vpsrld $31,%xmm7,%xmm8
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%esi
+ vpslldq $12,%xmm7,%xmm9
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%edi
+ addl 56(%rsp),%eax
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpor %xmm8,%xmm7,%xmm7
+ vpsrld $30,%xmm9,%xmm8
+ addl %esi,%eax
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm9,%xmm9
+ vpxor %xmm8,%xmm7,%xmm7
+ shrdl $7,%ebx,%ebx
+ cmpl $11,%r8d
+ jb .Lvaesenclast6
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 64(%r15),%xmm14
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 80(%r15),%xmm15
+ je .Lvaesenclast6
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 96(%r15),%xmm14
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 112(%r15),%xmm15
+.Lvaesenclast6:
+ vaesenclast %xmm15,%xmm12,%xmm12
+ vmovups -112(%r15),%xmm15
+ vmovups 16-112(%r15),%xmm14
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 60(%rsp),%ebp
+ vpxor %xmm9,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ addl 0(%rsp),%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm7,%xmm10,%xmm9
+ addl %esi,%edx
+ vmovdqu 16(%r12),%xmm13
+ vpxor %xmm15,%xmm13,%xmm13
+ vmovups %xmm12,0(%r12,%r13,1)
+ vpxor %xmm13,%xmm12,%xmm12
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -80(%r15),%xmm15
+ andl %eax,%edi
+ vpxor %xmm8,%xmm0,%xmm0
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%edi
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ movl %edx,%esi
+ addl 4(%rsp),%ecx
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpslld $2,%xmm0,%xmm0
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ addl 8(%rsp),%ebx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups -64(%r15),%xmm14
+ vpor %xmm8,%xmm0,%xmm0
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -48(%r15),%xmm15
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ vpaddd %xmm0,%xmm10,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm8,%xmm1,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm1,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups -32(%r15),%xmm14
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpor %xmm8,%xmm1,%xmm1
+ addl 28(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ addl %esi,%eax
+ xorl %edx,%edi
+ vpaddd %xmm1,%xmm10,%xmm9
+ vmovdqa 32(%r11),%xmm10
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm8,%xmm2,%xmm2
+ addl 36(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -16(%r15),%xmm15
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpor %xmm8,%xmm2,%xmm2
+ addl 44(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 0(%r15),%xmm14
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ vpaddd %xmm2,%xmm10,%xmm9
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 16(%r15),%xmm15
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ vpxor %xmm0,%xmm4,%xmm4
+ addl 0(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %esi,%ecx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 32(%r15),%xmm14
+ xorl %eax,%edi
+ vpaddd %xmm3,%xmm10,%xmm9
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpxor %xmm8,%xmm4,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpor %xmm8,%xmm4,%xmm4
+ addl 12(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 48(%r15),%xmm15
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpxor %xmm6,%xmm5,%xmm5
+ addl %esi,%edx
+ xorl %ebx,%edi
+ vpaddd %xmm4,%xmm10,%xmm9
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpxor %xmm8,%xmm5,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%ecx
+ cmpl $11,%r8d
+ jb .Lvaesenclast7
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 64(%r15),%xmm14
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 80(%r15),%xmm15
+ je .Lvaesenclast7
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 96(%r15),%xmm14
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 112(%r15),%xmm15
+.Lvaesenclast7:
+ vaesenclast %xmm15,%xmm12,%xmm12
+ vmovups -112(%r15),%xmm15
+ vmovups 16-112(%r15),%xmm14
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpor %xmm8,%xmm5,%xmm5
+ addl 28(%rsp),%eax
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ vpxor %xmm2,%xmm6,%xmm6
+ addl 32(%rsp),%ebp
+ vmovdqu 32(%r12),%xmm13
+ vpxor %xmm15,%xmm13,%xmm13
+ vmovups %xmm12,16(%r13,%r12,1)
+ vpxor %xmm13,%xmm12,%xmm12
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -80(%r15),%xmm15
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %eax,%edi
+ xorl %ecx,%esi
+ vpaddd %xmm5,%xmm10,%xmm9
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ vpxor %xmm8,%xmm6,%xmm6
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 36(%rsp),%edx
+ vpsrld $30,%xmm6,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%esi
+ vpslld $2,%xmm6,%xmm6
+ xorl %ebx,%edi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups -64(%r15),%xmm14
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 40(%rsp),%ecx
+ andl %eax,%esi
+ vpor %xmm8,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%edi
+ xorl %eax,%esi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 44(%rsp),%ebx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -48(%r15),%xmm15
+ movl %ecx,%esi
+ xorl %ebp,%edi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ vpxor %xmm3,%xmm7,%xmm7
+ addl 48(%rsp),%eax
+ andl %edx,%esi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ vpxor %xmm0,%xmm7,%xmm7
+ movl %ebx,%edi
+ xorl %edx,%esi
+ vpaddd %xmm6,%xmm10,%xmm9
+ vmovdqa 48(%r11),%xmm10
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vpxor %xmm8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%rsp),%ebp
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups -32(%r15),%xmm14
+ vpsrld $30,%xmm7,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ vpslld $2,%xmm7,%xmm7
+ xorl %ecx,%edi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 56(%rsp),%edx
+ andl %ebx,%esi
+ vpor %xmm8,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -16(%r15),%xmm15
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 60(%rsp),%ecx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%esi
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ addl 0(%rsp),%ebx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 0(%r15),%xmm14
+ vpxor %xmm1,%xmm0,%xmm0
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ vpaddd %xmm7,%xmm10,%xmm9
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ vpxor %xmm8,%xmm0,%xmm0
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 4(%rsp),%eax
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ andl %edx,%edi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ vpslld $2,%xmm0,%xmm0
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 16(%r15),%xmm15
+ andl %ecx,%esi
+ vpor %xmm8,%xmm0,%xmm0
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%edi
+ xorl %ecx,%esi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 12(%rsp),%edx
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%esi
+ xorl %ebx,%edi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 32(%r15),%xmm14
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ vpxor %xmm2,%xmm1,%xmm1
+ movl %edx,%edi
+ xorl %eax,%esi
+ vpaddd %xmm0,%xmm10,%xmm9
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vpxor %xmm8,%xmm1,%xmm1
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 20(%rsp),%ebx
+ vpsrld $30,%xmm1,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 48(%r15),%xmm15
+ movl %ecx,%esi
+ vpslld $2,%xmm1,%xmm1
+ xorl %ebp,%edi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 24(%rsp),%eax
+ andl %edx,%esi
+ vpor %xmm8,%xmm1,%xmm1
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%edi
+ xorl %edx,%esi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%rsp),%ebp
+ cmpl $11,%r8d
+ jb .Lvaesenclast8
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 64(%r15),%xmm14
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 80(%r15),%xmm15
+ je .Lvaesenclast8
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 96(%r15),%xmm14
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 112(%r15),%xmm15
+.Lvaesenclast8:
+ vaesenclast %xmm15,%xmm12,%xmm12
+ vmovups -112(%r15),%xmm15
+ vmovups 16-112(%r15),%xmm14
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ xorl %ecx,%edi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vpxor %xmm3,%xmm2,%xmm2
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ vpaddd %xmm1,%xmm10,%xmm9
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ vmovdqu 48(%r12),%xmm13
+ vpxor %xmm15,%xmm13,%xmm13
+ vmovups %xmm12,32(%r13,%r12,1)
+ vpxor %xmm13,%xmm12,%xmm12
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -80(%r15),%xmm15
+ vpxor %xmm8,%xmm2,%xmm2
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 36(%rsp),%ecx
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ andl %eax,%edi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%esi
+ vpslld $2,%xmm2,%xmm2
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 40(%rsp),%ebx
+ andl %ebp,%esi
+ vpor %xmm8,%xmm2,%xmm2
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups -64(%r15),%xmm14
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 44(%rsp),%eax
+ andl %edx,%edi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -48(%r15),%xmm15
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ vpaddd %xmm2,%xmm10,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups -32(%r15),%xmm14
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 0(%rsp),%eax
+ vpaddd %xmm3,%xmm10,%xmm9
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vmovdqa %xmm9,48(%rsp)
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 4(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups -16(%r15),%xmm15
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 8(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 12(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 0(%r15),%xmm14
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ cmpq %r14,%r10
+ je .Ldone_avx
+ vmovdqa 64(%r11),%xmm9
+ vmovdqa 0(%r11),%xmm10
+ vmovdqu 0(%r10),%xmm0
+ vmovdqu 16(%r10),%xmm1
+ vmovdqu 32(%r10),%xmm2
+ vmovdqu 48(%r10),%xmm3
+ vpshufb %xmm9,%xmm0,%xmm0
+ addq $64,%r10
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ vpshufb %xmm9,%xmm1,%xmm1
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm10,%xmm0,%xmm8
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vmovdqa %xmm8,0(%rsp)
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 16(%r15),%xmm15
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ vpshufb %xmm9,%xmm2,%xmm2
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpaddd %xmm10,%xmm1,%xmm8
+ addl %esi,%ecx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 32(%r15),%xmm14
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vmovdqa %xmm8,16(%rsp)
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 48(%r15),%xmm15
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ vpshufb %xmm9,%xmm3,%xmm3
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm10,%xmm2,%xmm8
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vmovdqa %xmm8,32(%rsp)
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ cmpl $11,%r8d
+ jb .Lvaesenclast9
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 64(%r15),%xmm14
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 80(%r15),%xmm15
+ je .Lvaesenclast9
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 96(%r15),%xmm14
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 112(%r15),%xmm15
+.Lvaesenclast9:
+ vaesenclast %xmm15,%xmm12,%xmm12
+ vmovups -112(%r15),%xmm15
+ vmovups 16-112(%r15),%xmm14
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vmovups %xmm12,48(%r13,%r12,1)
+ leaq 64(%r12),%r12
+
+ addl 0(%r9),%eax
+ addl 4(%r9),%esi
+ addl 8(%r9),%ecx
+ addl 12(%r9),%edx
+ movl %eax,0(%r9)
+ addl 16(%r9),%ebp
+ movl %esi,4(%r9)
+ movl %esi,%ebx
+ movl %ecx,8(%r9)
+ movl %ecx,%edi
+ movl %edx,12(%r9)
+ xorl %edx,%edi
+ movl %ebp,16(%r9)
+ andl %edi,%esi
+ jmp .Loop_avx
+
+.Ldone_avx:
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 16(%r15),%xmm15
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 32(%r15),%xmm14
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 48(%r15),%xmm15
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ cmpl $11,%r8d
+ jb .Lvaesenclast10
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 64(%r15),%xmm14
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 80(%r15),%xmm15
+ je .Lvaesenclast10
+ vaesenc %xmm15,%xmm12,%xmm12
+ vmovups 96(%r15),%xmm14
+ vaesenc %xmm14,%xmm12,%xmm12
+ vmovups 112(%r15),%xmm15
+.Lvaesenclast10:
+ vaesenclast %xmm15,%xmm12,%xmm12
+ vmovups -112(%r15),%xmm15
+ vmovups 16-112(%r15),%xmm14
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vmovups %xmm12,48(%r13,%r12,1)
+ movq 88(%rsp),%r8
+
+ addl 0(%r9),%eax
+ addl 4(%r9),%esi
+ addl 8(%r9),%ecx
+ movl %eax,0(%r9)
+ addl 12(%r9),%edx
+ movl %esi,4(%r9)
+ addl 16(%r9),%ebp
+ movl %ecx,8(%r9)
+ movl %edx,12(%r9)
+ movl %ebp,16(%r9)
+ vmovups %xmm12,(%r8)
+ vzeroall
+ leaq 104(%rsp),%rsi
+.cfi_def_cfa %rsi,56
+ movq 0(%rsi),%r15
+.cfi_restore %r15
+ movq 8(%rsi),%r14
+.cfi_restore %r14
+ movq 16(%rsi),%r13
+.cfi_restore %r13
+ movq 24(%rsi),%r12
+.cfi_restore %r12
+ movq 32(%rsi),%rbp
+.cfi_restore %rbp
+ movq 40(%rsi),%rbx
+.cfi_restore %rbx
+ leaq 48(%rsi),%rsp
+.cfi_def_cfa %rsp,8
+.Lepilogue_avx:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
.align 64
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
@@ -1485,17 +2811,17 @@ aesni_cbc_sha1_enc_shaext:
pxor %xmm3,%xmm5
.byte 15,56,201,243
cmpl $11,%r11d
- jb .Laesenclast6
+ jb .Laesenclast11
movups 64(%rcx),%xmm0
.byte 102,15,56,220,209
movups 80(%rcx),%xmm1
.byte 102,15,56,220,208
- je .Laesenclast6
+ je .Laesenclast11
movups 96(%rcx),%xmm0
.byte 102,15,56,220,209
movups 112(%rcx),%xmm1
.byte 102,15,56,220,208
-.Laesenclast6:
+.Laesenclast11:
.byte 102,15,56,221,209
movups 16-112(%rcx),%xmm0
movdqa %xmm8,%xmm10
@@ -1551,17 +2877,17 @@ aesni_cbc_sha1_enc_shaext:
pxor %xmm4,%xmm6
.byte 15,56,201,220
cmpl $11,%r11d
- jb .Laesenclast7
+ jb .Laesenclast12
movups 64(%rcx),%xmm0
.byte 102,15,56,220,209
movups 80(%rcx),%xmm1
.byte 102,15,56,220,208
- je .Laesenclast7
+ je .Laesenclast12
movups 96(%rcx),%xmm0
.byte 102,15,56,220,209
movups 112(%rcx),%xmm1
.byte 102,15,56,220,208
-.Laesenclast7:
+.Laesenclast12:
.byte 102,15,56,221,209
movups 16-112(%rcx),%xmm0
movdqa %xmm8,%xmm9
@@ -1617,17 +2943,17 @@ aesni_cbc_sha1_enc_shaext:
pxor %xmm5,%xmm3
.byte 15,56,201,229
cmpl $11,%r11d
- jb .Laesenclast8
+ jb .Laesenclast13
movups 64(%rcx),%xmm0
.byte 102,15,56,220,209
movups 80(%rcx),%xmm1
.byte 102,15,56,220,208
- je .Laesenclast8
+ je .Laesenclast13
movups 96(%rcx),%xmm0
.byte 102,15,56,220,209
movups 112(%rcx),%xmm1
.byte 102,15,56,220,208
-.Laesenclast8:
+.Laesenclast13:
.byte 102,15,56,221,209
movups 16-112(%rcx),%xmm0
movdqa %xmm8,%xmm10
@@ -1681,17 +3007,17 @@ aesni_cbc_sha1_enc_shaext:
movups 48(%rcx),%xmm1
.byte 102,15,56,220,208
cmpl $11,%r11d
- jb .Laesenclast9
+ jb .Laesenclast14
movups 64(%rcx),%xmm0
.byte 102,15,56,220,209
movups 80(%rcx),%xmm1
.byte 102,15,56,220,208
- je .Laesenclast9
+ je .Laesenclast14
movups 96(%rcx),%xmm0
.byte 102,15,56,220,209
movups 112(%rcx),%xmm1
.byte 102,15,56,220,208
-.Laesenclast9:
+.Laesenclast14:
.byte 102,15,56,221,209
movups 16-112(%rcx),%xmm0
decq %rdx
diff --git a/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S b/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S
index e42a02ebe647..cb9e150db553 100644
--- a/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S
+++ b/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S
@@ -8,6 +8,25 @@
.align 16
aesni_cbc_sha256_enc:
.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%r11
+ movl $1,%eax
+ cmpq $0,%rdi
+ je .Lprobe
+ movl 0(%r11),%eax
+ movq 4(%r11),%r10
+ btq $61,%r10
+ jc aesni_cbc_sha256_enc_shaext
+ movq %r10,%r11
+ shrq $32,%r11
+
+ testl $2048,%r10d
+ jnz aesni_cbc_sha256_enc_xop
+ andl $296,%r11d
+ cmpl $296,%r11d
+ je aesni_cbc_sha256_enc_avx2
+ andl $268435456,%r10d
+ jnz aesni_cbc_sha256_enc_avx
+ ud2
xorl %eax,%eax
cmpq $0,%rdi
je .Lprobe
@@ -59,3 +78,4360 @@ K256:
.long 0,0,0,0, 0,0,0,0
.byte 65,69,83,78,73,45,67,66,67,43,83,72,65,50,53,54,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
+.type aesni_cbc_sha256_enc_xop,@function
+.align 64
+aesni_cbc_sha256_enc_xop:
+.cfi_startproc
+.Lxop_shortcut:
+ movq 8(%rsp),%r10
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ subq $128,%rsp
+ andq $-64,%rsp
+
+ shlq $6,%rdx
+ subq %rdi,%rsi
+ subq %rdi,%r10
+ addq %rdi,%rdx
+
+
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+
+ movq %r8,64+32(%rsp)
+ movq %r9,64+40(%rsp)
+ movq %r10,64+48(%rsp)
+ movq %rax,120(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
+.Lprologue_xop:
+ vzeroall
+
+ movq %rdi,%r12
+ leaq 128(%rcx),%rdi
+ leaq K256+544(%rip),%r13
+ movl 240-128(%rdi),%r14d
+ movq %r9,%r15
+ movq %r10,%rsi
+ vmovdqu (%r8),%xmm8
+ subq $9,%r14
+
+ movl 0(%r15),%eax
+ movl 4(%r15),%ebx
+ movl 8(%r15),%ecx
+ movl 12(%r15),%edx
+ movl 16(%r15),%r8d
+ movl 20(%r15),%r9d
+ movl 24(%r15),%r10d
+ movl 28(%r15),%r11d
+
+ vmovdqa 0(%r13,%r14,8),%xmm14
+ vmovdqa 16(%r13,%r14,8),%xmm13
+ vmovdqa 32(%r13,%r14,8),%xmm12
+ vmovdqu 0-128(%rdi),%xmm10
+ jmp .Lloop_xop
+.align 16
+.Lloop_xop:
+ vmovdqa K256+512(%rip),%xmm7
+ vmovdqu 0(%rsi,%r12,1),%xmm0
+ vmovdqu 16(%rsi,%r12,1),%xmm1
+ vmovdqu 32(%rsi,%r12,1),%xmm2
+ vmovdqu 48(%rsi,%r12,1),%xmm3
+ vpshufb %xmm7,%xmm0,%xmm0
+ leaq K256(%rip),%rbp
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd 0(%rbp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 32(%rbp),%xmm1,%xmm5
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ vpaddd 96(%rbp),%xmm3,%xmm7
+ vmovdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ vmovdqa %xmm5,16(%rsp)
+ movl %ebx,%esi
+ vmovdqa %xmm6,32(%rsp)
+ xorl %ecx,%esi
+ vmovdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp .Lxop_00_47
+
+.align 16
+.Lxop_00_47:
+ subq $-32*4,%rbp
+ vmovdqu (%r12),%xmm9
+ movq %r12,64+0(%rsp)
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ rorl $14,%r13d
+ movl %r14d,%eax
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ movl %r9d,%r12d
+ xorl %r8d,%r13d
+.byte 143,232,120,194,236,14
+ rorl $9,%r14d
+ xorl %r10d,%r12d
+ vpsrld $3,%xmm4,%xmm4
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ vpaddd %xmm7,%xmm0,%xmm0
+ andl %r8d,%r12d
+ vpxor %xmm10,%xmm9,%xmm9
+ vmovdqu 16-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+.byte 143,232,120,194,245,11
+ rorl $11,%r14d
+ xorl %r10d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+.byte 143,232,120,194,251,13
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ebx,%esi
+ addl %r11d,%edx
+ vpsrld $10,%xmm3,%xmm6
+ rorl $2,%r14d
+ addl %esi,%r11d
+ vpaddd %xmm4,%xmm0,%xmm0
+ movl %edx,%r13d
+ addl %r11d,%r14d
+.byte 143,232,120,194,239,2
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vpxor %xmm8,%xmm9,%xmm9
+ xorl %edx,%r13d
+ vpsrldq $8,%xmm7,%xmm7
+ addl 4(%rsp),%r10d
+ movl %r11d,%esi
+ rorl $11,%r14d
+ xorl %r9d,%r12d
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %eax,%esi
+ rorl $6,%r13d
+ addl %r12d,%r10d
+ andl %esi,%r15d
+.byte 143,232,120,194,248,13
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ vpsrld $10,%xmm0,%xmm6
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+.byte 143,232,120,194,239,2
+ rorl $2,%r14d
+ addl %r15d,%r10d
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm5,%xmm7,%xmm7
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r12d
+ vpslldq $8,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ vpaddd %xmm7,%xmm0,%xmm0
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ rorl $11,%r14d
+ xorl %r8d,%r12d
+ vpaddd 0(%rbp),%xmm0,%xmm6
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r9d
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ rorl $2,%r14d
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%esi
+ rorl $11,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ rorl $6,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ rorl $2,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,0(%rsp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ movl %ebx,%r12d
+ xorl %eax,%r13d
+.byte 143,232,120,194,236,14
+ rorl $9,%r14d
+ xorl %ecx,%r12d
+ vpsrld $3,%xmm4,%xmm4
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ vpaddd %xmm7,%xmm1,%xmm1
+ andl %eax,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+.byte 143,232,120,194,245,11
+ rorl $11,%r14d
+ xorl %ecx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+.byte 143,232,120,194,248,13
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r9d,%esi
+ addl %edx,%r11d
+ vpsrld $10,%xmm0,%xmm6
+ rorl $2,%r14d
+ addl %esi,%edx
+ vpaddd %xmm4,%xmm1,%xmm1
+ movl %r11d,%r13d
+ addl %edx,%r14d
+.byte 143,232,120,194,239,2
+ rorl $14,%r13d
+ movl %r14d,%edx
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ vpsrldq $8,%xmm7,%xmm7
+ addl 20(%rsp),%ecx
+ movl %edx,%esi
+ rorl $11,%r14d
+ xorl %ebx,%r12d
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %r8d,%esi
+ rorl $6,%r13d
+ addl %r12d,%ecx
+ andl %esi,%r15d
+.byte 143,232,120,194,249,13
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ vpsrld $10,%xmm1,%xmm6
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+.byte 143,232,120,194,239,2
+ rorl $2,%r14d
+ addl %r15d,%ecx
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm5,%xmm7,%xmm7
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r12d
+ vpslldq $8,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ vpaddd %xmm7,%xmm1,%xmm1
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ rorl $11,%r14d
+ xorl %eax,%r12d
+ vpaddd 32(%rbp),%xmm1,%xmm6
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%ebx
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ rorl $2,%r14d
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%esi
+ rorl $11,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ rorl $6,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ rorl $2,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,16(%rsp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ rorl $14,%r13d
+ movl %r14d,%eax
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ movl %r9d,%r12d
+ xorl %r8d,%r13d
+.byte 143,232,120,194,236,14
+ rorl $9,%r14d
+ xorl %r10d,%r12d
+ vpsrld $3,%xmm4,%xmm4
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ vpaddd %xmm7,%xmm2,%xmm2
+ andl %r8d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+.byte 143,232,120,194,245,11
+ rorl $11,%r14d
+ xorl %r10d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+.byte 143,232,120,194,249,13
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ebx,%esi
+ addl %r11d,%edx
+ vpsrld $10,%xmm1,%xmm6
+ rorl $2,%r14d
+ addl %esi,%r11d
+ vpaddd %xmm4,%xmm2,%xmm2
+ movl %edx,%r13d
+ addl %r11d,%r14d
+.byte 143,232,120,194,239,2
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %edx,%r13d
+ vpsrldq $8,%xmm7,%xmm7
+ addl 36(%rsp),%r10d
+ movl %r11d,%esi
+ rorl $11,%r14d
+ xorl %r9d,%r12d
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %eax,%esi
+ rorl $6,%r13d
+ addl %r12d,%r10d
+ andl %esi,%r15d
+.byte 143,232,120,194,250,13
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ vpsrld $10,%xmm2,%xmm6
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+.byte 143,232,120,194,239,2
+ rorl $2,%r14d
+ addl %r15d,%r10d
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm5,%xmm7,%xmm7
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r12d
+ vpslldq $8,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ vpaddd %xmm7,%xmm2,%xmm2
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ rorl $11,%r14d
+ xorl %r8d,%r12d
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r9d
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ rorl $2,%r14d
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%esi
+ rorl $11,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ rorl $6,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ rorl $2,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,32(%rsp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ movl %ebx,%r12d
+ xorl %eax,%r13d
+.byte 143,232,120,194,236,14
+ rorl $9,%r14d
+ xorl %ecx,%r12d
+ vpsrld $3,%xmm4,%xmm4
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ vpaddd %xmm7,%xmm3,%xmm3
+ andl %eax,%r12d
+ vpand %xmm12,%xmm11,%xmm8
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+.byte 143,232,120,194,245,11
+ rorl $11,%r14d
+ xorl %ecx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+.byte 143,232,120,194,250,13
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r9d,%esi
+ addl %edx,%r11d
+ vpsrld $10,%xmm2,%xmm6
+ rorl $2,%r14d
+ addl %esi,%edx
+ vpaddd %xmm4,%xmm3,%xmm3
+ movl %r11d,%r13d
+ addl %edx,%r14d
+.byte 143,232,120,194,239,2
+ rorl $14,%r13d
+ movl %r14d,%edx
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ vpsrldq $8,%xmm7,%xmm7
+ addl 52(%rsp),%ecx
+ movl %edx,%esi
+ rorl $11,%r14d
+ xorl %ebx,%r12d
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %r8d,%esi
+ rorl $6,%r13d
+ addl %r12d,%ecx
+ andl %esi,%r15d
+.byte 143,232,120,194,251,13
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ vpsrld $10,%xmm3,%xmm6
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+.byte 143,232,120,194,239,2
+ rorl $2,%r14d
+ addl %r15d,%ecx
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm5,%xmm7,%xmm7
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r12d
+ vpslldq $8,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ vpand %xmm13,%xmm11,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ vpaddd %xmm7,%xmm3,%xmm3
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ rorl $11,%r14d
+ xorl %eax,%r12d
+ vpaddd 96(%rbp),%xmm3,%xmm6
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%ebx
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ rorl $2,%r14d
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpor %xmm11,%xmm8,%xmm8
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vmovdqu 0-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%esi
+ rorl $11,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ rorl $6,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ rorl $2,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,48(%rsp)
+ movq 64+0(%rsp),%r12
+ vpand %xmm14,%xmm11,%xmm11
+ movq 64+8(%rsp),%r15
+ vpor %xmm11,%xmm8,%xmm8
+ vmovdqu %xmm8,(%r15,%r12,1)
+ leaq 16(%r12),%r12
+ cmpb $0,131(%rbp)
+ jne .Lxop_00_47
+ vmovdqu (%r12),%xmm9
+ movq %r12,64+0(%rsp)
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpxor %xmm10,%xmm9,%xmm9
+ vmovdqu 16-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ rorl $11,%r14d
+ xorl %r10d,%r12d
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%esi
+ addl %r11d,%edx
+ rorl $2,%r14d
+ addl %esi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vpxor %xmm8,%xmm9,%xmm9
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%esi
+ rorl $11,%r14d
+ xorl %r9d,%r12d
+ xorl %eax,%esi
+ rorl $6,%r13d
+ addl %r12d,%r10d
+ andl %esi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+ rorl $2,%r14d
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ rorl $11,%r14d
+ xorl %r8d,%r12d
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r9d
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ rorl $2,%r14d
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%esi
+ rorl $11,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ rorl $6,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ rorl $2,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ rorl $11,%r14d
+ xorl %ecx,%r12d
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%esi
+ addl %edx,%r11d
+ rorl $2,%r14d
+ addl %esi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%esi
+ rorl $11,%r14d
+ xorl %ebx,%r12d
+ xorl %r8d,%esi
+ rorl $6,%r13d
+ addl %r12d,%ecx
+ andl %esi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+ rorl $2,%r14d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ rorl $11,%r14d
+ xorl %eax,%r12d
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%ebx
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ rorl $2,%r14d
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%esi
+ rorl $11,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ rorl $6,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ rorl $2,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ rorl $11,%r14d
+ xorl %r10d,%r12d
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%esi
+ addl %r11d,%edx
+ rorl $2,%r14d
+ addl %esi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%esi
+ rorl $11,%r14d
+ xorl %r9d,%r12d
+ xorl %eax,%esi
+ rorl $6,%r13d
+ addl %r12d,%r10d
+ andl %esi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+ rorl $2,%r14d
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ rorl $11,%r14d
+ xorl %r8d,%r12d
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r9d
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ rorl $2,%r14d
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%esi
+ rorl $11,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ rorl $6,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ rorl $2,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpand %xmm12,%xmm11,%xmm8
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ rorl $11,%r14d
+ xorl %ecx,%r12d
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%esi
+ addl %edx,%r11d
+ rorl $2,%r14d
+ addl %esi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%esi
+ rorl $11,%r14d
+ xorl %ebx,%r12d
+ xorl %r8d,%esi
+ rorl $6,%r13d
+ addl %r12d,%ecx
+ andl %esi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+ rorl $2,%r14d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ vpand %xmm13,%xmm11,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ rorl $11,%r14d
+ xorl %eax,%r12d
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%ebx
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ rorl $2,%r14d
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpor %xmm11,%xmm8,%xmm8
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vmovdqu 0-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%esi
+ rorl $11,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ rorl $6,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ rorl $2,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%r12
+ movq 64+8(%rsp),%r13
+ movq 64+40(%rsp),%r15
+ movq 64+48(%rsp),%rsi
+
+ vpand %xmm14,%xmm11,%xmm11
+ movl %r14d,%eax
+ vpor %xmm11,%xmm8,%xmm8
+ vmovdqu %xmm8,(%r12,%r13,1)
+ leaq 16(%r12),%r12
+
+ addl 0(%r15),%eax
+ addl 4(%r15),%ebx
+ addl 8(%r15),%ecx
+ addl 12(%r15),%edx
+ addl 16(%r15),%r8d
+ addl 20(%r15),%r9d
+ addl 24(%r15),%r10d
+ addl 28(%r15),%r11d
+
+ cmpq 64+16(%rsp),%r12
+
+ movl %eax,0(%r15)
+ movl %ebx,4(%r15)
+ movl %ecx,8(%r15)
+ movl %edx,12(%r15)
+ movl %r8d,16(%r15)
+ movl %r9d,20(%r15)
+ movl %r10d,24(%r15)
+ movl %r11d,28(%r15)
+
+ jb .Lloop_xop
+
+ movq 64+32(%rsp),%r8
+ movq 120(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ vmovdqu %xmm8,(%r8)
+ vzeroall
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_xop:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_cbc_sha256_enc_xop,.-aesni_cbc_sha256_enc_xop
+.type aesni_cbc_sha256_enc_avx,@function /* AVX code path: the body interleaves an AES encryption chain (vaesenc/vaesenclast) with scalar 32-bit hash rounds. */
+.align 64
+aesni_cbc_sha256_enc_avx: /* Inputs as used by the code: rdi = input pointer, rsi = output pointer, rdx = count scaled by 64, rcx = key structure, r8 = pointer to a 16-byte chaining block, r9 = pointer to eight 32-bit state words, 8(%rsp) = seventh argument. Roles are read off the instructions; confirm names against the C prototype. */
+.cfi_startproc
+.Lavx_shortcut:
+ movq 8(%rsp),%r10 /* r10 = seventh argument, read from the stack just above the return address */
+ movq %rsp,%rax /* rax = entry rsp; stored at 120(%rsp) below so the epilogue can unwind */
+.cfi_def_cfa_register %rax
+ pushq %rbx /* save the callee-saved registers rbx, rbp, r12-r15 */
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ subq $128,%rsp /* reserve 128 bytes: 0..63 hold spilled round sums, 64..127 hold saved pointers */
+ andq $-64,%rsp /* round rsp down to a 64-byte boundary (vmovdqa is used on this area later) */
+
+ shlq $6,%rdx /* rdx = rdx * 64 */
+ subq %rdi,%rsi /* rsi = rsi - rdi: output kept as an offset from the input pointer */
+ subq %rdi,%r10 /* r10 = r10 - rdi: seventh argument kept as an offset from the input pointer */
+ addq %rdi,%rdx /* rdx = rdi + 64*count: end address compared against the input pointer by the outer loop */
+
+
+ movq %rsi,64+8(%rsp) /* slot 64+8: output-minus-input offset */
+ movq %rdx,64+16(%rsp) /* slot 64+16: end address */
+
+ movq %r8,64+32(%rsp) /* slot 64+32: r8 argument (chaining-block pointer, reloaded in the epilogue) */
+ movq %r9,64+40(%rsp) /* slot 64+40: r9 argument (state-word pointer) */
+ movq %r10,64+48(%rsp) /* slot 64+48: seventh-argument-minus-input offset */
+ movq %rax,120(%rsp) /* slot 120: entry rsp */
+.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 /* DW_CFA_def_cfa_expression, 6 bytes: breg7(rsp)+120, deref, plus_uconst 8, i.e. CFA = *(rsp+120) + 8 */
+.Lprologue_avx: /* One-time register setup before the outer loop: key base, mask rows, chaining block and the eight state words. */
+ vzeroall /* clear all vector registers */
+
+ movq %rdi,%r12 /* r12 = running input pointer (first argument) */
+ leaq 128(%rcx),%rdi /* rdi = rcx + 128; round keys are addressed below as N-128(%rdi), i.e. offset N from rcx */
+ leaq K256+544(%rip),%r13 /* r13 = address K256+544 (table contents are defined outside this chunk) */
+ movl 240-128(%rdi),%r14d /* r14d = 32-bit field at offset 240 of the rcx structure; presumably the AES round count - confirm */
+ movq %r9,%r15 /* r15 = pointer to the eight 32-bit state words */
+ movq %r10,%rsi /* rsi = offset held in r10 on arrival at this label */
+ vmovdqu (%r8),%xmm8 /* xmm8 = 16-byte chaining block at (r8); XORed into each input block before encryption */
+ subq $9,%r14 /* r14 = field - 9, used as an index scaled by 8 into the rows at r13 */
+
+ movl 0(%r15),%eax /* load the eight state words into eax, ebx, ecx, edx, r8d, r9d, r10d, r11d */
+ movl 4(%r15),%ebx
+ movl 8(%r15),%ecx
+ movl 12(%r15),%edx
+ movl 16(%r15),%r8d
+ movl 20(%r15),%r9d
+ movl 24(%r15),%r10d
+ movl 28(%r15),%r11d
+
+ vmovdqa 0(%r13,%r14,8),%xmm14 /* xmm14, xmm13, xmm12 = three consecutive 16-byte rows; each is ANDed with one of the three vaesenclast results later */
+ vmovdqa 16(%r13,%r14,8),%xmm13
+ vmovdqa 32(%r13,%r14,8),%xmm12
+ vmovdqu 0-128(%rdi),%xmm10 /* xmm10 = round key at offset 0 of the key structure */
+ jmp .Lloop_avx
+.align 16
+.Lloop_avx: /* Outer loop head: load 64 bytes from rsi+r12, shuffle them, add the first four K256 rows and spill the sums to 0..63(%rsp) for the scalar rounds. */
+ vmovdqa K256+512(%rip),%xmm7 /* xmm7 = vpshufb control read from K256+512; presumably a byte-order swap - table not visible here */
+ vmovdqu 0(%rsi,%r12,1),%xmm0 /* xmm0..xmm3 = 64 bytes at address rsi + r12 */
+ vmovdqu 16(%rsi,%r12,1),%xmm1
+ vmovdqu 32(%rsi,%r12,1),%xmm2
+ vmovdqu 48(%rsi,%r12,1),%xmm3
+ vpshufb %xmm7,%xmm0,%xmm0 /* permute the bytes of each 16-byte group with the control in xmm7 */
+ leaq K256(%rip),%rbp /* rbp = start of the K256 table */
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd 0(%rbp),%xmm0,%xmm4 /* xmm4..xmm7 = shuffled words + table rows at offsets 0, 32, 64, 96 (xmm7 is overwritten last) */
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 32(%rbp),%xmm1,%xmm5
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ vpaddd 96(%rbp),%xmm3,%xmm7
+ vmovdqa %xmm4,0(%rsp) /* spill the four sums to 0, 16, 32 and 48(%rsp) */
+ movl %eax,%r14d /* r14d = copy of eax; the rounds carry the newest first state word in r14d */
+ vmovdqa %xmm5,16(%rsp)
+ movl %ebx,%esi /* esi = ebx ^ ecx (with the xorl below); this overwrites the offset held in rsi */
+ vmovdqa %xmm6,32(%rsp)
+ xorl %ecx,%esi
+ vmovdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d /* r13d = copy of r8d, input to the first rotate chain */
+ jmp .Lavx_00_47
+
+.align 16
+.Lavx_00_47: /* Inner loop: each pass runs 16 scalar rounds on the sums at 0..63(%rsp), derives the next 16 words in xmm0-xmm3 and respills them with table rows added, and encrypts one 16-byte block. */
+ subq $-32*4,%rbp /* rbp += 128: move to the next four table rows (rows are read at a 32-byte stride) */
+ vmovdqu (%r12),%xmm9 /* xmm9 = 16-byte input block at (r12) */
+ movq %r12,64+0(%rsp) /* slot 64+0: save the input pointer, r12d is reused as a round temporary */
+ vpalignr $4,%xmm0,%xmm1,%xmm4 /* xmm4 = (xmm1:xmm0) shifted right by 4 bytes, i.e. words 1..4 */
+ shrdl $14,%r13d,%r13d /* shrd with the same register as both operands is a rotate right; r13d = ror(e,14) with e = r8d */
+ movl %r14d,%eax /* eax = first state word accumulated in r14d by the previous round */
+ movl %r9d,%r12d /* r12d = f (r9d) */
+ vpalignr $4,%xmm2,%xmm3,%xmm7 /* xmm7 = (xmm3:xmm2) shifted right by 4 bytes, i.e. words 9..12 */
+ xorl %r8d,%r13d
+ shrdl $9,%r14d,%r14d /* r14d = ror(a,9) with a = eax */
+ xorl %r10d,%r12d /* r12d = f ^ g */
+ vpsrld $7,%xmm4,%xmm6 /* start of the shift/xor network on words 1..4: right shifts 7, 3, 18 plus left shifts 14, 25 */
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d /* r12d = (f ^ g) & e */
+ vpaddd %xmm7,%xmm0,%xmm0 /* xmm0 += words 9..12 */
+ vpxor %xmm10,%xmm9,%xmm9 /* AES: XOR the input block with the round key held in xmm10 (offset 0) */
+ vmovdqu 16-128(%rdi),%xmm10 /* preload the round key at offset 16 */
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d /* pass round 0: h (r11d) += sum spilled at 0(%rsp) */
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%r12d /* r12d = ((f ^ g) & e) ^ g */
+ xorl %ebx,%r15d /* r15d = a ^ b */
+ vpslld $14,%xmm4,%xmm5
+ shrdl $6,%r13d,%r13d /* r13d = ror(e,6) ^ ror(e,11) ^ ror(e,25) */
+ addl %r12d,%r11d
+ andl %r15d,%esi /* esi = (a ^ b) & (b ^ c); b ^ c was left in esi by the previous round or the loop head */
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%esi /* esi = ((a ^ b) & (b ^ c)) ^ b */
+ vpshufd $250,%xmm3,%xmm7 /* xmm7 = dwords 2,2,3,3 of xmm3 (words 14 and 15, each duplicated) */
+ addl %r11d,%edx /* d (edx) += h */
+ shrdl $2,%r14d,%r14d /* r14d = ror(a,2) ^ ror(a,13) ^ ror(a,22) */
+ addl %esi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d /* r13d = new e for the next round */
+ addl %r11d,%r14d /* r14d = new first state word; moved into its register at the start of the next round */
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ vpslld $11,%xmm5,%xmm5
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vpxor %xmm8,%xmm9,%xmm9 /* chaining: XOR in the block held in xmm8 (initial chaining block or previous output) */
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 4(%rsp),%r10d /* pass round 1: add sum spilled at 4(%rsp) */
+ movl %r11d,%esi
+ shrdl $11,%r14d,%r14d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %r9d,%r12d
+ xorl %eax,%esi
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ addl %r12d,%r10d
+ andl %esi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm0,%xmm0
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ shrdl $9,%r14d,%r14d
+ vpshufd $132,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpsrldq $8,%xmm6,%xmm6
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9 /* AES round with the key at offset 16; later vaesenc lines use the key loaded just before them */
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d /* pass round 2: add sum spilled at 8(%rsp) */
+ vpaddd %xmm6,%xmm0,%xmm0
+ movl %r10d,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%r12d
+ vpshufd $80,%xmm0,%xmm7
+ xorl %r11d,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r9d
+ vpsrld $10,%xmm7,%xmm6
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpsrlq $17,%xmm7,%xmm7
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpsrlq $2,%xmm7,%xmm7
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %ebx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r12d
+ vpshufd $232,%xmm6,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpslldq $8,%xmm6,%xmm6
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d /* pass round 3: add sum spilled at 12(%rsp) */
+ movl %r9d,%esi
+ vpaddd %xmm6,%xmm0,%xmm0
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ vpaddd 0(%rbp),%xmm0,%xmm6 /* xmm6 = four new words in xmm0 + table row at 0(%rbp) */
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,0(%rsp) /* respill to 0..15(%rsp); the old sums there were consumed by rounds 0..3 above */
+ vpalignr $4,%xmm1,%xmm2,%xmm4 /* second quarter: same schedule network applied to xmm1 */
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ xorl %eax,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm1,%xmm1
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx /* pass round 4: add sum spilled at 16(%rsp) */
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%r12d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%esi
+ vpshufd $250,%xmm0,%xmm7
+ addl %edx,%r11d
+ shrdl $2,%r14d,%r14d
+ addl %esi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ vpslld $11,%xmm5,%xmm5
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 20(%rsp),%ecx /* pass round 5: add sum spilled at 20(%rsp) */
+ movl %edx,%esi
+ shrdl $11,%r14d,%r14d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ebx,%r12d
+ xorl %r8d,%esi
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ addl %r12d,%ecx
+ andl %esi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm1,%xmm1
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ shrdl $9,%r14d,%r14d
+ vpshufd $132,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpsrldq $8,%xmm6,%xmm6
+ andl %r10d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx /* pass round 6: add sum spilled at 24(%rsp) */
+ vpaddd %xmm6,%xmm1,%xmm1
+ movl %ecx,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%r12d
+ vpshufd $80,%xmm1,%xmm7
+ xorl %edx,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%ebx
+ vpsrld $10,%xmm7,%xmm6
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpsrlq $17,%xmm7,%xmm7
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpsrlq $2,%xmm7,%xmm7
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r9d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r12d
+ vpshufd $232,%xmm6,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpslldq $8,%xmm6,%xmm6
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax /* pass round 7: add sum spilled at 28(%rsp) */
+ movl %ebx,%esi
+ vpaddd %xmm6,%xmm1,%xmm1
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ vpaddd 32(%rbp),%xmm1,%xmm6 /* xmm6 = four new words in xmm1 + table row at 32(%rbp) */
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,16(%rsp) /* respill to 16..31(%rsp) */
+ vpalignr $4,%xmm2,%xmm3,%xmm4 /* third quarter: same schedule network applied to xmm2 */
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ xorl %r8d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm2,%xmm2
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d /* pass round 8: add sum spilled at 32(%rsp) */
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%r12d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%esi
+ vpshufd $250,%xmm1,%xmm7
+ addl %r11d,%edx
+ shrdl $2,%r14d,%r14d
+ addl %esi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ vpslld $11,%xmm5,%xmm5
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 36(%rsp),%r10d /* pass round 9: add sum spilled at 36(%rsp) */
+ movl %r11d,%esi
+ shrdl $11,%r14d,%r14d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %r9d,%r12d
+ xorl %eax,%esi
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ addl %r12d,%r10d
+ andl %esi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm2,%xmm2
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ shrdl $9,%r14d,%r14d
+ vpshufd $132,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpsrldq $8,%xmm6,%xmm6
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d /* pass round 10: add sum spilled at 40(%rsp) */
+ vpaddd %xmm6,%xmm2,%xmm2
+ movl %r10d,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%r12d
+ vpshufd $80,%xmm2,%xmm7
+ xorl %r11d,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r9d
+ vpsrld $10,%xmm7,%xmm6
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpsrlq $17,%xmm7,%xmm7
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpsrlq $2,%xmm7,%xmm7
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %ebx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r12d
+ vpshufd $232,%xmm6,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpslldq $8,%xmm6,%xmm6
+ vaesenclast %xmm10,%xmm9,%xmm11 /* candidate 1: xmm11 = final round with the key at offset 160; xmm9 itself is not modified */
+ vaesenc %xmm10,%xmm9,%xmm9 /* and continue the chain with the same key as an ordinary round */
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d /* pass round 11: add sum spilled at 44(%rsp) */
+ movl %r9d,%esi
+ vpaddd %xmm6,%xmm2,%xmm2
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ vpaddd 64(%rbp),%xmm2,%xmm6 /* xmm6 = four new words in xmm2 + table row at 64(%rbp) */
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,32(%rsp) /* respill to 32..47(%rsp) */
+ vpalignr $4,%xmm3,%xmm0,%xmm4 /* fourth quarter: same schedule network applied to xmm3 */
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ xorl %eax,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm3,%xmm3
+ vpand %xmm12,%xmm11,%xmm8 /* xmm8 = candidate 1 & mask xmm12 */
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx /* pass round 12: add sum spilled at 48(%rsp) */
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%r12d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%esi
+ vpshufd $250,%xmm2,%xmm7
+ addl %edx,%r11d
+ shrdl $2,%r14d,%r14d
+ addl %esi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ vpslld $11,%xmm5,%xmm5
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11 /* candidate 2: final round with the key at offset 192 */
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 52(%rsp),%ecx /* pass round 13: add sum spilled at 52(%rsp) */
+ movl %edx,%esi
+ shrdl $11,%r14d,%r14d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ebx,%r12d
+ xorl %r8d,%esi
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ addl %r12d,%ecx
+ andl %esi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm3,%xmm3
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ shrdl $9,%r14d,%r14d
+ vpshufd $132,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpsrldq $8,%xmm6,%xmm6
+ andl %r10d,%r12d
+ vpand %xmm13,%xmm11,%xmm11 /* xmm11 = candidate 2 & mask xmm13 */
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx /* pass round 14: add sum spilled at 56(%rsp) */
+ vpaddd %xmm6,%xmm3,%xmm3
+ movl %ecx,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%r12d
+ vpshufd $80,%xmm3,%xmm7
+ xorl %edx,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%ebx
+ vpsrld $10,%xmm7,%xmm6
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpsrlq $17,%xmm7,%xmm7
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpsrlq $2,%xmm7,%xmm7
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r9d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r12d
+ vpshufd $232,%xmm6,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpslldq $8,%xmm6,%xmm6
+ vpor %xmm11,%xmm8,%xmm8 /* xmm8 |= masked candidate 2 */
+ vaesenclast %xmm10,%xmm9,%xmm11 /* candidate 3: final round with the key at offset 224 */
+ vmovdqu 0-128(%rdi),%xmm10 /* reload the key at offset 0 for the next input block */
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax /* pass round 15: add sum spilled at 60(%rsp) */
+ movl %ebx,%esi
+ vpaddd %xmm6,%xmm3,%xmm3
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ vpaddd 96(%rbp),%xmm3,%xmm6 /* xmm6 = four new words in xmm3 + table row at 96(%rbp) */
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,48(%rsp) /* respill to 48..63(%rsp) */
+ movq 64+0(%rsp),%r12 /* r12 = input pointer saved at the top of this pass */
+ vpand %xmm14,%xmm11,%xmm11 /* xmm11 = candidate 3 & mask xmm14 */
+ movq 64+8(%rsp),%r15 /* r15 = output-minus-input offset from slot 64+8 */
+ vpor %xmm11,%xmm8,%xmm8 /* xmm8 = OR of the three masked candidates; presumably the masks keep exactly one, chosen by key length - confirm */
+ vmovdqu %xmm8,(%r15,%r12,1) /* store the 16-byte result at r15 + r12 */
+ leaq 16(%r12),%r12 /* advance the input pointer by 16 bytes */
+ cmpb $0,131(%rbp) /* test the byte at 131(%rbp); the terminating value lives in the table data, not visible here */
+ jne .Lavx_00_47 /* repeat while that byte is non-zero */
+ vmovdqu (%r12),%xmm9 /* Trailing 16 scalar rounds of the outer iteration: same round pattern as the inner loop but with no vector schedule work; one more 16-byte block at (r12) is encrypted alongside. */
+ movq %r12,64+0(%rsp) /* slot 64+0: save the input pointer, r12d is reused as a round temporary */
+ shrdl $14,%r13d,%r13d /* shrd with the same register as both operands is a rotate right */
+ movl %r14d,%eax /* eax = first state word accumulated in r14d by the previous round */
+ movl %r9d,%r12d
+ xorl %r8d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpxor %xmm10,%xmm9,%xmm9 /* AES: XOR the input block with the round key held in xmm10 (offset 0) */
+ vmovdqu 16-128(%rdi),%xmm10 /* preload the round key at offset 16 */
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d /* round 0 of this group: add sum spilled at 0(%rsp) */
+ movl %eax,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%r12d /* r12d = ((r9d ^ r10d) & r8d) ^ r10d */
+ xorl %ebx,%r15d
+ shrdl $6,%r13d,%r13d /* r13d = ror(r8d,6) ^ ror(r8d,11) ^ ror(r8d,25) */
+ addl %r12d,%r11d
+ andl %r15d,%esi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%esi
+ addl %r11d,%edx
+ shrdl $2,%r14d,%r14d /* r14d = ror(eax,2) ^ ror(eax,13) ^ ror(eax,22) */
+ addl %esi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d /* r14d = new first state word; moved into its register at the start of the next round */
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vpxor %xmm8,%xmm9,%xmm9 /* chaining: XOR in the previous output block held in xmm8 */
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d /* round 1 of this group: add sum spilled at 4(%rsp) */
+ movl %r11d,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r12d
+ xorl %eax,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r10d
+ andl %esi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9 /* AES round with the key at offset 16; later vaesenc lines use the key loaded just before them */
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d /* round 2 of this group: add sum spilled at 8(%rsp) */
+ movl %r10d,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%r12d
+ xorl %r11d,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r9d
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ shrdl $2,%r14d,%r14d
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ xorl %ebx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d /* round 3 of this group: add sum spilled at 12(%rsp) */
+ movl %r9d,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ xorl %eax,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx /* round 4 of this group: add sum spilled at 16(%rsp) */
+ movl %r8d,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%r12d
+ xorl %r9d,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%esi
+ addl %edx,%r11d
+ shrdl $2,%r14d,%r14d
+ addl %esi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx /* round 5 of this group: add sum spilled at 20(%rsp) */
+ movl %edx,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r12d
+ xorl %r8d,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%ecx
+ andl %esi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx /* round 6 of this group: add sum spilled at 24(%rsp) */
+ movl %ecx,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%r12d
+ xorl %edx,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%ebx
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ shrdl $2,%r14d,%r14d
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ xorl %r9d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax /* round 7 of this group: add sum spilled at 28(%rsp) */
+ movl %ebx,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ xorl %r8d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d /* round 8 of this group: add sum spilled at 32(%rsp) */
+ movl %eax,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%r12d
+ xorl %ebx,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%esi
+ addl %r11d,%edx
+ shrdl $2,%r14d,%r14d
+ addl %esi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d /* round 9 of this group: add sum spilled at 36(%rsp) */
+ movl %r11d,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r12d
+ xorl %eax,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r10d
+ andl %esi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d /* round 10 of this group: add sum spilled at 40(%rsp) */
+ movl %r10d,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%r12d
+ xorl %r11d,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r9d
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ shrdl $2,%r14d,%r14d
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ xorl %ebx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11 /* candidate 1: xmm11 = final round with the key at offset 160; xmm9 itself is not modified */
+ vaesenc %xmm10,%xmm9,%xmm9 /* and continue the chain with the same key as an ordinary round */
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d /* round 11 of this group: add sum spilled at 44(%rsp) */
+ movl %r9d,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ xorl %eax,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpand %xmm12,%xmm11,%xmm8 /* xmm8 = candidate 1 & mask xmm12 */
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx /* round 12 of this group: add sum spilled at 48(%rsp) */
+ movl %r8d,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%r12d
+ xorl %r9d,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%esi
+ addl %edx,%r11d
+ shrdl $2,%r14d,%r14d
+ addl %esi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11 /* candidate 2: final round with the key at offset 192 */
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx /* round 13 of this group: add sum spilled at 52(%rsp) */
+ movl %edx,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r12d
+ xorl %r8d,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%ecx
+ andl %esi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ vpand %xmm13,%xmm11,%xmm11 /* xmm11 = candidate 2 & mask xmm13 */
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx /* round 14 of this group: add sum spilled at 56(%rsp) */
+ movl %ecx,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%r12d
+ xorl %edx,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%ebx
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ shrdl $2,%r14d,%r14d
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ xorl %r9d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpor %xmm11,%xmm8,%xmm8 /* xmm8 |= masked candidate 2 */
+ vaesenclast %xmm10,%xmm9,%xmm11 /* candidate 3: final round with the key at offset 224 */
+ vmovdqu 0-128(%rdi),%xmm10 /* reload the key at offset 0 for the next input block */
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax /* round 15 of this group: add sum spilled at 60(%rsp) */
+ movl %ebx,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%r12 /* r12 = input pointer saved at the top of this group */
+ movq 64+8(%rsp),%r13 /* r13 = output-minus-input offset from slot 64+8 */
+ movq 64+40(%rsp),%r15 /* r15 = state-word pointer from slot 64+40 */
+ movq 64+48(%rsp),%rsi /* rsi = offset from slot 64+48, restored because esi was used as a round temporary */
+
+ vpand %xmm14,%xmm11,%xmm11 /* xmm11 = candidate 3 & mask xmm14 */
+ movl %r14d,%eax /* eax = first state word accumulated in r14d by the last round */
+ vpor %xmm11,%xmm8,%xmm8 /* xmm8 = OR of the three masked candidates; presumably the masks keep exactly one, chosen by key length - confirm */
+ vmovdqu %xmm8,(%r12,%r13,1) /* store the 16-byte result at r12 + r13 */
+ leaq 16(%r12),%r12 /* advance the input pointer by 16 bytes */
+
+ addl 0(%r15),%eax /* add the eight stored state words to the eight working registers */
+ addl 4(%r15),%ebx
+ addl 8(%r15),%ecx
+ addl 12(%r15),%edx
+ addl 16(%r15),%r8d
+ addl 20(%r15),%r9d
+ addl 24(%r15),%r10d
+ addl 28(%r15),%r11d
+
+ cmpq 64+16(%rsp),%r12 /* compare the input pointer with the end address in slot 64+16; the movl stores below leave the flags untouched */
+
+ movl %eax,0(%r15) /* write the updated state words back through r15 */
+ movl %ebx,4(%r15)
+ movl %ecx,8(%r15)
+ movl %edx,12(%r15)
+ movl %r8d,16(%r15)
+ movl %r9d,20(%r15)
+ movl %r10d,24(%r15)
+ movl %r11d,28(%r15)
+ jb .Lloop_avx /* unsigned below: more input remains, run another outer iteration */
+
+ movq 64+32(%rsp),%r8 /* Epilogue: r8 = chaining-block pointer from frame slot 64+32 */
+ movq 120(%rsp),%rsi /* rsi = value from frame slot 120; the CFI directive below treats it as CFA - 8 */
+.cfi_def_cfa %rsi,8
+ vmovdqu %xmm8,(%r8) /* write xmm8 (last encrypted block) back to the 16-byte chaining block at (r8) */
+ vzeroall /* clear all vector registers before returning */
+ movq -48(%rsi),%r15 /* reload rbx, rbp, r12-r15 from the 48 bytes below rsi */
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp /* rsp = rsi, discarding the aligned scratch frame */
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3 /* hand-encoded return: 0xf3 prefix followed by 0xc3 (rep ret) */
+.cfi_endproc
+.size aesni_cbc_sha256_enc_avx,.-aesni_cbc_sha256_enc_avx
+.type aesni_cbc_sha256_enc_avx2,@function
+.align 64
+aesni_cbc_sha256_enc_avx2:
+.cfi_startproc
+.Lavx2_shortcut:
+ movq 8(%rsp),%r10
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ subq $576,%rsp
+ andq $-1024,%rsp
+ addq $448,%rsp
+
+ shlq $6,%rdx
+ subq %rdi,%rsi
+ subq %rdi,%r10
+ addq %rdi,%rdx
+
+
+
+ movq %rdx,64+16(%rsp)
+
+ movq %r8,64+32(%rsp)
+ movq %r9,64+40(%rsp)
+ movq %r10,64+48(%rsp)
+ movq %rax,120(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
+.Lprologue_avx2:
+ vzeroall
+
+ movq %rdi,%r13
+ vpinsrq $1,%rsi,%xmm15,%xmm15
+ leaq 128(%rcx),%rdi
+ leaq K256+544(%rip),%r12
+ movl 240-128(%rdi),%r14d
+ movq %r9,%r15
+ movq %r10,%rsi
+ vmovdqu (%r8),%xmm8
+ leaq -9(%r14),%r14
+
+ vmovdqa 0(%r12,%r14,8),%xmm14
+ vmovdqa 16(%r12,%r14,8),%xmm13
+ vmovdqa 32(%r12,%r14,8),%xmm12
+
+ subq $-64,%r13
+ movl 0(%r15),%eax
+ leaq (%rsi,%r13,1),%r12
+ movl 4(%r15),%ebx
+ cmpq %rdx,%r13
+ movl 8(%r15),%ecx
+ cmoveq %rsp,%r12
+ movl 12(%r15),%edx
+ movl 16(%r15),%r8d
+ movl 20(%r15),%r9d
+ movl 24(%r15),%r10d
+ movl 28(%r15),%r11d
+ vmovdqu 0-128(%rdi),%xmm10
+ jmp .Loop_avx2
+.align 16
+.Loop_avx2:
+ vmovdqa K256+512(%rip),%ymm7
+ vmovdqu -64+0(%rsi,%r13,1),%xmm0
+ vmovdqu -64+16(%rsi,%r13,1),%xmm1
+ vmovdqu -64+32(%rsi,%r13,1),%xmm2
+ vmovdqu -64+48(%rsi,%r13,1),%xmm3
+
+ vinserti128 $1,(%r12),%ymm0,%ymm0
+ vinserti128 $1,16(%r12),%ymm1,%ymm1
+ vpshufb %ymm7,%ymm0,%ymm0
+ vinserti128 $1,32(%r12),%ymm2,%ymm2
+ vpshufb %ymm7,%ymm1,%ymm1
+ vinserti128 $1,48(%r12),%ymm3,%ymm3
+
+ leaq K256(%rip),%rbp
+ vpshufb %ymm7,%ymm2,%ymm2
+ leaq -64(%r13),%r13
+ vpaddd 0(%rbp),%ymm0,%ymm4
+ vpshufb %ymm7,%ymm3,%ymm3
+ vpaddd 32(%rbp),%ymm1,%ymm5
+ vpaddd 64(%rbp),%ymm2,%ymm6
+ vpaddd 96(%rbp),%ymm3,%ymm7
+ vmovdqa %ymm4,0(%rsp)
+ xorl %r14d,%r14d
+ vmovdqa %ymm5,32(%rsp)
+
+ movq 120(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ leaq -64(%rsp),%rsp
+
+
+
+ movq %rsi,-8(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ movl %ebx,%esi
+ vmovdqa %ymm6,0(%rsp)
+ xorl %ecx,%esi
+ vmovdqa %ymm7,32(%rsp)
+ movl %r9d,%r12d
+ subq $-32*4,%rbp
+ jmp .Lavx2_00_47
+
+.align 16
+.Lavx2_00_47:
+ vmovdqu (%r13),%xmm9
+ vpinsrq $0,%r13,%xmm15,%xmm15
+ leaq -64(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08
+
+ pushq 64-8(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08
+ leaq 8(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ vpalignr $4,%ymm0,%ymm1,%ymm4
+ addl 0+128(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ vpalignr $4,%ymm2,%ymm3,%ymm7
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ vpsrld $7,%ymm4,%ymm6
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ vpaddd %ymm7,%ymm0,%ymm0
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%esi
+ vpxor %xmm10,%xmm9,%xmm9
+ vmovdqu 16-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ebx,%esi
+ vpshufd $250,%ymm3,%ymm7
+ xorl %r13d,%r14d
+ leal (%r11,%rsi,1),%r11d
+ movl %r8d,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 4+128(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%edx,%esi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ vpslld $11,%ymm5,%ymm5
+ andnl %r9d,%edx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%edx,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%esi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%esi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ vpsrlq $17,%ymm7,%ymm7
+ andl %esi,%r15d
+ vpxor %xmm8,%xmm9,%xmm9
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ vpaddd %ymm4,%ymm0,%ymm0
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 8+128(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ vpshufd $132,%ymm6,%ymm6
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ vpsrldq $8,%ymm6,%ymm6
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ vpaddd %ymm6,%ymm0,%ymm0
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ vpshufd $80,%ymm0,%ymm7
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r11d,%esi
+ vpsrld $10,%ymm7,%ymm6
+ xorl %r13d,%r14d
+ leal (%r9,%rsi,1),%r9d
+ movl %ecx,%r12d
+ vpsrlq $17,%ymm7,%ymm7
+ addl 12+128(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ vpxor %ymm7,%ymm6,%ymm6
+ rorxl $11,%ebx,%esi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ vpsrlq $2,%ymm7,%ymm7
+ andnl %edx,%ebx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%ebx,%r14d
+ vpxor %ymm7,%ymm6,%ymm6
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%esi
+ vpshufd $232,%ymm6,%ymm6
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%esi
+ vpslldq $8,%ymm6,%ymm6
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ vpaddd %ymm6,%ymm0,%ymm0
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ vpaddd 0(%rbp),%ymm0,%ymm6
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ vmovdqa %ymm6,0(%rsp)
+ vpalignr $4,%ymm1,%ymm2,%ymm4
+ addl 32+128(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ vpalignr $4,%ymm3,%ymm0,%ymm7
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ vpsrld $7,%ymm4,%ymm6
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ vpaddd %ymm7,%ymm1,%ymm1
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r9d,%esi
+ vpshufd $250,%ymm0,%ymm7
+ xorl %r13d,%r14d
+ leal (%rdx,%rsi,1),%edx
+ movl %eax,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 36+128(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%r11d,%esi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ vpslld $11,%ymm5,%ymm5
+ andnl %ebx,%r11d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r11d,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%esi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%esi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ vpsrlq $17,%ymm7,%ymm7
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ vpaddd %ymm4,%ymm1,%ymm1
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 40+128(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ vpshufd $132,%ymm6,%ymm6
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ vpsrldq $8,%ymm6,%ymm6
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ vpaddd %ymm6,%ymm1,%ymm1
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ vpshufd $80,%ymm1,%ymm7
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %edx,%esi
+ vpsrld $10,%ymm7,%ymm6
+ xorl %r13d,%r14d
+ leal (%rbx,%rsi,1),%ebx
+ movl %r10d,%r12d
+ vpsrlq $17,%ymm7,%ymm7
+ addl 44+128(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ vpxor %ymm7,%ymm6,%ymm6
+ rorxl $11,%r9d,%esi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ vpsrlq $2,%ymm7,%ymm7
+ andnl %r11d,%r9d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r9d,%r14d
+ vpxor %ymm7,%ymm6,%ymm6
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%esi
+ vpshufd $232,%ymm6,%ymm6
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%esi
+ vpslldq $8,%ymm6,%ymm6
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ vpaddd %ymm6,%ymm1,%ymm1
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ vpaddd 32(%rbp),%ymm1,%ymm6
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ vmovdqa %ymm6,32(%rsp)
+ leaq -64(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08
+
+ pushq 64-8(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08
+ leaq 8(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ vpalignr $4,%ymm2,%ymm3,%ymm4
+ addl 0+128(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ vpalignr $4,%ymm0,%ymm1,%ymm7
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ vpsrld $7,%ymm4,%ymm6
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ vpaddd %ymm7,%ymm2,%ymm2
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ebx,%esi
+ vpshufd $250,%ymm1,%ymm7
+ xorl %r13d,%r14d
+ leal (%r11,%rsi,1),%r11d
+ movl %r8d,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 4+128(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%edx,%esi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ vpslld $11,%ymm5,%ymm5
+ andnl %r9d,%edx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%edx,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%esi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%esi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ vpsrlq $17,%ymm7,%ymm7
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ vpaddd %ymm4,%ymm2,%ymm2
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 8+128(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ vpshufd $132,%ymm6,%ymm6
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ vpsrldq $8,%ymm6,%ymm6
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ vpaddd %ymm6,%ymm2,%ymm2
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ vpshufd $80,%ymm2,%ymm7
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r11d,%esi
+ vpsrld $10,%ymm7,%ymm6
+ xorl %r13d,%r14d
+ leal (%r9,%rsi,1),%r9d
+ movl %ecx,%r12d
+ vpsrlq $17,%ymm7,%ymm7
+ addl 12+128(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ vpxor %ymm7,%ymm6,%ymm6
+ rorxl $11,%ebx,%esi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ vpsrlq $2,%ymm7,%ymm7
+ andnl %edx,%ebx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%ebx,%r14d
+ vpxor %ymm7,%ymm6,%ymm6
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%esi
+ vpshufd $232,%ymm6,%ymm6
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%esi
+ vpslldq $8,%ymm6,%ymm6
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ vpaddd %ymm6,%ymm2,%ymm2
+ andl %esi,%r15d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ vpaddd 64(%rbp),%ymm2,%ymm6
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ vmovdqa %ymm6,0(%rsp)
+ vpalignr $4,%ymm3,%ymm0,%ymm4
+ addl 32+128(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ vpalignr $4,%ymm1,%ymm2,%ymm7
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ vpsrld $7,%ymm4,%ymm6
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ vpaddd %ymm7,%ymm3,%ymm3
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%esi
+ vpand %xmm12,%xmm11,%xmm8
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r9d,%esi
+ vpshufd $250,%ymm2,%ymm7
+ xorl %r13d,%r14d
+ leal (%rdx,%rsi,1),%edx
+ movl %eax,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 36+128(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%r11d,%esi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ vpslld $11,%ymm5,%ymm5
+ andnl %ebx,%r11d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r11d,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%esi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%esi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ vpsrlq $17,%ymm7,%ymm7
+ andl %esi,%r15d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ vpaddd %ymm4,%ymm3,%ymm3
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 40+128(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ vpshufd $132,%ymm6,%ymm6
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ vpsrldq $8,%ymm6,%ymm6
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ vpaddd %ymm6,%ymm3,%ymm3
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ vpshufd $80,%ymm3,%ymm7
+ andl %r15d,%esi
+ vpand %xmm13,%xmm11,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %edx,%esi
+ vpsrld $10,%ymm7,%ymm6
+ xorl %r13d,%r14d
+ leal (%rbx,%rsi,1),%ebx
+ movl %r10d,%r12d
+ vpsrlq $17,%ymm7,%ymm7
+ addl 44+128(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ vpxor %ymm7,%ymm6,%ymm6
+ rorxl $11,%r9d,%esi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ vpsrlq $2,%ymm7,%ymm7
+ andnl %r11d,%r9d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r9d,%r14d
+ vpxor %ymm7,%ymm6,%ymm6
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%esi
+ vpshufd $232,%ymm6,%ymm6
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%esi
+ vpslldq $8,%ymm6,%ymm6
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ vpaddd %ymm6,%ymm3,%ymm3
+ andl %esi,%r15d
+ vpor %xmm11,%xmm8,%xmm8
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vmovdqu 0-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ vpaddd 96(%rbp),%ymm3,%ymm6
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ vmovdqa %ymm6,32(%rsp)
+ vmovq %xmm15,%r13
+ vpextrq $1,%xmm15,%r15
+ vpand %xmm14,%xmm11,%xmm11
+ vpor %xmm11,%xmm8,%xmm8
+ vmovdqu %xmm8,(%r15,%r13,1)
+ leaq 16(%r13),%r13
+ leaq 128(%rbp),%rbp
+ cmpb $0,3(%rbp)
+ jne .Lavx2_00_47
+ vmovdqu (%r13),%xmm9
+ vpinsrq $0,%r13,%xmm15,%xmm15
+ addl 0+64(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%esi
+ vpxor %xmm10,%xmm9,%xmm9
+ vmovdqu 16-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ebx,%esi
+ xorl %r13d,%r14d
+ leal (%r11,%rsi,1),%r11d
+ movl %r8d,%r12d
+ addl 4+64(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%esi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%esi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%esi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %esi,%r15d
+ vpxor %xmm8,%xmm9,%xmm9
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8+64(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r11d,%esi
+ xorl %r13d,%r14d
+ leal (%r9,%rsi,1),%r9d
+ movl %ecx,%r12d
+ addl 12+64(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%esi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%esi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%esi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32+64(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r9d,%esi
+ xorl %r13d,%r14d
+ leal (%rdx,%rsi,1),%edx
+ movl %eax,%r12d
+ addl 36+64(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%esi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%esi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%esi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40+64(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %edx,%esi
+ xorl %r13d,%r14d
+ leal (%rbx,%rsi,1),%ebx
+ movl %r10d,%r12d
+ addl 44+64(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%esi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%esi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%esi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ addl 0(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ebx,%esi
+ xorl %r13d,%r14d
+ leal (%r11,%rsi,1),%r11d
+ movl %r8d,%r12d
+ addl 4(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%esi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%esi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%esi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r11d,%esi
+ xorl %r13d,%r14d
+ leal (%r9,%rsi,1),%r9d
+ movl %ecx,%r12d
+ addl 12(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%esi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%esi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%esi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %esi,%r15d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%esi
+ vpand %xmm12,%xmm11,%xmm8
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r9d,%esi
+ xorl %r13d,%r14d
+ leal (%rdx,%rsi,1),%edx
+ movl %eax,%r12d
+ addl 36(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%esi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%esi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%esi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %esi,%r15d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%esi
+ vpand %xmm13,%xmm11,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %edx,%esi
+ xorl %r13d,%r14d
+ leal (%rbx,%rsi,1),%ebx
+ movl %r10d,%r12d
+ addl 44(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%esi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%esi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%esi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %esi,%r15d
+ vpor %xmm11,%xmm8,%xmm8
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vmovdqu 0-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ vpextrq $1,%xmm15,%r12 /* r12 = high qword of xmm15; combined with r13 below to form the store address */
+ vmovq %xmm15,%r13 /* r13 = low qword of xmm15, where vpinsrq parked it while r13d served as round scratch */
+ movq 552(%rsp),%r15 /* r15 = pointer from frame slot rsp+552; NOTE(review): presumably the SHA-256 state, confirm in the prologue */
+ addl %r14d,%eax /* fold the still-pending r14d term into eax (the rounds normally do this one step late via leal) */
+ leaq 448(%rsp),%rbp /* rbp = rsp+448: base for the limit check below and for the loads at .Lower_avx2 */
+
+ vpand %xmm14,%xmm11,%xmm11 /* mask the latest vaesenclast result with xmm14 ... */
+ vpor %xmm11,%xmm8,%xmm8 /* ... and OR it into xmm8, the block written out next */
+ vmovdqu %xmm8,(%r12,%r13,1) /* store the 16 bytes of xmm8 at r12+r13 */
+ leaq 16(%r13),%r13 /* advance r13 by 16 bytes */
+
+ addl 0(%r15),%eax /* add the eight dwords at (%r15) into the working registers eax..r11d ... */
+ addl 4(%r15),%ebx
+ addl 8(%r15),%ecx
+ addl 12(%r15),%edx
+ addl 16(%r15),%r8d
+ addl 20(%r15),%r9d
+ addl 24(%r15),%r10d
+ addl 28(%r15),%r11d
+
+ movl %eax,0(%r15) /* ... and write the sums back to the same eight dwords */
+ movl %ebx,4(%r15)
+ movl %ecx,8(%r15)
+ movl %edx,12(%r15)
+ movl %r8d,16(%r15)
+ movl %r9d,20(%r15)
+ movl %r10d,24(%r15)
+ movl %r11d,28(%r15)
+
+ cmpq 80(%rbp),%r13 /* compare r13 with the qword at rbp+80; NOTE(review): presumably the end-of-input limit, confirm in the prologue */
+ je .Ldone_avx2 /* equal: branch to the exit path at .Ldone_avx2 */
+
+ xorl %r14d,%r14d /* clear the deferred term that the next round adds into eax */
+ movl %ebx,%esi /* esi = ebx ... */
+ movl %r9d,%r12d /* r12d = r9d, consumed by the first andl at .Lower_avx2 */
+ xorl %ecx,%esi /* ... esi = ebx ^ ecx, consumed by the first round at .Lower_avx2 */
+ jmp .Lower_avx2
+.align 16
+.Lower_avx2:
+ vmovdqu (%r13),%xmm9
+ vpinsrq $0,%r13,%xmm15,%xmm15
+ addl 0+16(%rbp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%esi
+ vpxor %xmm10,%xmm9,%xmm9
+ vmovdqu 16-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ebx,%esi
+ xorl %r13d,%r14d
+ leal (%r11,%rsi,1),%r11d
+ movl %r8d,%r12d
+ addl 4+16(%rbp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%esi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%esi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%esi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %esi,%r15d
+ vpxor %xmm8,%xmm9,%xmm9
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8+16(%rbp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r11d,%esi
+ xorl %r13d,%r14d
+ leal (%r9,%rsi,1),%r9d
+ movl %ecx,%r12d
+ addl 12+16(%rbp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%esi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%esi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%esi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32+16(%rbp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r9d,%esi
+ xorl %r13d,%r14d
+ leal (%rdx,%rsi,1),%edx
+ movl %eax,%r12d
+ addl 36+16(%rbp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%esi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%esi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%esi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40+16(%rbp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %edx,%esi
+ xorl %r13d,%r14d
+ leal (%rbx,%rsi,1),%ebx
+ movl %r10d,%r12d
+ addl 44+16(%rbp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%esi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%esi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%esi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ leaq -64(%rbp),%rbp
+ addl 0+16(%rbp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ebx,%esi
+ xorl %r13d,%r14d
+ leal (%r11,%rsi,1),%r11d
+ movl %r8d,%r12d
+ addl 4+16(%rbp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%esi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%esi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%esi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8+16(%rbp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r11d,%esi
+ xorl %r13d,%r14d
+ leal (%r9,%rsi,1),%r9d
+ movl %ecx,%r12d
+ addl 12+16(%rbp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%esi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%esi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%esi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %esi,%r15d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32+16(%rbp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%esi
+ vpand %xmm12,%xmm11,%xmm8
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r9d,%esi
+ xorl %r13d,%r14d
+ leal (%rdx,%rsi,1),%edx
+ movl %eax,%r12d
+ addl 36+16(%rbp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%esi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%esi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%esi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %esi,%r15d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40+16(%rbp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%esi
+ vpand %xmm13,%xmm11,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %edx,%esi
+ xorl %r13d,%r14d
+ leal (%rbx,%rsi,1),%ebx
+ movl %r10d,%r12d
+ addl 44+16(%rbp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%esi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%esi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%esi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %esi,%r15d
+ vpor %xmm11,%xmm8,%xmm8
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vmovdqu 0-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ vmovq %xmm15,%r13
+ vpextrq $1,%xmm15,%r15
+ vpand %xmm14,%xmm11,%xmm11
+ vpor %xmm11,%xmm8,%xmm8
+ leaq -64(%rbp),%rbp
+ vmovdqu %xmm8,(%r15,%r13,1)
+ leaq 16(%r13),%r13
+ cmpq %rsp,%rbp
+ jae .Lower_avx2
+
+ movq 552(%rsp),%r15 /* r15 = pointer from frame slot rsp+552; NOTE(review): presumably the SHA-256 state, confirm in the prologue */
+ leaq 64(%r13),%r13 /* advance r13 by 64 bytes */
+ movq 560(%rsp),%rsi /* rsi = value from frame slot rsp+560; used as the base for r12 below */
+ addl %r14d,%eax /* fold the still-pending r14d term into eax */
+ leaq 448(%rsp),%rsp /* rsp += 448; frame slots are addressed relative to the new rsp from here on */
+
+ addl 0(%r15),%eax /* add the eight dwords at (%r15) into the working registers eax..r11d */
+ addl 4(%r15),%ebx
+ addl 8(%r15),%ecx
+ addl 12(%r15),%edx
+ addl 16(%r15),%r8d
+ addl 20(%r15),%r9d
+ addl 24(%r15),%r10d
+ leaq (%rsi,%r13,1),%r12 /* r12 = rsi + r13 (lea leaves the flags untouched) */
+ addl 28(%r15),%r11d
+
+ cmpq 64+16(%rsp),%r13 /* compare r13 with the qword at rsp+80; flags are consumed by cmoveq and jbe below (movl does not alter them) */
+
+ movl %eax,0(%r15) /* write the sums back to the eight dwords at (%r15) */
+ cmoveq %rsp,%r12 /* if r13 equals the limit, r12 = rsp instead; NOTE(review): presumably keeps the next pass from reading past the input, confirm at .Loop_avx2 */
+ movl %ebx,4(%r15)
+ movl %ecx,8(%r15)
+ movl %edx,12(%r15)
+ movl %r8d,16(%r15)
+ movl %r9d,20(%r15)
+ movl %r10d,24(%r15)
+ movl %r11d,28(%r15)
+
+ jbe .Loop_avx2 /* unsigned r13 <= limit: go round the outer loop again */
+ leaq (%rsp),%rbp /* rbp = rsp, so .Ldone_avx2 can address the frame slots through rbp */
+
+
+.cfi_escape 0x0f,0x06,0x76,0xf8,0x00,0x06,0x23,0x08
+
+.Ldone_avx2: /* exit path: store xmm8, clear vector registers, restore saved registers, return */
+ movq 64+32(%rbp),%r8 /* r8 = pointer from frame slot rbp+96; NOTE(review): presumably the IV buffer, confirm in the prologue */
+ movq 64+56(%rbp),%rsi /* rsi = value from frame slot rbp+120; base for the register reloads below and the new rsp */
+.cfi_def_cfa %rsi,8
+ vmovdqu %xmm8,(%r8) /* store the 16 bytes of xmm8 (the last block the loops wrote out) at (%r8) */
+ vzeroall /* zero all YMM registers before returning */
+ movq -48(%rsi),%r15 /* reload r15, r14, r13, r12, rbp, rbx from the six qwords just below the address in rsi */
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp /* rsp = rsi */
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ .byte 0xf3,0xc3 /* encoding of "rep ret" */
+.cfi_endproc
+.size aesni_cbc_sha256_enc_avx2,.-aesni_cbc_sha256_enc_avx2
+.type aesni_cbc_sha256_enc_shaext,@function
+.align 32
+aesni_cbc_sha256_enc_shaext:  /* AES-CBC encryption interleaved with SHA-256 rounds; SHA/SSSE3 opcodes are emitted as raw .byte sequences */
+.cfi_startproc
+ movq 8(%rsp),%r10  /* r10 = first stack-passed argument; 64 bytes are read from it per loop iteration as hash input */
+ leaq K256+128(%rip),%rax  /* rax = K256 table biased by +128; entries are addressed below as N-128(%rax) */
+ movdqu (%r9),%xmm1  /* r9 -> 32 bytes of hash state; loaded here and stored back at exit */
+ movdqu 16(%r9),%xmm2
+ movdqa 512-128(%rax),%xmm3  /* 16 bytes at K256+512, used as the pshufb mask applied to each loaded input block */
+/* rdi = input, rsi = output, rdx = count of 64-byte blocks, rcx = AES key schedule, r8 = 16-byte chaining value (IV) */
+ movl 240(%rcx),%r11d  /* r11d = 32-bit field at key+240; compared with 11 below to select how many extra AES rounds run */
+ subq %rdi,%rsi  /* rsi = out - in, so output is addressed as (%rsi,%rdi,1) while only rdi advances */
+ movups (%rcx),%xmm15  /* xmm15 = first round key */
+ movups (%r8),%xmm6  /* xmm6 = chaining value; the last ciphertext block is written back to (%r8) at exit */
+ movups 16(%rcx),%xmm4  /* xmm4 = second round key */
+ leaq 112(%rcx),%rcx  /* bias the key pointer by +112; round keys are addressed as -80(%rcx)..112(%rcx) */
+
+ pshufd $0x1b,%xmm1,%xmm0  /* reorder the state dwords into the two-register layout consumed by sha256rnds2 */
+ pshufd $0xb1,%xmm1,%xmm1
+ pshufd $0x1b,%xmm2,%xmm2
+ movdqa %xmm3,%xmm7  /* keep a copy of the mask; xmm3 is reused as scratch inside the loop and restored from xmm7 */
+.byte 102,15,58,15,202,8  /* palignr $8,%xmm2,%xmm1 */
+ punpcklqdq %xmm0,%xmm2
+
+ jmp .Loop_shaext
+
+.align 16
+.Loop_shaext:
+ movdqu (%r10),%xmm10  /* load the 64-byte hash input block into xmm10..xmm13 */
+ movdqu 16(%r10),%xmm11
+ movdqu 32(%r10),%xmm12
+.byte 102,68,15,56,0,211  /* pshufb %xmm3,%xmm10 */
+ movdqu 48(%r10),%xmm13
+
+ movdqa 0-128(%rax),%xmm0  /* xmm0 = four table constants + four message words (implicit operand of sha256rnds2) */
+ paddd %xmm10,%xmm0
+.byte 102,68,15,56,0,219  /* pshufb %xmm3,%xmm11 */
+ movdqa %xmm2,%xmm9  /* save the incoming hash state; added back after the last round of this block */
+ movdqa %xmm1,%xmm8
+ movups 0(%rdi),%xmm14  /* first 16-byte input block of this iteration */
+ xorps %xmm15,%xmm14  /* pre-xor with the first round key */
+ xorps %xmm14,%xmm6  /* CBC: xmm6 = previous ciphertext ^ input ^ first round key */
+ movups -80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,209  /* sha256rnds2 %xmm0,%xmm1,%xmm2 */
+ pshufd $0x0e,%xmm0,%xmm0  /* move the upper two dwords of xmm0 down for the next two rounds */
+ movups -64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,202  /* sha256rnds2 %xmm0,%xmm2,%xmm1 */
+
+ movdqa 32-128(%rax),%xmm0
+ paddd %xmm11,%xmm0
+.byte 102,68,15,56,0,227  /* pshufb %xmm3,%xmm12 */
+ leaq 64(%r10),%r10  /* advance the hash-input pointer to the next 64-byte block */
+ movups -48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movups -32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,202
+
+ movdqa 64-128(%rax),%xmm0
+ paddd %xmm12,%xmm0
+.byte 102,68,15,56,0,235  /* pshufb %xmm3,%xmm13 */
+.byte 69,15,56,204,211  /* sha256msg1 %xmm11,%xmm10 */
+ movups -16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm13,%xmm3  /* xmm3 is scratch from here on (mask copy lives in xmm7) */
+.byte 102,65,15,58,15,220,4  /* palignr $4,%xmm12,%xmm3 */
+ paddd %xmm3,%xmm10
+ movups 0(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,202
+
+ movdqa 96-128(%rax),%xmm0
+ paddd %xmm13,%xmm0
+.byte 69,15,56,205,213  /* sha256msg2 %xmm13,%xmm10 */
+.byte 69,15,56,204,220  /* sha256msg1 %xmm12,%xmm11 */
+ movups 16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movups 32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movdqa %xmm10,%xmm3
+.byte 102,65,15,58,15,221,4  /* palignr $4,%xmm13,%xmm3 */
+ paddd %xmm3,%xmm11
+.byte 15,56,203,202
+ movdqa 128-128(%rax),%xmm0
+ paddd %xmm10,%xmm0
+.byte 69,15,56,205,218  /* sha256msg2 %xmm10,%xmm11 */
+.byte 69,15,56,204,229  /* sha256msg1 %xmm13,%xmm12 */
+ movups 48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm11,%xmm3
+.byte 102,65,15,58,15,218,4  /* palignr $4,%xmm10,%xmm3 */
+ paddd %xmm3,%xmm12
+ cmpl $11,%r11d  /* r11d < 11: no extra rounds; == 11: two extra; > 11: four extra */
+ jb .Laesenclast1
+ movups 64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ je .Laesenclast1  /* flags are still those of the cmpl above; the SSE/AES instructions in between leave them untouched */
+ movups 96(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 112(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.Laesenclast1:
+ aesenclast %xmm5,%xmm6  /* xmm6 = ciphertext of input block 0 */
+ movups 16-112(%rcx),%xmm4  /* reload the second round key (original key pointer + 16) for the next block */
+ nop
+.byte 15,56,203,202
+ movups 16(%rdi),%xmm14  /* second 16-byte input block */
+ xorps %xmm15,%xmm14
+ movups %xmm6,0(%rsi,%rdi,1)  /* store ciphertext block 0 */
+ xorps %xmm14,%xmm6
+ movups -80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ movdqa 160-128(%rax),%xmm0
+ paddd %xmm11,%xmm0
+.byte 69,15,56,205,227
+.byte 69,15,56,204,234
+ movups -64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm12,%xmm3
+.byte 102,65,15,58,15,219,4
+ paddd %xmm3,%xmm13
+ movups -48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 192-128(%rax),%xmm0
+ paddd %xmm12,%xmm0
+.byte 69,15,56,205,236
+.byte 69,15,56,204,211
+ movups -32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm13,%xmm3
+.byte 102,65,15,58,15,220,4
+ paddd %xmm3,%xmm10
+ movups -16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 224-128(%rax),%xmm0
+ paddd %xmm13,%xmm0
+.byte 69,15,56,205,213
+.byte 69,15,56,204,220
+ movups 0(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,65,15,58,15,221,4
+ paddd %xmm3,%xmm11
+ movups 16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 256-128(%rax),%xmm0
+ paddd %xmm10,%xmm0
+.byte 69,15,56,205,218
+.byte 69,15,56,204,229
+ movups 32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm11,%xmm3
+.byte 102,65,15,58,15,218,4
+ paddd %xmm3,%xmm12
+ movups 48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ cmpl $11,%r11d  /* same round-count dispatch as before .Laesenclast1 */
+ jb .Laesenclast2
+ movups 64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ je .Laesenclast2
+ movups 96(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 112(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.Laesenclast2:
+ aesenclast %xmm5,%xmm6  /* xmm6 = ciphertext of input block 1 */
+ movups 16-112(%rcx),%xmm4
+ nop
+.byte 15,56,203,202
+ movups 32(%rdi),%xmm14  /* third 16-byte input block */
+ xorps %xmm15,%xmm14
+ movups %xmm6,16(%rsi,%rdi,1)  /* store ciphertext block 1 */
+ xorps %xmm14,%xmm6
+ movups -80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ movdqa 288-128(%rax),%xmm0
+ paddd %xmm11,%xmm0
+.byte 69,15,56,205,227
+.byte 69,15,56,204,234
+ movups -64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm12,%xmm3
+.byte 102,65,15,58,15,219,4
+ paddd %xmm3,%xmm13
+ movups -48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 320-128(%rax),%xmm0
+ paddd %xmm12,%xmm0
+.byte 69,15,56,205,236
+.byte 69,15,56,204,211
+ movups -32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm13,%xmm3
+.byte 102,65,15,58,15,220,4
+ paddd %xmm3,%xmm10
+ movups -16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 352-128(%rax),%xmm0
+ paddd %xmm13,%xmm0
+.byte 69,15,56,205,213
+.byte 69,15,56,204,220
+ movups 0(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,65,15,58,15,221,4
+ paddd %xmm3,%xmm11
+ movups 16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 384-128(%rax),%xmm0
+ paddd %xmm10,%xmm0
+.byte 69,15,56,205,218
+.byte 69,15,56,204,229
+ movups 32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm11,%xmm3
+.byte 102,65,15,58,15,218,4
+ paddd %xmm3,%xmm12
+ movups 48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 416-128(%rax),%xmm0
+ paddd %xmm11,%xmm0
+.byte 69,15,56,205,227
+.byte 69,15,56,204,234
+ cmpl $11,%r11d  /* same round-count dispatch as before .Laesenclast1 */
+ jb .Laesenclast3
+ movups 64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ je .Laesenclast3
+ movups 96(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 112(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.Laesenclast3:
+ aesenclast %xmm5,%xmm6  /* xmm6 = ciphertext of input block 2 */
+ movups 16-112(%rcx),%xmm4
+ nop
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm12,%xmm3
+.byte 102,65,15,58,15,219,4
+ paddd %xmm3,%xmm13
+ movups 48(%rdi),%xmm14  /* fourth 16-byte input block */
+ xorps %xmm15,%xmm14
+ movups %xmm6,32(%rsi,%rdi,1)  /* store ciphertext block 2 */
+ xorps %xmm14,%xmm6
+ movups -80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ movups -64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,202
+
+ movdqa 448-128(%rax),%xmm0
+ paddd %xmm12,%xmm0
+.byte 69,15,56,205,236
+ movdqa %xmm7,%xmm3  /* restore the pshufb mask into xmm3 for the next iteration */
+ movups -48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movups -32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,202
+
+ movdqa 480-128(%rax),%xmm0
+ paddd %xmm13,%xmm0
+ movups -16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ movups 0(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movups 16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+
+ movups 32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ cmpl $11,%r11d  /* same round-count dispatch as before .Laesenclast1 */
+ jb .Laesenclast4
+ movups 64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ je .Laesenclast4
+ movups 96(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 112(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.Laesenclast4:
+ aesenclast %xmm5,%xmm6  /* xmm6 = ciphertext of input block 3 */
+ movups 16-112(%rcx),%xmm4
+ nop
+
+ paddd %xmm9,%xmm2  /* add the state saved at the top of the iteration to the round output */
+ paddd %xmm8,%xmm1
+
+ decq %rdx  /* one 64-byte block done; ZF from this decq drives the jnz below (movups and leaq do not alter flags) */
+ movups %xmm6,48(%rsi,%rdi,1)  /* store ciphertext block 3 */
+ leaq 64(%rdi),%rdi
+ jnz .Loop_shaext
+
+ pshufd $0xb1,%xmm2,%xmm2  /* undo the entry-time dword reordering before storing the state */
+ pshufd $0x1b,%xmm1,%xmm3
+ pshufd $0xb1,%xmm1,%xmm1
+ punpckhqdq %xmm2,%xmm1
+.byte 102,15,58,15,211,8  /* palignr $8,%xmm3,%xmm2 */
+
+ movups %xmm6,(%r8)  /* write the last ciphertext block back as the new chaining value */
+ movdqu %xmm1,(%r9)  /* write the updated hash state back */
+ movdqu %xmm2,16(%r9)
+ .byte 0xf3,0xc3  /* rep ret */
+.cfi_endproc
+.size aesni_cbc_sha256_enc_shaext,.-aesni_cbc_sha256_enc_shaext
diff --git a/secure/lib/libcrypto/amd64/chacha-x86_64.S b/secure/lib/libcrypto/amd64/chacha-x86_64.S
index 0b3d5b8b6db4..b01c1b87d47b 100644
--- a/secure/lib/libcrypto/amd64/chacha-x86_64.S
+++ b/secure/lib/libcrypto/amd64/chacha-x86_64.S
@@ -331,6 +331,8 @@ ChaCha20_ssse3:
.LChaCha20_ssse3:
movq %rsp,%r9
.cfi_def_cfa_register %r9
+ testl $2048,%r10d
+ jnz .LChaCha20_4xop
cmpq $128,%rdx
je .LChaCha20_128
ja .LChaCha20_4x
@@ -626,6 +628,9 @@ ChaCha20_4x:
movq %rsp,%r9
.cfi_def_cfa_register %r9
movq %r10,%r11
+ shrq $32,%r10
+ testq $32,%r10
+ jnz .LChaCha20_8x
cmpq $192,%rdx
ja .Lproceed4x
@@ -1167,3 +1172,1024 @@ ChaCha20_4x:
.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_4x,.-ChaCha20_4x
+.type ChaCha20_4xop,@function
+.align 32
+ChaCha20_4xop:  /* ChaCha20 code path using XOP vprotd rotates; produces 256 bytes (four 64-byte blocks) of keystream per outer iteration */
+.cfi_startproc
+.LChaCha20_4xop:  /* also entered by a jump from ChaCha20_ssse3; rdi = out, rsi = in, rdx = byte length, rcx -> 32 bytes, r8 -> 16 bytes */
+ movq %rsp,%r9  /* r9 keeps the incoming stack pointer; restored at .Ldone4xop */
+.cfi_def_cfa_register %r9
+ subq $0x140+8,%rsp  /* 328-byte frame; assumes rsp = 8 mod 16 here so the vmovdqa stack accesses are 16-byte aligned - TODO confirm for the jump-in path */
+ vzeroupper
+/* NOTE(review): the byte tail loop is a do-while on rdx, so rdx is presumably non-zero on entry - verify against callers */
+ vmovdqa .Lsigma(%rip),%xmm11  /* state row 0: constants */
+ vmovdqu (%rcx),%xmm3  /* state rows 1 and 2: the 32 bytes at rcx (presumably the key) */
+ vmovdqu 16(%rcx),%xmm15
+ vmovdqu (%r8),%xmm7  /* state row 3: the 16 bytes at r8 (presumably counter and nonce) */
+ leaq 256(%rsp),%rcx  /* rcx now = rsp+256, a biased base for the saved state, addressed as N-256(%rcx) */
+
+ vpshufd $0x00,%xmm11,%xmm8  /* broadcast each state dword across a register: one register per state word, one lane per block */
+ vpshufd $0x55,%xmm11,%xmm9
+ vmovdqa %xmm8,64(%rsp)
+ vpshufd $0xaa,%xmm11,%xmm10
+ vmovdqa %xmm9,80(%rsp)
+ vpshufd $0xff,%xmm11,%xmm11
+ vmovdqa %xmm10,96(%rsp)
+ vmovdqa %xmm11,112(%rsp)
+
+ vpshufd $0x00,%xmm3,%xmm0
+ vpshufd $0x55,%xmm3,%xmm1
+ vmovdqa %xmm0,128-256(%rcx)
+ vpshufd $0xaa,%xmm3,%xmm2
+ vmovdqa %xmm1,144-256(%rcx)
+ vpshufd $0xff,%xmm3,%xmm3
+ vmovdqa %xmm2,160-256(%rcx)
+ vmovdqa %xmm3,176-256(%rcx)
+
+ vpshufd $0x00,%xmm15,%xmm12
+ vpshufd $0x55,%xmm15,%xmm13
+ vmovdqa %xmm12,192-256(%rcx)
+ vpshufd $0xaa,%xmm15,%xmm14
+ vmovdqa %xmm13,208-256(%rcx)
+ vpshufd $0xff,%xmm15,%xmm15
+ vmovdqa %xmm14,224-256(%rcx)
+ vmovdqa %xmm15,240-256(%rcx)
+
+ vpshufd $0x00,%xmm7,%xmm4
+ vpshufd $0x55,%xmm7,%xmm5
+ vpaddd .Linc(%rip),%xmm4,%xmm4  /* add the per-lane increments from .Linc to the first dword of row 3 */
+ vpshufd $0xaa,%xmm7,%xmm6
+ vmovdqa %xmm5,272-256(%rcx)
+ vpshufd $0xff,%xmm7,%xmm7
+ vmovdqa %xmm6,288-256(%rcx)
+ vmovdqa %xmm7,304-256(%rcx)
+
+ jmp .Loop_enter4xop
+
+.align 32
+.Loop_outer4xop:  /* reload the saved input state for the next 256 bytes */
+ vmovdqa 64(%rsp),%xmm8
+ vmovdqa 80(%rsp),%xmm9
+ vmovdqa 96(%rsp),%xmm10
+ vmovdqa 112(%rsp),%xmm11
+ vmovdqa 128-256(%rcx),%xmm0
+ vmovdqa 144-256(%rcx),%xmm1
+ vmovdqa 160-256(%rcx),%xmm2
+ vmovdqa 176-256(%rcx),%xmm3
+ vmovdqa 192-256(%rcx),%xmm12
+ vmovdqa 208-256(%rcx),%xmm13
+ vmovdqa 224-256(%rcx),%xmm14
+ vmovdqa 240-256(%rcx),%xmm15
+ vmovdqa 256-256(%rcx),%xmm4
+ vmovdqa 272-256(%rcx),%xmm5
+ vmovdqa 288-256(%rcx),%xmm6
+ vmovdqa 304-256(%rcx),%xmm7
+ vpaddd .Lfour(%rip),%xmm4,%xmm4  /* advance the per-lane counters by the values at .Lfour */
+
+.Loop_enter4xop:
+ movl $10,%eax  /* 10 iterations of .Loop4xop; each iteration runs two half-rounds */
+ vmovdqa %xmm4,256-256(%rcx)  /* save the counter lanes used for this batch */
+ jmp .Loop4xop
+
+.align 32
+.Loop4xop:
+ vpaddd %xmm0,%xmm8,%xmm8  /* first half-round: xmm8-11 += xmm0-3, then xmm4-7 ^= xmm8-11, rotate by 16 */
+ vpaddd %xmm1,%xmm9,%xmm9
+ vpaddd %xmm2,%xmm10,%xmm10
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor %xmm4,%xmm8,%xmm4
+ vpxor %xmm5,%xmm9,%xmm5
+ vpxor %xmm6,%xmm10,%xmm6
+ vpxor %xmm7,%xmm11,%xmm7
+.byte 143,232,120,194,228,16  /* vprotd $16,%xmm4,%xmm4 */
+.byte 143,232,120,194,237,16  /* vprotd $16,%xmm5,%xmm5 */
+.byte 143,232,120,194,246,16  /* vprotd $16,%xmm6,%xmm6 */
+.byte 143,232,120,194,255,16  /* vprotd $16,%xmm7,%xmm7 */
+ vpaddd %xmm4,%xmm12,%xmm12
+ vpaddd %xmm5,%xmm13,%xmm13
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm15,%xmm15
+ vpxor %xmm0,%xmm12,%xmm0
+ vpxor %xmm1,%xmm13,%xmm1
+ vpxor %xmm14,%xmm2,%xmm2
+ vpxor %xmm15,%xmm3,%xmm3
+.byte 143,232,120,194,192,12  /* vprotd $12,%xmm0,%xmm0 */
+.byte 143,232,120,194,201,12  /* vprotd $12,%xmm1,%xmm1 */
+.byte 143,232,120,194,210,12  /* vprotd $12,%xmm2,%xmm2 */
+.byte 143,232,120,194,219,12  /* vprotd $12,%xmm3,%xmm3 */
+ vpaddd %xmm8,%xmm0,%xmm8
+ vpaddd %xmm9,%xmm1,%xmm9
+ vpaddd %xmm2,%xmm10,%xmm10
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor %xmm4,%xmm8,%xmm4
+ vpxor %xmm5,%xmm9,%xmm5
+ vpxor %xmm6,%xmm10,%xmm6
+ vpxor %xmm7,%xmm11,%xmm7
+.byte 143,232,120,194,228,8  /* vprotd $8 on xmm4..xmm7 */
+.byte 143,232,120,194,237,8
+.byte 143,232,120,194,246,8
+.byte 143,232,120,194,255,8
+ vpaddd %xmm4,%xmm12,%xmm12
+ vpaddd %xmm5,%xmm13,%xmm13
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm15,%xmm15
+ vpxor %xmm0,%xmm12,%xmm0
+ vpxor %xmm1,%xmm13,%xmm1
+ vpxor %xmm14,%xmm2,%xmm2
+ vpxor %xmm15,%xmm3,%xmm3
+.byte 143,232,120,194,192,7  /* vprotd $7 on xmm0..xmm3 */
+.byte 143,232,120,194,201,7
+.byte 143,232,120,194,210,7
+.byte 143,232,120,194,219,7
+ vpaddd %xmm1,%xmm8,%xmm8  /* second half-round: same steps with the xmm0-3, xmm12-15 and xmm4-7 operands rotated by one, two and three registers */
+ vpaddd %xmm2,%xmm9,%xmm9
+ vpaddd %xmm3,%xmm10,%xmm10
+ vpaddd %xmm0,%xmm11,%xmm11
+ vpxor %xmm7,%xmm8,%xmm7
+ vpxor %xmm4,%xmm9,%xmm4
+ vpxor %xmm5,%xmm10,%xmm5
+ vpxor %xmm6,%xmm11,%xmm6
+.byte 143,232,120,194,255,16
+.byte 143,232,120,194,228,16
+.byte 143,232,120,194,237,16
+.byte 143,232,120,194,246,16
+ vpaddd %xmm7,%xmm14,%xmm14
+ vpaddd %xmm4,%xmm15,%xmm15
+ vpaddd %xmm5,%xmm12,%xmm12
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpxor %xmm1,%xmm14,%xmm1
+ vpxor %xmm2,%xmm15,%xmm2
+ vpxor %xmm12,%xmm3,%xmm3
+ vpxor %xmm13,%xmm0,%xmm0
+.byte 143,232,120,194,201,12
+.byte 143,232,120,194,210,12
+.byte 143,232,120,194,219,12
+.byte 143,232,120,194,192,12
+ vpaddd %xmm8,%xmm1,%xmm8
+ vpaddd %xmm9,%xmm2,%xmm9
+ vpaddd %xmm3,%xmm10,%xmm10
+ vpaddd %xmm0,%xmm11,%xmm11
+ vpxor %xmm7,%xmm8,%xmm7
+ vpxor %xmm4,%xmm9,%xmm4
+ vpxor %xmm5,%xmm10,%xmm5
+ vpxor %xmm6,%xmm11,%xmm6
+.byte 143,232,120,194,255,8
+.byte 143,232,120,194,228,8
+.byte 143,232,120,194,237,8
+.byte 143,232,120,194,246,8
+ vpaddd %xmm7,%xmm14,%xmm14
+ vpaddd %xmm4,%xmm15,%xmm15
+ vpaddd %xmm5,%xmm12,%xmm12
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpxor %xmm1,%xmm14,%xmm1
+ vpxor %xmm2,%xmm15,%xmm2
+ vpxor %xmm12,%xmm3,%xmm3
+ vpxor %xmm13,%xmm0,%xmm0
+.byte 143,232,120,194,201,7
+.byte 143,232,120,194,210,7
+.byte 143,232,120,194,219,7
+.byte 143,232,120,194,192,7
+ decl %eax
+ jnz .Loop4xop
+
+ vpaddd 64(%rsp),%xmm8,%xmm8  /* add the saved input state to the round output, row by row */
+ vpaddd 80(%rsp),%xmm9,%xmm9
+ vpaddd 96(%rsp),%xmm10,%xmm10
+ vpaddd 112(%rsp),%xmm11,%xmm11
+
+ vmovdqa %xmm14,32(%rsp)  /* spill xmm14/xmm15 so they can serve as scratch for the transpose */
+ vmovdqa %xmm15,48(%rsp)
+
+ vpunpckldq %xmm9,%xmm8,%xmm14  /* 4x4 dword transpose: from word-per-register to block-per-register layout */
+ vpunpckldq %xmm11,%xmm10,%xmm15
+ vpunpckhdq %xmm9,%xmm8,%xmm8
+ vpunpckhdq %xmm11,%xmm10,%xmm10
+ vpunpcklqdq %xmm15,%xmm14,%xmm9
+ vpunpckhqdq %xmm15,%xmm14,%xmm14
+ vpunpcklqdq %xmm10,%xmm8,%xmm11
+ vpunpckhqdq %xmm10,%xmm8,%xmm8
+ vpaddd 128-256(%rcx),%xmm0,%xmm0
+ vpaddd 144-256(%rcx),%xmm1,%xmm1
+ vpaddd 160-256(%rcx),%xmm2,%xmm2
+ vpaddd 176-256(%rcx),%xmm3,%xmm3
+
+ vmovdqa %xmm9,0(%rsp)  /* park two transposed rows on the stack and bring back the spilled xmm14/xmm15 values */
+ vmovdqa %xmm14,16(%rsp)
+ vmovdqa 32(%rsp),%xmm9
+ vmovdqa 48(%rsp),%xmm14
+
+ vpunpckldq %xmm1,%xmm0,%xmm10
+ vpunpckldq %xmm3,%xmm2,%xmm15
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm15,%xmm10,%xmm1
+ vpunpckhqdq %xmm15,%xmm10,%xmm10
+ vpunpcklqdq %xmm2,%xmm0,%xmm3
+ vpunpckhqdq %xmm2,%xmm0,%xmm0
+ vpaddd 192-256(%rcx),%xmm12,%xmm12
+ vpaddd 208-256(%rcx),%xmm13,%xmm13
+ vpaddd 224-256(%rcx),%xmm9,%xmm9
+ vpaddd 240-256(%rcx),%xmm14,%xmm14
+
+ vpunpckldq %xmm13,%xmm12,%xmm2
+ vpunpckldq %xmm14,%xmm9,%xmm15
+ vpunpckhdq %xmm13,%xmm12,%xmm12
+ vpunpckhdq %xmm14,%xmm9,%xmm9
+ vpunpcklqdq %xmm15,%xmm2,%xmm13
+ vpunpckhqdq %xmm15,%xmm2,%xmm2
+ vpunpcklqdq %xmm9,%xmm12,%xmm14
+ vpunpckhqdq %xmm9,%xmm12,%xmm12
+ vpaddd 256-256(%rcx),%xmm4,%xmm4
+ vpaddd 272-256(%rcx),%xmm5,%xmm5
+ vpaddd 288-256(%rcx),%xmm6,%xmm6
+ vpaddd 304-256(%rcx),%xmm7,%xmm7
+
+ vpunpckldq %xmm5,%xmm4,%xmm9
+ vpunpckldq %xmm7,%xmm6,%xmm15
+ vpunpckhdq %xmm5,%xmm4,%xmm4
+ vpunpckhdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm15,%xmm9,%xmm5
+ vpunpckhqdq %xmm15,%xmm9,%xmm9
+ vpunpcklqdq %xmm6,%xmm4,%xmm7
+ vpunpckhqdq %xmm6,%xmm4,%xmm4
+ vmovdqa 0(%rsp),%xmm6
+ vmovdqa 16(%rsp),%xmm15
+
+ cmpq $256,%rdx  /* fewer than 256 bytes left: handle them in .Ltail4xop */
+ jb .Ltail4xop
+
+ vpxor 0(%rsi),%xmm6,%xmm6  /* xor 256 bytes of input with keystream; block order is xmm6,1,13,5 / 15,10,2,9 / 11,3,14,7 / 8,0,12,4 */
+ vpxor 16(%rsi),%xmm1,%xmm1
+ vpxor 32(%rsi),%xmm13,%xmm13
+ vpxor 48(%rsi),%xmm5,%xmm5
+ vpxor 64(%rsi),%xmm15,%xmm15
+ vpxor 80(%rsi),%xmm10,%xmm10
+ vpxor 96(%rsi),%xmm2,%xmm2
+ vpxor 112(%rsi),%xmm9,%xmm9
+ leaq 128(%rsi),%rsi
+ vpxor 0(%rsi),%xmm11,%xmm11
+ vpxor 16(%rsi),%xmm3,%xmm3
+ vpxor 32(%rsi),%xmm14,%xmm14
+ vpxor 48(%rsi),%xmm7,%xmm7
+ vpxor 64(%rsi),%xmm8,%xmm8
+ vpxor 80(%rsi),%xmm0,%xmm0
+ vpxor 96(%rsi),%xmm12,%xmm12
+ vpxor 112(%rsi),%xmm4,%xmm4
+ leaq 128(%rsi),%rsi
+
+ vmovdqu %xmm6,0(%rdi)
+ vmovdqu %xmm1,16(%rdi)
+ vmovdqu %xmm13,32(%rdi)
+ vmovdqu %xmm5,48(%rdi)
+ vmovdqu %xmm15,64(%rdi)
+ vmovdqu %xmm10,80(%rdi)
+ vmovdqu %xmm2,96(%rdi)
+ vmovdqu %xmm9,112(%rdi)
+ leaq 128(%rdi),%rdi
+ vmovdqu %xmm11,0(%rdi)
+ vmovdqu %xmm3,16(%rdi)
+ vmovdqu %xmm14,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ vmovdqu %xmm8,64(%rdi)
+ vmovdqu %xmm0,80(%rdi)
+ vmovdqu %xmm12,96(%rdi)
+ vmovdqu %xmm4,112(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $256,%rdx
+ jnz .Loop_outer4xop
+
+ jmp .Ldone4xop
+
+.align 32
+.Ltail4xop:  /* rdx < 256: emit whole 64-byte blocks first, then finish byte-wise from a keystream block copied to the stack */
+ cmpq $192,%rdx
+ jae .L192_or_more4xop
+ cmpq $128,%rdx
+ jae .L128_or_more4xop
+ cmpq $64,%rdx
+ jae .L64_or_more4xop
+
+ xorq %r10,%r10  /* r10 = byte index for .Loop_tail4xop */
+ vmovdqa %xmm6,0(%rsp)  /* first keystream block -> 0..63(%rsp) */
+ vmovdqa %xmm1,16(%rsp)
+ vmovdqa %xmm13,32(%rsp)
+ vmovdqa %xmm5,48(%rsp)
+ jmp .Loop_tail4xop
+
+.align 32
+.L64_or_more4xop:
+ vpxor 0(%rsi),%xmm6,%xmm6
+ vpxor 16(%rsi),%xmm1,%xmm1
+ vpxor 32(%rsi),%xmm13,%xmm13
+ vpxor 48(%rsi),%xmm5,%xmm5
+ vmovdqu %xmm6,0(%rdi)
+ vmovdqu %xmm1,16(%rdi)
+ vmovdqu %xmm13,32(%rdi)
+ vmovdqu %xmm5,48(%rdi)
+ je .Ldone4xop  /* ZF still comes from cmpq $64,%rdx in .Ltail4xop: exactly 64 bytes means nothing is left */
+
+ leaq 64(%rsi),%rsi
+ vmovdqa %xmm15,0(%rsp)  /* second keystream block -> stack for the byte loop */
+ xorq %r10,%r10
+ vmovdqa %xmm10,16(%rsp)
+ leaq 64(%rdi),%rdi
+ vmovdqa %xmm2,32(%rsp)
+ subq $64,%rdx
+ vmovdqa %xmm9,48(%rsp)
+ jmp .Loop_tail4xop
+
+.align 32
+.L128_or_more4xop:
+ vpxor 0(%rsi),%xmm6,%xmm6
+ vpxor 16(%rsi),%xmm1,%xmm1
+ vpxor 32(%rsi),%xmm13,%xmm13
+ vpxor 48(%rsi),%xmm5,%xmm5
+ vpxor 64(%rsi),%xmm15,%xmm15
+ vpxor 80(%rsi),%xmm10,%xmm10
+ vpxor 96(%rsi),%xmm2,%xmm2
+ vpxor 112(%rsi),%xmm9,%xmm9
+
+ vmovdqu %xmm6,0(%rdi)
+ vmovdqu %xmm1,16(%rdi)
+ vmovdqu %xmm13,32(%rdi)
+ vmovdqu %xmm5,48(%rdi)
+ vmovdqu %xmm15,64(%rdi)
+ vmovdqu %xmm10,80(%rdi)
+ vmovdqu %xmm2,96(%rdi)
+ vmovdqu %xmm9,112(%rdi)
+ je .Ldone4xop  /* ZF from cmpq $128,%rdx: exactly 128 bytes */
+
+ leaq 128(%rsi),%rsi
+ vmovdqa %xmm11,0(%rsp)  /* third keystream block -> stack */
+ xorq %r10,%r10
+ vmovdqa %xmm3,16(%rsp)
+ leaq 128(%rdi),%rdi
+ vmovdqa %xmm14,32(%rsp)
+ subq $128,%rdx
+ vmovdqa %xmm7,48(%rsp)
+ jmp .Loop_tail4xop
+
+.align 32
+.L192_or_more4xop:
+ vpxor 0(%rsi),%xmm6,%xmm6
+ vpxor 16(%rsi),%xmm1,%xmm1
+ vpxor 32(%rsi),%xmm13,%xmm13
+ vpxor 48(%rsi),%xmm5,%xmm5
+ vpxor 64(%rsi),%xmm15,%xmm15
+ vpxor 80(%rsi),%xmm10,%xmm10
+ vpxor 96(%rsi),%xmm2,%xmm2
+ vpxor 112(%rsi),%xmm9,%xmm9
+ leaq 128(%rsi),%rsi
+ vpxor 0(%rsi),%xmm11,%xmm11
+ vpxor 16(%rsi),%xmm3,%xmm3
+ vpxor 32(%rsi),%xmm14,%xmm14
+ vpxor 48(%rsi),%xmm7,%xmm7
+
+ vmovdqu %xmm6,0(%rdi)
+ vmovdqu %xmm1,16(%rdi)
+ vmovdqu %xmm13,32(%rdi)
+ vmovdqu %xmm5,48(%rdi)
+ vmovdqu %xmm15,64(%rdi)
+ vmovdqu %xmm10,80(%rdi)
+ vmovdqu %xmm2,96(%rdi)
+ vmovdqu %xmm9,112(%rdi)
+ leaq 128(%rdi),%rdi
+ vmovdqu %xmm11,0(%rdi)
+ vmovdqu %xmm3,16(%rdi)
+ vmovdqu %xmm14,32(%rdi)
+ vmovdqu %xmm7,48(%rdi)
+ je .Ldone4xop  /* ZF from cmpq $192,%rdx: exactly 192 bytes */
+
+ leaq 64(%rsi),%rsi  /* rsi and rdi were already advanced by 128 above, so only 64 more here */
+ vmovdqa %xmm8,0(%rsp)  /* fourth keystream block -> stack */
+ xorq %r10,%r10
+ vmovdqa %xmm0,16(%rsp)
+ leaq 64(%rdi),%rdi
+ vmovdqa %xmm12,32(%rsp)
+ subq $192,%rdx
+ vmovdqa %xmm4,48(%rsp)
+
+.Loop_tail4xop:  /* byte loop: out[i] = in[i] ^ keystream[i] for the remaining rdx bytes */
+ movzbl (%rsi,%r10,1),%eax
+ movzbl (%rsp,%r10,1),%ecx
+ leaq 1(%r10),%r10
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r10,1)  /* -1 compensates for r10 having been incremented already */
+ decq %rdx
+ jnz .Loop_tail4xop
+
+.Ldone4xop:
+ vzeroupper
+ leaq (%r9),%rsp  /* restore the stack pointer saved at entry */
+.cfi_def_cfa_register %rsp
+.L4xop_epilogue:
+ .byte 0xf3,0xc3  /* rep ret */
+.cfi_endproc
+.size ChaCha20_4xop,.-ChaCha20_4xop
+.type ChaCha20_8x,@function
+.align 32
+ChaCha20_8x:  /* ChaCha20 code path on 256-bit ymm registers; produces 512 bytes (eight 64-byte blocks) of keystream per outer iteration */
+.cfi_startproc
+.LChaCha20_8x:  /* also entered by a jump from ChaCha20_4x */
+ movq %rsp,%r9  /* r9 keeps the incoming stack pointer; restored at .Ldone8x */
+.cfi_def_cfa_register %r9
+ subq $0x280+8,%rsp
+ andq $-32,%rsp  /* 32-byte align the frame for the vmovdqa ymm accesses */
+ vzeroupper
+
+/* Register use visible in this function: */
+/*   rdi = output pointer, rsi = input pointer, rdx = remaining byte count */
+/*   rcx -> 32 bytes and r8 -> 16 bytes loaded as state rows (presumably key and counter/nonce) */
+/* Stack frame (after alignment): */
+/*   0..127(%rsp)    scratch: spilled rows during rounds, keystream block for the byte tail */
+/*   128..639(%rsp)  saved input state, one 32-byte slot per state word */
+/* NOTE(review): the byte tail loop is a do-while on rdx, so rdx is presumably non-zero on entry - verify against callers */
+
+
+ vbroadcasti128 .Lsigma(%rip),%ymm11  /* each 16-byte state row is duplicated into both 128-bit halves */
+ vbroadcasti128 (%rcx),%ymm3
+ vbroadcasti128 16(%rcx),%ymm15
+ vbroadcasti128 (%r8),%ymm7
+ leaq 256(%rsp),%rcx  /* rcx = rsp+256 and rax = rsp+512: biased bases for the saved state */
+ leaq 512(%rsp),%rax
+ leaq .Lrot16(%rip),%r10  /* r10, r11 -> vpshufb masks used for the two byte-granular rotations */
+ leaq .Lrot24(%rip),%r11
+
+ vpshufd $0x00,%ymm11,%ymm8  /* broadcast each state dword: one register per state word, one lane per block */
+ vpshufd $0x55,%ymm11,%ymm9
+ vmovdqa %ymm8,128-256(%rcx)
+ vpshufd $0xaa,%ymm11,%ymm10
+ vmovdqa %ymm9,160-256(%rcx)
+ vpshufd $0xff,%ymm11,%ymm11
+ vmovdqa %ymm10,192-256(%rcx)
+ vmovdqa %ymm11,224-256(%rcx)
+
+ vpshufd $0x00,%ymm3,%ymm0
+ vpshufd $0x55,%ymm3,%ymm1
+ vmovdqa %ymm0,256-256(%rcx)
+ vpshufd $0xaa,%ymm3,%ymm2
+ vmovdqa %ymm1,288-256(%rcx)
+ vpshufd $0xff,%ymm3,%ymm3
+ vmovdqa %ymm2,320-256(%rcx)
+ vmovdqa %ymm3,352-256(%rcx)
+
+ vpshufd $0x00,%ymm15,%ymm12
+ vpshufd $0x55,%ymm15,%ymm13
+ vmovdqa %ymm12,384-512(%rax)
+ vpshufd $0xaa,%ymm15,%ymm14
+ vmovdqa %ymm13,416-512(%rax)
+ vpshufd $0xff,%ymm15,%ymm15
+ vmovdqa %ymm14,448-512(%rax)
+ vmovdqa %ymm15,480-512(%rax)
+
+ vpshufd $0x00,%ymm7,%ymm4
+ vpshufd $0x55,%ymm7,%ymm5
+ vpaddd .Lincy(%rip),%ymm4,%ymm4  /* add the per-lane increments from .Lincy to the first dword of row 3 */
+ vpshufd $0xaa,%ymm7,%ymm6
+ vmovdqa %ymm5,544-512(%rax)
+ vpshufd $0xff,%ymm7,%ymm7
+ vmovdqa %ymm6,576-512(%rax)
+ vmovdqa %ymm7,608-512(%rax)
+
+ jmp .Loop_enter8x
+
+.align 32
+.Loop_outer8x:  /* reload the saved input state for the next 512 bytes */
+ vmovdqa 128-256(%rcx),%ymm8
+ vmovdqa 160-256(%rcx),%ymm9
+ vmovdqa 192-256(%rcx),%ymm10
+ vmovdqa 224-256(%rcx),%ymm11
+ vmovdqa 256-256(%rcx),%ymm0
+ vmovdqa 288-256(%rcx),%ymm1
+ vmovdqa 320-256(%rcx),%ymm2
+ vmovdqa 352-256(%rcx),%ymm3
+ vmovdqa 384-512(%rax),%ymm12
+ vmovdqa 416-512(%rax),%ymm13
+ vmovdqa 448-512(%rax),%ymm14
+ vmovdqa 480-512(%rax),%ymm15
+ vmovdqa 512-512(%rax),%ymm4
+ vmovdqa 544-512(%rax),%ymm5
+ vmovdqa 576-512(%rax),%ymm6
+ vmovdqa 608-512(%rax),%ymm7
+ vpaddd .Leight(%rip),%ymm4,%ymm4  /* advance the per-lane counters by the values at .Leight */
+
+.Loop_enter8x:
+ vmovdqa %ymm14,64(%rsp)  /* spill ymm14/ymm15: both are needed as mask and scratch registers during the rounds */
+ vmovdqa %ymm15,96(%rsp)
+ vbroadcasti128 (%r10),%ymm15  /* ymm15 = mask from .Lrot16 */
+ vmovdqa %ymm4,512-512(%rax)  /* save the counter lanes used for this batch */
+ movl $10,%eax  /* 10 loop iterations; this overwrites the rax base pointer, which is rebuilt after the loop */
+ jmp .Loop8x
+
+.align 32
+.Loop8x:
+ vpaddd %ymm0,%ymm8,%ymm8  /* first half-round, first two quarter-rounds interleaved (ymm8/0/12/4 and ymm9/1/13/5) */
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4  /* byte-granular rotate via the .Lrot16 mask */
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $12,%ymm0,%ymm14  /* rotate left by 12 built from shift left 12, shift right 20, or */
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ vbroadcasti128 (%r11),%ymm14  /* ymm14 = mask from .Lrot24 */
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $12,%ymm1,%ymm15
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4  /* byte-granular rotate via the .Lrot24 mask */
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $7,%ymm0,%ymm15  /* rotate left by 7 built from shift left 7, shift right 25, or */
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vbroadcasti128 (%r10),%ymm15  /* reload the .Lrot16 mask; ymm15 was used as scratch */
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $7,%ymm1,%ymm14
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vmovdqa %ymm12,0(%rsp)  /* swap the live third-row pair: park the two just used, load the other two from 64/96(%rsp) */
+ vmovdqa %ymm13,32(%rsp)
+ vmovdqa 64(%rsp),%ymm12
+ vmovdqa 96(%rsp),%ymm13
+ vpaddd %ymm2,%ymm10,%ymm10  /* first half-round, remaining two quarter-rounds (ymm10/2/12/6 and ymm11/3/13/7) */
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $12,%ymm2,%ymm14
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $12,%ymm3,%ymm15
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $7,%ymm2,%ymm15
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $7,%ymm3,%ymm14
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vpaddd %ymm1,%ymm8,%ymm8  /* second half-round: same steps with the ymm0-3 and ymm4-7 operands rotated (ymm8/1/7, ymm9/2/4, ...) */
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $12,%ymm1,%ymm14
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $12,%ymm2,%ymm15
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $7,%ymm1,%ymm15
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $7,%ymm2,%ymm14
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vmovdqa %ymm12,64(%rsp)  /* swap the live third-row pair back: park at 64/96(%rsp), reload from 0/32(%rsp) */
+ vmovdqa %ymm13,96(%rsp)
+ vmovdqa 0(%rsp),%ymm12
+ vmovdqa 32(%rsp),%ymm13
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $12,%ymm3,%ymm14
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $12,%ymm0,%ymm15
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $7,%ymm3,%ymm15
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $7,%ymm0,%ymm14
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ decl %eax
+ jnz .Loop8x
+
+ leaq 512(%rsp),%rax  /* rebuild the rax base pointer that the round counter overwrote */
+ vpaddd 128-256(%rcx),%ymm8,%ymm8  /* add the saved input state to the round output */
+ vpaddd 160-256(%rcx),%ymm9,%ymm9
+ vpaddd 192-256(%rcx),%ymm10,%ymm10
+ vpaddd 224-256(%rcx),%ymm11,%ymm11
+
+ vpunpckldq %ymm9,%ymm8,%ymm14  /* dword transpose within each 128-bit half */
+ vpunpckldq %ymm11,%ymm10,%ymm15
+ vpunpckhdq %ymm9,%ymm8,%ymm8
+ vpunpckhdq %ymm11,%ymm10,%ymm10
+ vpunpcklqdq %ymm15,%ymm14,%ymm9
+ vpunpckhqdq %ymm15,%ymm14,%ymm14
+ vpunpcklqdq %ymm10,%ymm8,%ymm11
+ vpunpckhqdq %ymm10,%ymm8,%ymm8
+ vpaddd 256-256(%rcx),%ymm0,%ymm0
+ vpaddd 288-256(%rcx),%ymm1,%ymm1
+ vpaddd 320-256(%rcx),%ymm2,%ymm2
+ vpaddd 352-256(%rcx),%ymm3,%ymm3
+
+ vpunpckldq %ymm1,%ymm0,%ymm10
+ vpunpckldq %ymm3,%ymm2,%ymm15
+ vpunpckhdq %ymm1,%ymm0,%ymm0
+ vpunpckhdq %ymm3,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm10,%ymm1
+ vpunpckhqdq %ymm15,%ymm10,%ymm10
+ vpunpcklqdq %ymm2,%ymm0,%ymm3
+ vpunpckhqdq %ymm2,%ymm0,%ymm0
+ vperm2i128 $0x20,%ymm1,%ymm9,%ymm15  /* combine matching 128-bit halves of two registers (0x20 = both low halves, 0x31 = both high halves) */
+ vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
+ vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
+ vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
+ vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
+ vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
+ vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
+ vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
+ vmovdqa %ymm15,0(%rsp)  /* park two finished registers; bring back the third-row pair spilled at 64/96(%rsp) */
+ vmovdqa %ymm9,32(%rsp)
+ vmovdqa 64(%rsp),%ymm15
+ vmovdqa 96(%rsp),%ymm9
+
+ vpaddd 384-512(%rax),%ymm12,%ymm12
+ vpaddd 416-512(%rax),%ymm13,%ymm13
+ vpaddd 448-512(%rax),%ymm15,%ymm15
+ vpaddd 480-512(%rax),%ymm9,%ymm9
+
+ vpunpckldq %ymm13,%ymm12,%ymm2
+ vpunpckldq %ymm9,%ymm15,%ymm8
+ vpunpckhdq %ymm13,%ymm12,%ymm12
+ vpunpckhdq %ymm9,%ymm15,%ymm15
+ vpunpcklqdq %ymm8,%ymm2,%ymm13
+ vpunpckhqdq %ymm8,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm12,%ymm9
+ vpunpckhqdq %ymm15,%ymm12,%ymm12
+ vpaddd 512-512(%rax),%ymm4,%ymm4
+ vpaddd 544-512(%rax),%ymm5,%ymm5
+ vpaddd 576-512(%rax),%ymm6,%ymm6
+ vpaddd 608-512(%rax),%ymm7,%ymm7
+
+ vpunpckldq %ymm5,%ymm4,%ymm15
+ vpunpckldq %ymm7,%ymm6,%ymm8
+ vpunpckhdq %ymm5,%ymm4,%ymm4
+ vpunpckhdq %ymm7,%ymm6,%ymm6
+ vpunpcklqdq %ymm8,%ymm15,%ymm5
+ vpunpckhqdq %ymm8,%ymm15,%ymm15
+ vpunpcklqdq %ymm6,%ymm4,%ymm7
+ vpunpckhqdq %ymm6,%ymm4,%ymm4
+ vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
+ vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
+ vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
+ vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
+ vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
+ vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
+ vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
+ vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
+ vmovdqa 0(%rsp),%ymm6
+ vmovdqa 32(%rsp),%ymm12
+
+ cmpq $512,%rdx  /* fewer than 512 bytes left: handle them in .Ltail8x */
+ jb .Ltail8x
+
+ vpxor 0(%rsi),%ymm6,%ymm6  /* xor 512 bytes of input with keystream; order is ymm6,8,1,5 / 12,13,10,15 / 14,2,3,7 / 11,9,0,4 */
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm12,%ymm12
+ vpxor 32(%rsi),%ymm13,%ymm13
+ vpxor 64(%rsi),%ymm10,%ymm10
+ vpxor 96(%rsi),%ymm15,%ymm15
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm12,0(%rdi)
+ vmovdqu %ymm13,32(%rdi)
+ vmovdqu %ymm10,64(%rdi)
+ vmovdqu %ymm15,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm14,%ymm14
+ vpxor 32(%rsi),%ymm2,%ymm2
+ vpxor 64(%rsi),%ymm3,%ymm3
+ vpxor 96(%rsi),%ymm7,%ymm7
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm14,0(%rdi)
+ vmovdqu %ymm2,32(%rdi)
+ vmovdqu %ymm3,64(%rdi)
+ vmovdqu %ymm7,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm11,%ymm11
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vpxor 64(%rsi),%ymm0,%ymm0
+ vpxor 96(%rsi),%ymm4,%ymm4
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm11,0(%rdi)
+ vmovdqu %ymm9,32(%rdi)
+ vmovdqu %ymm0,64(%rdi)
+ vmovdqu %ymm4,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $512,%rdx
+ jnz .Loop_outer8x
+
+ jmp .Ldone8x
+
+.Ltail8x:  /* rdx < 512: emit whole 64-byte blocks first, then finish byte-wise from a keystream block copied to the stack */
+ cmpq $448,%rdx
+ jae .L448_or_more8x
+ cmpq $384,%rdx
+ jae .L384_or_more8x
+ cmpq $320,%rdx
+ jae .L320_or_more8x
+ cmpq $256,%rdx
+ jae .L256_or_more8x
+ cmpq $192,%rdx
+ jae .L192_or_more8x
+ cmpq $128,%rdx
+ jae .L128_or_more8x
+ cmpq $64,%rdx
+ jae .L64_or_more8x
+
+ xorq %r10,%r10  /* r10 = byte index for .Loop_tail8x */
+ vmovdqa %ymm6,0(%rsp)  /* first keystream block -> 0..63(%rsp) */
+ vmovdqa %ymm8,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L64_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ je .Ldone8x  /* ZF still comes from the matching cmpq in .Ltail8x: an exact multiple of 64 leaves nothing for the byte loop */
+
+ leaq 64(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm1,0(%rsp)  /* next keystream block -> stack for the byte loop */
+ leaq 64(%rdi),%rdi
+ subq $64,%rdx
+ vmovdqa %ymm5,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L128_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ je .Ldone8x
+
+ leaq 128(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm12,0(%rsp)
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ vmovdqa %ymm13,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L192_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ je .Ldone8x
+
+ leaq 192(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm10,0(%rsp)
+ leaq 192(%rdi),%rdi
+ subq $192,%rdx
+ vmovdqa %ymm15,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L256_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ je .Ldone8x
+
+ leaq 256(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm14,0(%rsp)
+ leaq 256(%rdi),%rdi
+ subq $256,%rdx
+ vmovdqa %ymm2,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L320_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ je .Ldone8x
+
+ leaq 320(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm3,0(%rsp)
+ leaq 320(%rdi),%rdi
+ subq $320,%rdx
+ vmovdqa %ymm7,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L384_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ je .Ldone8x
+
+ leaq 384(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm11,0(%rsp)
+ leaq 384(%rdi),%rdi
+ subq $384,%rdx
+ vmovdqa %ymm9,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L448_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vpxor 384(%rsi),%ymm11,%ymm11
+ vpxor 416(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ vmovdqu %ymm11,384(%rdi)
+ vmovdqu %ymm9,416(%rdi)
+ je .Ldone8x
+
+ leaq 448(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm0,0(%rsp)
+ leaq 448(%rdi),%rdi
+ subq $448,%rdx
+ vmovdqa %ymm4,32(%rsp)
+
+.Loop_tail8x:  /* byte loop: out[i] = in[i] ^ keystream[i] for the remaining rdx bytes */
+ movzbl (%rsi,%r10,1),%eax
+ movzbl (%rsp,%r10,1),%ecx
+ leaq 1(%r10),%r10
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r10,1)  /* -1 compensates for r10 having been incremented already */
+ decq %rdx
+ jnz .Loop_tail8x
+
+.Ldone8x:
+ vzeroall  /* zero all ymm registers before returning */
+ leaq (%r9),%rsp  /* restore the stack pointer saved at entry */
+.cfi_def_cfa_register %rsp
+.L8x_epilogue:
+ .byte 0xf3,0xc3  /* rep ret */
+.cfi_endproc
+.size ChaCha20_8x,.-ChaCha20_8x
diff --git a/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S b/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S
index c69b4d978f39..df18fa496de4 100644
--- a/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S
+++ b/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S
@@ -2790,6 +2790,10 @@ ecp_nistz256_neg:
.align 32
ecp_nistz256_ord_mul_mont:
.cfi_startproc
+ movl $0x80100,%ecx
+ andl OPENSSL_ia32cap_P+8(%rip),%ecx
+ cmpl $0x80100,%ecx
+ je .Lecp_nistz256_ord_mul_montx
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -3118,6 +3122,10 @@ ecp_nistz256_ord_mul_mont:
.align 32
ecp_nistz256_ord_sqr_mont:
.cfi_startproc
+ movl $0x80100,%ecx
+ andl OPENSSL_ia32cap_P+8(%rip),%ecx
+ cmpl $0x80100,%ecx
+ je .Lecp_nistz256_ord_sqr_montx
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -3405,6 +3413,462 @@ ecp_nistz256_ord_sqr_mont:
.cfi_endproc
.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+.type ecp_nistz256_ord_mul_montx,@function /* MULX/ADCX/ADOX path: 4x64-bit multiply with interleaved reduction by the .Lord words; presumably Montgomery multiplication mod the P-256 group order (name-based, confirm) */
+.align 32
+ecp_nistz256_ord_mul_montx: /* in: rdi = 4-word result, rsi = 4-word operand a, rdx = 4-word operand b */
+.cfi_startproc
+.Lecp_nistz256_ord_mul_montx: /* local entry: ecp_nistz256_ord_mul_mont jumps here when (OPENSSL_ia32cap_P+8 & 0x80100) == 0x80100 */
+ pushq %rbp /* save the callee-saved registers used below */
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lord_mulx_body:
+
+ movq %rdx,%rbx /* rbx = pointer to b; rdx is needed as the implicit mulx operand */
+ movq 0(%rdx),%rdx /* rdx = b[0] */
+ movq 0(%rsi),%r9 /* r9..r12 = a[0..3] */
+ movq 8(%rsi),%r10
+ movq 16(%rsi),%r11
+ movq 24(%rsi),%r12
+ leaq -128(%rsi),%rsi /* bias a by -128; later loads address it as N+128(%rsi) */
+ leaq .Lord-128(%rip),%r14 /* r14 = .Lord biased the same way */
+ movq .LordK(%rip),%r15 /* r15 = .LordK; presumably the Montgomery constant for .Lord (confirm against the table) */
+
+
+ mulxq %r9,%r8,%r9 /* r9:r8 = a[0]*b[0] (AT&T mulx: src, lo, hi) */
+ mulxq %r10,%rcx,%r10 /* r10:rcx = a[1]*b[0] */
+ mulxq %r11,%rbp,%r11 /* r11:rbp = a[2]*b[0] */
+ addq %rcx,%r9
+ mulxq %r12,%rcx,%r12 /* r12:rcx = a[3]*b[0] */
+ movq %r8,%rdx
+ mulxq %r15,%rdx,%rax /* rdx = low64(acc0 * .LordK), the reduction multiplier; high half goes to rax and is never read here. mulx leaves flags alone, so the add/adc chain continues */
+ adcq %rbp,%r10
+ adcq %rcx,%r11
+ adcq $0,%r12
+
+
+ xorq %r13,%r13 /* r13 = 0; also clears CF and OF to start the two carry chains */
+ mulxq 0+128(%r14),%rcx,%rbp /* rbp:rcx = multiplier * .Lord[0] */
+ adcxq %rcx,%r8 /* low halves ride the CF chain (adcx) */
+ adoxq %rbp,%r9 /* high halves ride the OF chain (adox) */
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 8(%rbx),%rdx /* rdx = b[1] for the next round (mov does not touch flags) */
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+ adcxq %r8,%r12 /* NOTE(review): r8 is expected to be 0 here (low limb cancelled by the reduction), so this only folds in CF; confirm */
+ adoxq %r8,%r13
+ adcq $0,%r13 /* r13 = new top limb */
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp /* round 2: accumulate a[0..3]*b[1] into r9..r13 */
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r9,%rdx
+ mulxq %r15,%rdx,%rax /* rdx = low64(acc0 * .LordK) for this round */
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ adcxq %r8,%r13
+ adoxq %r8,%r8
+ adcq $0,%r8 /* r8 becomes the new top limb */
+
+
+ mulxq 0+128(%r14),%rcx,%rbp /* reduce: add multiplier * .Lord[0..3] */
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 16(%rbx),%rdx /* rdx = b[2] */
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+ adcxq %r9,%r13
+ adoxq %r9,%r8
+ adcq $0,%r8
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp /* round 3: accumulate a[0..3]*b[2] into r10..r8 */
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r10,%rdx
+ mulxq %r15,%rdx,%rax /* rdx = low64(acc0 * .LordK) for this round */
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ adcxq %r9,%r8
+ adoxq %r9,%r9
+ adcq $0,%r9 /* r9 becomes the new top limb */
+
+
+ mulxq 0+128(%r14),%rcx,%rbp /* reduce: add multiplier * .Lord[0..3] */
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 24(%rbx),%rdx /* rdx = b[3] */
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+ adcxq %r10,%r8
+ adoxq %r10,%r9
+ adcq $0,%r9
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp /* round 4: accumulate a[0..3]*b[3] into r11..r9 */
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r11,%rdx
+ mulxq %r15,%rdx,%rax /* rdx = low64(acc0 * .LordK) for this round */
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+
+ adcxq %r10,%r9
+ adoxq %r10,%r10
+ adcq $0,%r10 /* r10 becomes the new top limb */
+
+
+ mulxq 0+128(%r14),%rcx,%rbp /* final reduce: add multiplier * .Lord[0..3] */
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ leaq 128(%r14),%r14 /* undo the -128 bias: r14 = .Lord (lea preserves flags) */
+ movq %r12,%rbx /* keep pre-subtraction limb 0 */
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ movq %r13,%rdx /* keep pre-subtraction limb 1 */
+ adcxq %r11,%r9
+ adoxq %r11,%r10
+ adcq $0,%r10 /* result limbs are r12,r13,r8,r9 with overflow word r10 */
+
+
+
+ movq %r8,%rcx /* keep pre-subtraction limb 2 */
+ subq 0(%r14),%r12 /* trial subtraction of .Lord[0..3] */
+ sbbq 8(%r14),%r13
+ sbbq 16(%r14),%r8
+ movq %r9,%rbp /* keep pre-subtraction limb 3 */
+ sbbq 24(%r14),%r9
+ sbbq $0,%r10 /* CF set iff the value was smaller than .Lord */
+
+ cmovcq %rbx,%r12 /* on borrow, restore the unsubtracted limbs (branch-free select) */
+ cmovcq %rdx,%r13
+ cmovcq %rcx,%r8
+ cmovcq %rbp,%r9
+
+ movq %r12,0(%rdi) /* store the 4-word result */
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ movq 0(%rsp),%r15 /* reload saved registers in reverse push order */
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbx
+.cfi_restore %rbx
+ movq 40(%rsp),%rbp
+.cfi_restore %rbp
+ leaq 48(%rsp),%rsp /* drop the six saved slots */
+.cfi_adjust_cfa_offset -48
+.Lord_mulx_epilogue:
+ .byte 0xf3,0xc3 /* encodes "rep ret" */
+.cfi_endproc
+.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
+
+.type ecp_nistz256_ord_sqr_montx,@function /* MULX/ADCX/ADOX path: repeated 4x64-bit squaring with reduction by the .Lord words; presumably Montgomery squaring mod the P-256 group order (name-based, confirm) */
+.align 32
+ecp_nistz256_ord_sqr_montx: /* in: rdi = 4-word result, rsi = 4-word operand, rdx = number of squarings (a count of 0 is not guarded by the dec/jnz loop) */
+.cfi_startproc
+.Lecp_nistz256_ord_sqr_montx: /* local entry: ecp_nistz256_ord_sqr_mont jumps here when (OPENSSL_ia32cap_P+8 & 0x80100) == 0x80100 */
+ pushq %rbp /* save the callee-saved registers used below */
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lord_sqrx_body:
+
+ movq %rdx,%rbx /* rbx = loop counter, decremented at the bottom of .Loop_ord_sqrx */
+ movq 0(%rsi),%rdx /* (rdx,r14,r15,r8) = a[0..3] */
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r15
+ movq 24(%rsi),%r8
+ leaq .Lord(%rip),%rsi /* rsi = .Lord table; the input pointer is no longer needed */
+ jmp .Loop_ord_sqrx
+
+.align 32
+.Loop_ord_sqrx: /* loop invariant: current value lives in rdx,r14,r15,r8 */
+ mulxq %r14,%r9,%r10 /* r10:r9 = a[0]*a[1] */
+ mulxq %r15,%rcx,%r11 /* r11:rcx = a[0]*a[2] */
+ movq %rdx,%rax /* rax = a[0], kept for the a[0]^2 term */
+.byte 102,73,15,110,206 /* movq %r14,%xmm1 : stash a[1] */
+ mulxq %r8,%rbp,%r12 /* r12:rbp = a[0]*a[3] */
+ movq %r14,%rdx /* rdx = a[1] */
+ addq %rcx,%r10
+.byte 102,73,15,110,215 /* movq %r15,%xmm2 : stash a[2] */
+ adcq %rbp,%r11
+ adcq $0,%r12
+ xorq %r13,%r13 /* r13 = 0; clears CF and OF for the adcx/adox chains */
+
+ mulxq %r15,%rcx,%rbp /* a[1]*a[2] */
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq %r8,%rcx,%rbp /* a[1]*a[3] */
+ movq %r15,%rdx /* rdx = a[2] */
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+ adcq $0,%r13
+
+ mulxq %r8,%rcx,%r14 /* r14:rcx = a[2]*a[3] */
+ movq %rax,%rdx /* rdx = a[0] */
+.byte 102,73,15,110,216 /* movq %r8,%xmm3 : stash a[3] */
+ xorq %r15,%r15 /* r15 = 0; restart both carry chains */
+ adcxq %r9,%r9 /* CF chain doubles the cross products (r9..r15) */
+ adoxq %rcx,%r13 /* OF chain finishes adding a[2]*a[3] */
+ adcxq %r10,%r10
+ adoxq %r15,%r14
+
+
+ mulxq %rdx,%r8,%rbp /* rbp:r8 = a[0]^2 */
+.byte 102,72,15,126,202 /* movq %xmm1,%rdx : rdx = a[1] */
+ adcxq %r11,%r11
+ adoxq %rbp,%r9
+ adcxq %r12,%r12
+ mulxq %rdx,%rcx,%rax /* rax:rcx = a[1]^2 */
+.byte 102,72,15,126,210 /* movq %xmm2,%rdx : rdx = a[2] */
+ adcxq %r13,%r13
+ adoxq %rcx,%r10
+ adcxq %r14,%r14
+ mulxq %rdx,%rcx,%rbp /* rbp:rcx = a[2]^2 */
+.byte 0x67 /* address-size prefix on the next instruction; presumably code-alignment padding */
+.byte 102,72,15,126,218 /* movq %xmm3,%rdx : rdx = a[3] */
+ adoxq %rax,%r11
+ adcxq %r15,%r15
+ adoxq %rcx,%r12
+ adoxq %rbp,%r13
+ mulxq %rdx,%rcx,%rax /* rax:rcx = a[3]^2 */
+ adoxq %rcx,%r14
+ adoxq %rax,%r15 /* 8-word square now in r8..r15 */
+
+
+ movq %r8,%rdx
+ mulxq 32(%rsi),%rdx,%rcx /* rdx = low64(acc0 * word at .Lord+32); that word is presumably .LordK (confirm); rcx is overwritten below */
+
+ xorq %rax,%rax /* rax = 0; clears CF and OF */
+ mulxq 0(%rsi),%rcx,%rbp /* reduction step 1: add multiplier * .Lord[0..3] */
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ mulxq 8(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+ mulxq 16(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+ mulxq 24(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r8
+ adcxq %rax,%r8 /* fold the CF chain into the recycled limb */
+
+
+ movq %r9,%rdx
+ mulxq 32(%rsi),%rdx,%rcx /* multiplier for reduction step 2 */
+
+ mulxq 0(%rsi),%rcx,%rbp /* step 2 uses the chains with roles swapped (adox for low halves) */
+ adoxq %rcx,%r9
+ adcxq %rbp,%r10
+ mulxq 8(%rsi),%rcx,%rbp
+ adoxq %rcx,%r10
+ adcxq %rbp,%r11
+ mulxq 16(%rsi),%rcx,%rbp
+ adoxq %rcx,%r11
+ adcxq %rbp,%r8
+ mulxq 24(%rsi),%rcx,%rbp
+ adoxq %rcx,%r8
+ adcxq %rbp,%r9
+ adoxq %rax,%r9
+
+
+ movq %r10,%rdx
+ mulxq 32(%rsi),%rdx,%rcx /* multiplier for reduction step 3 */
+
+ mulxq 0(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+ mulxq 8(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r8
+ mulxq 16(%rsi),%rcx,%rbp
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ mulxq 24(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+ adcxq %rax,%r10
+
+
+ movq %r11,%rdx
+ mulxq 32(%rsi),%rdx,%rcx /* multiplier for reduction step 4 */
+
+ mulxq 0(%rsi),%rcx,%rbp
+ adoxq %rcx,%r11
+ adcxq %rbp,%r8
+ mulxq 8(%rsi),%rcx,%rbp
+ adoxq %rcx,%r8
+ adcxq %rbp,%r9
+ mulxq 16(%rsi),%rcx,%rbp
+ adoxq %rcx,%r9
+ adcxq %rbp,%r10
+ mulxq 24(%rsi),%rcx,%rbp
+ adoxq %rcx,%r10
+ adcxq %rbp,%r11
+ adoxq %rax,%r11
+
+
+ addq %r8,%r12 /* add the reduced low half (r8..r11) to the high half (r12..r15) */
+ adcq %r13,%r9
+ movq %r12,%rdx /* keep pre-subtraction limb 0 */
+ adcq %r14,%r10
+ adcq %r15,%r11
+ movq %r9,%r14 /* keep pre-subtraction limb 1 */
+ adcq $0,%rax /* rax = carry out of the sum */
+
+
+ subq 0(%rsi),%r12 /* trial subtraction of .Lord[0..3] */
+ movq %r10,%r15 /* keep pre-subtraction limb 2 */
+ sbbq 8(%rsi),%r9
+ sbbq 16(%rsi),%r10
+ movq %r11,%r8 /* keep pre-subtraction limb 3 */
+ sbbq 24(%rsi),%r11
+ sbbq $0,%rax
+
+ cmovncq %r12,%rdx /* no borrow: take the subtracted limbs; result lands in rdx,r14,r15,r8, ready for the next pass */
+ cmovncq %r9,%r14
+ cmovncq %r10,%r15
+ cmovncq %r11,%r8
+
+ decq %rbx
+ jnz .Loop_ord_sqrx
+
+ movq %rdx,0(%rdi) /* store the 4-word result */
+ movq %r14,8(%rdi)
+ pxor %xmm1,%xmm1 /* clear the XMM copies of the operand limbs */
+ movq %r15,16(%rdi)
+ pxor %xmm2,%xmm2
+ movq %r8,24(%rdi)
+ pxor %xmm3,%xmm3
+
+ movq 0(%rsp),%r15 /* reload saved registers in reverse push order */
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbx
+.cfi_restore %rbx
+ movq 40(%rsp),%rbp
+.cfi_restore %rbp
+ leaq 48(%rsp),%rsp /* drop the six saved slots */
+.cfi_adjust_cfa_offset -48
+.Lord_sqrx_epilogue:
+ .byte 0xf3,0xc3 /* encodes "rep ret" */
+.cfi_endproc
+.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
+
@@ -3413,6 +3877,8 @@ ecp_nistz256_ord_sqr_mont:
.align 32
ecp_nistz256_to_mont:
.cfi_startproc
+ movl $0x80100,%ecx
+ andl OPENSSL_ia32cap_P+8(%rip),%ecx
leaq .LRR(%rip),%rdx
jmp .Lmul_mont
.cfi_endproc
@@ -3429,6 +3895,8 @@ ecp_nistz256_to_mont:
.align 32
ecp_nistz256_mul_mont:
.cfi_startproc
+ movl $0x80100,%ecx
+ andl OPENSSL_ia32cap_P+8(%rip),%ecx
.Lmul_mont:
pushq %rbp
.cfi_adjust_cfa_offset 8
@@ -3449,6 +3917,8 @@ ecp_nistz256_mul_mont:
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lmul_body:
+ cmpl $0x80100,%ecx
+ je .Lmul_montx
movq %rdx,%rbx
movq 0(%rdx),%rax
movq 0(%rsi),%r9
@@ -3457,6 +3927,19 @@ ecp_nistz256_mul_mont:
movq 24(%rsi),%r12
call __ecp_nistz256_mul_montq
+ jmp .Lmul_mont_done
+
+.align 32
+.Lmul_montx:
+ movq %rdx,%rbx
+ movq 0(%rdx),%rdx
+ movq 0(%rsi),%r9
+ movq 8(%rsi),%r10
+ movq 16(%rsi),%r11
+ movq 24(%rsi),%r12
+ leaq -128(%rsi),%rsi
+
+ call __ecp_nistz256_mul_montx
.Lmul_mont_done:
movq 0(%rsp),%r15
.cfi_restore %r15
@@ -3707,6 +4190,8 @@ __ecp_nistz256_mul_montq:
.align 32
ecp_nistz256_sqr_mont:
.cfi_startproc
+ movl $0x80100,%ecx
+ andl OPENSSL_ia32cap_P+8(%rip),%ecx
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -3726,12 +4211,25 @@ ecp_nistz256_sqr_mont:
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lsqr_body:
+ cmpl $0x80100,%ecx
+ je .Lsqr_montx
movq 0(%rsi),%rax
movq 8(%rsi),%r14
movq 16(%rsi),%r15
movq 24(%rsi),%r8
call __ecp_nistz256_sqr_montq
+ jmp .Lsqr_mont_done
+
+.align 32
+.Lsqr_montx:
+ movq 0(%rsi),%rdx
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r15
+ movq 24(%rsi),%r8
+ leaq -128(%rsi),%rsi
+
+ call __ecp_nistz256_sqr_montx
.Lsqr_mont_done:
movq 0(%rsp),%r15
.cfi_restore %r15
@@ -3915,6 +4413,304 @@ __ecp_nistz256_sqr_montq:
.byte 0xf3,0xc3
.cfi_endproc
.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
+.type __ecp_nistz256_mul_montx,@function /* internal helper (register-based, not the C ABI): 4x64-bit multiply with per-round reduction; presumably Montgomery multiplication mod the P-256 prime from .Lpoly (name-based, confirm) */
+.align 32
+__ecp_nistz256_mul_montx: /* in: rdx = b[0], rbx = pointer to b, r9..r12 = a[0..3], rsi = pointer to a minus 128, rdi = result; out: result in r12,r13,r8,r9 and at (%rdi); r14/r15 = .Lpoly+8/.Lpoly+24 on return */
+.cfi_startproc
+
+
+ mulxq %r9,%r8,%r9 /* r9:r8 = a[0]*b[0] (AT&T mulx: src, lo, hi) */
+ mulxq %r10,%rcx,%r10
+ movq $32,%r14 /* r14 = shift count for the shlx/shrx reduction steps */
+ xorq %r13,%r13 /* r13 = 0; clears CF so the first adcq acts as a plain add */
+ mulxq %r11,%rbp,%r11
+ movq .Lpoly+24(%rip),%r15 /* r15 = word at .Lpoly+24, used as a multiplier in each reduction */
+ adcq %rcx,%r9
+ mulxq %r12,%rcx,%r12
+ movq %r8,%rdx /* rdx = acc0, the limb to be eliminated */
+ adcq %rbp,%r10
+ shlxq %r14,%r8,%rbp /* rbp = acc0 << 32 */
+ adcq %rcx,%r11
+ shrxq %r14,%r8,%rcx /* rcx = acc0 >> 32 */
+ adcq $0,%r12
+
+
+
+ addq %rbp,%r9 /* reduction: add acc0<<32 and acc0>>32 into limbs 1 and 2 */
+ adcq %rcx,%r10
+
+ mulxq %r15,%rcx,%rbp /* rbp:rcx = acc0 * [.Lpoly+24] */
+ movq 8(%rbx),%rdx /* rdx = b[1] for the next round */
+ adcq %rcx,%r11
+ adcq %rbp,%r12
+ adcq $0,%r13
+ xorq %r8,%r8 /* r8 = 0; clears CF and OF for the adcx/adox chains */
+
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp /* round 2: accumulate a[0..3]*b[1] */
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r9,%rdx /* rdx = acc0 for this round */
+ adcxq %rcx,%r12
+ shlxq %r14,%r9,%rcx /* shlx/shrx do not modify flags, so the carry chains survive */
+ adoxq %rbp,%r13
+ shrxq %r14,%r9,%rbp
+
+ adcxq %r8,%r13
+ adoxq %r8,%r8
+ adcq $0,%r8 /* r8 = new top limb */
+
+
+
+ addq %rcx,%r10
+ adcq %rbp,%r11
+
+ mulxq %r15,%rcx,%rbp
+ movq 16(%rbx),%rdx /* rdx = b[2] */
+ adcq %rcx,%r12
+ adcq %rbp,%r13
+ adcq $0,%r8
+ xorq %r9,%r9 /* r9 = 0; restart both carry chains */
+
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp /* round 3: accumulate a[0..3]*b[2] */
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r10,%rdx
+ adcxq %rcx,%r13
+ shlxq %r14,%r10,%rcx
+ adoxq %rbp,%r8
+ shrxq %r14,%r10,%rbp
+
+ adcxq %r9,%r8
+ adoxq %r9,%r9
+ adcq $0,%r9 /* r9 = new top limb */
+
+
+
+ addq %rcx,%r11
+ adcq %rbp,%r12
+
+ mulxq %r15,%rcx,%rbp
+ movq 24(%rbx),%rdx /* rdx = b[3] */
+ adcq %rcx,%r13
+ adcq %rbp,%r8
+ adcq $0,%r9
+ xorq %r10,%r10 /* r10 = 0; restart both carry chains */
+
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp /* round 4: accumulate a[0..3]*b[3] */
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r11,%rdx
+ adcxq %rcx,%r8
+ shlxq %r14,%r11,%rcx
+ adoxq %rbp,%r9
+ shrxq %r14,%r11,%rbp
+
+ adcxq %r10,%r9
+ adoxq %r10,%r10
+ adcq $0,%r10 /* r10 = new top limb */
+
+
+
+ addq %rcx,%r12
+ adcq %rbp,%r13
+
+ mulxq %r15,%rcx,%rbp
+ movq %r12,%rbx /* keep pre-subtraction limb 0 (rbx is clobbered) */
+ movq .Lpoly+8(%rip),%r14 /* r14 = word at .Lpoly+8 for the final subtraction */
+ adcq %rcx,%r8
+ movq %r13,%rdx /* keep pre-subtraction limb 1 */
+ adcq %rbp,%r9
+ adcq $0,%r10 /* result limbs are r12,r13,r8,r9 with overflow word r10 */
+
+
+
+ xorl %eax,%eax /* clears CF so the first sbbq acts as a plain subtract */
+ movq %r8,%rcx /* keep pre-subtraction limb 2 */
+ sbbq $-1,%r12 /* trial subtraction of the words (-1, r14, 0, r15) */
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%rbp /* keep pre-subtraction limb 3 */
+ sbbq %r15,%r9
+ sbbq $0,%r10
+
+ cmovcq %rbx,%r12 /* on borrow, restore the unsubtracted limbs (branch-free select) */
+ cmovcq %rdx,%r13
+ movq %r12,0(%rdi) /* store the 4-word result as each limb is settled */
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %rbp,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ .byte 0xf3,0xc3 /* encodes "rep ret" */
+.cfi_endproc
+.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
+
+.type __ecp_nistz256_sqr_montx,@function /* internal helper (register-based, not the C ABI): 4x64-bit squaring followed by four reduction steps; presumably Montgomery squaring mod the P-256 prime from .Lpoly (name-based, confirm) */
+.align 32
+__ecp_nistz256_sqr_montx: /* in: rdx = a[0], r14 = a[1], r15 = a[2], r8 = a[3], rsi = pointer to a minus 128, rdi = result; out: result in r12..r15 and at (%rdi); rsi = word at .Lpoly+8 and rbp = word at .Lpoly+24 on return */
+.cfi_startproc
+ mulxq %r14,%r9,%r10 /* r10:r9 = a[0]*a[1] */
+ mulxq %r15,%rcx,%r11 /* r11:rcx = a[0]*a[2] */
+ xorl %eax,%eax /* clears CF so the first adcq acts as a plain add */
+ adcq %rcx,%r10
+ mulxq %r8,%rbp,%r12 /* r12:rbp = a[0]*a[3] */
+ movq %r14,%rdx /* rdx = a[1] */
+ adcq %rbp,%r11
+ adcq $0,%r12
+ xorq %r13,%r13 /* r13 = 0; clears CF and OF for the adcx/adox chains */
+
+
+ mulxq %r15,%rcx,%rbp /* a[1]*a[2] */
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq %r8,%rcx,%rbp /* a[1]*a[3] */
+ movq %r15,%rdx /* rdx = a[2] */
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+ adcq $0,%r13
+
+
+ mulxq %r8,%rcx,%r14 /* r14:rcx = a[2]*a[3] */
+ movq 0+128(%rsi),%rdx /* reload a[0] through the biased pointer */
+ xorq %r15,%r15 /* r15 = 0; restart both carry chains */
+ adcxq %r9,%r9 /* CF chain doubles the cross products */
+ adoxq %rcx,%r13 /* OF chain finishes adding a[2]*a[3], then the squares */
+ adcxq %r10,%r10
+ adoxq %r15,%r14
+
+ mulxq %rdx,%r8,%rbp /* rbp:r8 = a[0]^2 */
+ movq 8+128(%rsi),%rdx /* rdx = a[1] */
+ adcxq %r11,%r11
+ adoxq %rbp,%r9
+ adcxq %r12,%r12
+ mulxq %rdx,%rcx,%rax /* rax:rcx = a[1]^2 */
+ movq 16+128(%rsi),%rdx /* rdx = a[2] */
+ adcxq %r13,%r13
+ adoxq %rcx,%r10
+ adcxq %r14,%r14
+.byte 0x67 /* address-size prefix on the next instruction; presumably code-alignment padding */
+ mulxq %rdx,%rcx,%rbp /* rbp:rcx = a[2]^2 */
+ movq 24+128(%rsi),%rdx /* rdx = a[3] */
+ adoxq %rax,%r11
+ adcxq %r15,%r15
+ adoxq %rcx,%r12
+ movq $32,%rsi /* rsi is reused as the shift count from here on */
+ adoxq %rbp,%r13
+.byte 0x67,0x67 /* two address-size prefixes; presumably padding */
+ mulxq %rdx,%rcx,%rax /* rax:rcx = a[3]^2 */
+ movq .Lpoly+24(%rip),%rdx /* rdx = word at .Lpoly+24, the mulx multiplier for the reductions */
+ adoxq %rcx,%r14
+ shlxq %rsi,%r8,%rcx /* rcx = acc0 << 32 (shlx/shrx leave flags alone) */
+ adoxq %rax,%r15 /* 8-word square now in r8..r15 */
+ shrxq %rsi,%r8,%rax /* rax = acc0 >> 32 */
+ movq %rdx,%rbp /* rbp = word at .Lpoly+24, kept for the final subtraction */
+
+
+ addq %rcx,%r9 /* reduction step 1: eliminate r8 */
+ adcq %rax,%r10
+
+ mulxq %r8,%rcx,%r8 /* r8:rcx = acc0 * [.Lpoly+24]; r8 is recycled as a higher limb */
+ adcq %rcx,%r11
+ shlxq %rsi,%r9,%rcx
+ adcq $0,%r8
+ shrxq %rsi,%r9,%rax
+
+
+ addq %rcx,%r10 /* reduction step 2: eliminate r9 */
+ adcq %rax,%r11
+
+ mulxq %r9,%rcx,%r9
+ adcq %rcx,%r8
+ shlxq %rsi,%r10,%rcx
+ adcq $0,%r9
+ shrxq %rsi,%r10,%rax
+
+
+ addq %rcx,%r11 /* reduction step 3: eliminate r10 */
+ adcq %rax,%r8
+
+ mulxq %r10,%rcx,%r10
+ adcq %rcx,%r9
+ shlxq %rsi,%r11,%rcx
+ adcq $0,%r10
+ shrxq %rsi,%r11,%rax
+
+
+ addq %rcx,%r8 /* reduction step 4: eliminate r11 */
+ adcq %rax,%r9
+
+ mulxq %r11,%rcx,%r11
+ adcq %rcx,%r10
+ adcq $0,%r11
+
+ xorq %rdx,%rdx /* rdx = 0, will collect the carry of the sum below */
+ addq %r8,%r12 /* add the reduced low half (r8..r11) into the high half (r12..r15) */
+ movq .Lpoly+8(%rip),%rsi /* rsi = word at .Lpoly+8 for the final subtraction */
+ adcq %r9,%r13
+ movq %r12,%r8 /* keep pre-subtraction limb 0 */
+ adcq %r10,%r14
+ adcq %r11,%r15
+ movq %r13,%r9 /* keep pre-subtraction limb 1 */
+ adcq $0,%rdx
+
+ subq $-1,%r12 /* trial subtraction of the words (-1, rsi, 0, rbp) */
+ movq %r14,%r10 /* keep pre-subtraction limb 2 */
+ sbbq %rsi,%r13
+ sbbq $0,%r14
+ movq %r15,%r11 /* keep pre-subtraction limb 3 */
+ sbbq %rbp,%r15
+ sbbq $0,%rdx
+
+ cmovcq %r8,%r12 /* on borrow, restore the unsubtracted limbs (branch-free select) */
+ cmovcq %r9,%r13
+ movq %r12,0(%rdi) /* store the 4-word result as each limb is settled */
+ cmovcq %r10,%r14
+ movq %r13,8(%rdi)
+ cmovcq %r11,%r15
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+
+ .byte 0xf3,0xc3 /* encodes "rep ret" */
+.cfi_endproc
+.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
@@ -4056,6 +4852,9 @@ ecp_nistz256_scatter_w5:
.align 32
ecp_nistz256_gather_w5:
.cfi_startproc
+ movl OPENSSL_ia32cap_P+8(%rip),%eax
+ testl $32,%eax
+ jnz .Lavx2_gather_w5
movdqa .LOne(%rip),%xmm0
movd %edx,%xmm1
@@ -4139,6 +4938,9 @@ ecp_nistz256_scatter_w7:
.align 32
ecp_nistz256_gather_w7:
.cfi_startproc
+ movl OPENSSL_ia32cap_P+8(%rip),%eax
+ testl $32,%eax
+ jnz .Lavx2_gather_w7
movdqa .LOne(%rip),%xmm8
movd %edx,%xmm1
@@ -4182,14 +4984,148 @@ ecp_nistz256_gather_w7:
.cfi_endproc
.LSEH_end_ecp_nistz256_gather_w7:
.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
+
+
+.type ecp_nistz256_avx2_gather_w5,@function /* AVX2 table select: scans 16 entries of 96 bytes at (%rsi) and writes the one whose 1-based position equals edx to (%rdi) */
+.align 32
+ecp_nistz256_avx2_gather_w5: /* in: rdi = 96-byte output, rsi = table (vmovdqa loads require 32-byte alignment), edx = index; an index matching no entry yields all zeros */
+.cfi_startproc
+.Lavx2_gather_w5: /* local entry: ecp_nistz256_gather_w5 jumps here when bit 5 (value 32) of OPENSSL_ia32cap_P+8 is set */
+ vzeroupper
+ vmovdqa .LTwo(%rip),%ymm0 /* ymm0 = per-iteration counter step; .LTwo presumably holds dwords of 2 (confirm) */
+
+ vpxor %ymm2,%ymm2,%ymm2 /* ymm2..ymm4 = 0: the 96-byte accumulator */
+ vpxor %ymm3,%ymm3,%ymm3
+ vpxor %ymm4,%ymm4,%ymm4
+
+ vmovdqa .LOne(%rip),%ymm5 /* ymm5 = position counter for the first entry of each pair */
+ vmovdqa .LTwo(%rip),%ymm10 /* ymm10 = position counter for the second entry of each pair */
+
+ vmovd %edx,%xmm1
+ vpermd %ymm1,%ymm2,%ymm1 /* all-zero index vector (ymm2) broadcasts dword 0: ymm1 = index in every lane */
+
+ movq $8,%rax /* 8 iterations x 2 entries = 16 entries */
+.Lselect_loop_avx2_w5:
+
+ vmovdqa 0(%rsi),%ymm6 /* first entry of the pair (96 bytes) */
+ vmovdqa 32(%rsi),%ymm7
+ vmovdqa 64(%rsi),%ymm8
+
+ vmovdqa 96(%rsi),%ymm11 /* second entry of the pair */
+ vmovdqa 128(%rsi),%ymm12
+ vmovdqa 160(%rsi),%ymm13
+
+ vpcmpeqd %ymm1,%ymm5,%ymm9 /* all-ones mask where counter == index, else zero */
+ vpcmpeqd %ymm1,%ymm10,%ymm14
+
+ vpaddd %ymm0,%ymm5,%ymm5 /* advance both counters by the step */
+ vpaddd %ymm0,%ymm10,%ymm10
+ leaq 192(%rsi),%rsi /* next pair of entries */
+
+ vpand %ymm9,%ymm6,%ymm6 /* keep an entry only under its mask */
+ vpand %ymm9,%ymm7,%ymm7
+ vpand %ymm9,%ymm8,%ymm8
+ vpand %ymm14,%ymm11,%ymm11
+ vpand %ymm14,%ymm12,%ymm12
+ vpand %ymm14,%ymm13,%ymm13
+
+ vpxor %ymm6,%ymm2,%ymm2 /* fold into the accumulator; every entry is loaded whatever the index */
+ vpxor %ymm7,%ymm3,%ymm3
+ vpxor %ymm8,%ymm4,%ymm4
+ vpxor %ymm11,%ymm2,%ymm2
+ vpxor %ymm12,%ymm3,%ymm3
+ vpxor %ymm13,%ymm4,%ymm4
+
+ decq %rax
+ jnz .Lselect_loop_avx2_w5
+
+ vmovdqu %ymm2,0(%rdi) /* unaligned stores: no alignment required of the output */
+ vmovdqu %ymm3,32(%rdi)
+ vmovdqu %ymm4,64(%rdi)
+ vzeroupper /* clear upper YMM halves before returning to non-AVX code */
+ .byte 0xf3,0xc3 /* encodes "rep ret" */
+.cfi_endproc
+.LSEH_end_ecp_nistz256_avx2_gather_w5:
+.size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5
+
+
+
.globl ecp_nistz256_avx2_gather_w7
.type ecp_nistz256_avx2_gather_w7,@function
.align 32
ecp_nistz256_avx2_gather_w7:
.cfi_startproc
-.byte 0x0f,0x0b
+.Lavx2_gather_w7: /* AVX2 table select: scans 64 entries of 64 bytes at (%rsi) and writes the one whose 1-based position equals edx to (%rdi); also reached from ecp_nistz256_gather_w7 when bit 5 (value 32) of OPENSSL_ia32cap_P+8 is set */
+ vzeroupper
+ vmovdqa .LThree(%rip),%ymm0 /* ymm0 = per-iteration counter step; .LThree presumably holds dwords of 3 (confirm) */
+
+ vpxor %ymm2,%ymm2,%ymm2 /* ymm2,ymm3 = 0: the 64-byte accumulator */
+ vpxor %ymm3,%ymm3,%ymm3
+
+ vmovdqa .LOne(%rip),%ymm4 /* ymm4, ymm8, ymm12 = position counters for the three entries handled per iteration */
+ vmovdqa .LTwo(%rip),%ymm8
+ vmovdqa .LThree(%rip),%ymm12
+
+ vmovd %edx,%xmm1
+ vpermd %ymm1,%ymm2,%ymm1 /* all-zero index vector (ymm2) broadcasts dword 0: ymm1 = index in every lane */
+
+
+ movq $21,%rax /* 21 iterations x 3 entries = 63 entries; the 64th is handled after the loop */
+.Lselect_loop_avx2_w7:
+
+ vmovdqa 0(%rsi),%ymm5 /* entry 1 of 3 (64 bytes); vmovdqa requires a 32-byte aligned table */
+ vmovdqa 32(%rsi),%ymm6
+
+ vmovdqa 64(%rsi),%ymm9 /* entry 2 of 3 */
+ vmovdqa 96(%rsi),%ymm10
+
+ vmovdqa 128(%rsi),%ymm13 /* entry 3 of 3 */
+ vmovdqa 160(%rsi),%ymm14
+
+ vpcmpeqd %ymm1,%ymm4,%ymm7 /* all-ones mask where counter == index, else zero */
+ vpcmpeqd %ymm1,%ymm8,%ymm11
+ vpcmpeqd %ymm1,%ymm12,%ymm15
+
+ vpaddd %ymm0,%ymm4,%ymm4 /* advance all three counters by the step */
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpaddd %ymm0,%ymm12,%ymm12
+ leaq 192(%rsi),%rsi /* next three entries */
+
+ vpand %ymm7,%ymm5,%ymm5 /* keep an entry only under its mask */
+ vpand %ymm7,%ymm6,%ymm6
+ vpand %ymm11,%ymm9,%ymm9
+ vpand %ymm11,%ymm10,%ymm10
+ vpand %ymm15,%ymm13,%ymm13
+ vpand %ymm15,%ymm14,%ymm14
+
+ vpxor %ymm5,%ymm2,%ymm2 /* fold into the accumulator; every entry is loaded whatever the index */
+ vpxor %ymm6,%ymm3,%ymm3
+ vpxor %ymm9,%ymm2,%ymm2
+ vpxor %ymm10,%ymm3,%ymm3
+ vpxor %ymm13,%ymm2,%ymm2
+ vpxor %ymm14,%ymm3,%ymm3
+
+ decq %rax
+ jnz .Lselect_loop_avx2_w7
+
+
+ vmovdqa 0(%rsi),%ymm5 /* final (64th) entry, compared against the first counter */
+ vmovdqa 32(%rsi),%ymm6
+
+ vpcmpeqd %ymm1,%ymm4,%ymm7
+
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+
+ vpxor %ymm5,%ymm2,%ymm2
+ vpxor %ymm6,%ymm3,%ymm3
+
+ vmovdqu %ymm2,0(%rdi) /* unaligned stores: no alignment required of the output */
+ vmovdqu %ymm3,32(%rdi)
+ vzeroupper /* clear upper YMM halves before returning to non-AVX code */
.byte 0xf3,0xc3
.cfi_endproc
+.LSEH_end_ecp_nistz256_avx2_gather_w7:
.size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
.type __ecp_nistz256_add_toq,@function
.align 32
@@ -4325,6 +5261,10 @@ __ecp_nistz256_mul_by_2q:
.align 32
ecp_nistz256_point_double:
.cfi_startproc
+ movl $0x80100,%ecx
+ andl OPENSSL_ia32cap_P+8(%rip),%ecx
+ cmpl $0x80100,%ecx
+ je .Lpoint_doublex
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -4553,6 +5493,10 @@ ecp_nistz256_point_double:
.align 32
ecp_nistz256_point_add:
.cfi_startproc
+ movl $0x80100,%ecx
+ andl OPENSSL_ia32cap_P+8(%rip),%ecx
+ cmpl $0x80100,%ecx
+ je .Lpoint_addx
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -4967,6 +5911,10 @@ ecp_nistz256_point_add:
.align 32
ecp_nistz256_point_add_affine:
.cfi_startproc
+ movl $0x80100,%ecx
+ andl OPENSSL_ia32cap_P+8(%rip),%ecx
+ cmpl $0x80100,%ecx
+ je .Lpoint_add_affinex
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-16
@@ -5290,3 +6238,1108 @@ ecp_nistz256_point_add_affine:
.byte 0xf3,0xc3
.cfi_endproc
.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
+.type __ecp_nistz256_add_tox,@function /* internal helper (register-based): 4-word add followed by a conditional subtraction of the modulus words (-1, r14, 0, r15) */
+.align 32
+__ecp_nistz256_add_tox: /* in: (r12,r13,r8,r9) = a, rbx = pointer to b, r14/r15 = modulus words 1 and 3 (callers in this file load them from .Lpoly+8/.Lpoly+24), rdi = result; out: result in r12,r13,r8,r9 and at (%rdi) */
+.cfi_startproc
+ xorq %r11,%r11 /* r11 = 0 and CF = 0, so the first adcq is a plain add */
+ adcq 0(%rbx),%r12 /* a += four words at (%rbx) */
+ adcq 8(%rbx),%r13
+ movq %r12,%rax /* keep pre-subtraction limb 0 */
+ adcq 16(%rbx),%r8
+ adcq 24(%rbx),%r9
+ movq %r13,%rbp /* keep pre-subtraction limb 1 */
+ adcq $0,%r11 /* r11 = carry out of the addition */
+
+ xorq %r10,%r10 /* clears CF; r10 itself is overwritten below */
+ sbbq $-1,%r12 /* trial subtraction of (-1, r14, 0, r15) */
+ movq %r8,%rcx /* keep pre-subtraction limb 2 */
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10 /* keep pre-subtraction limb 3 */
+ sbbq %r15,%r9
+ sbbq $0,%r11 /* CF set iff the 5-word sum was smaller than the modulus */
+
+ cmovcq %rax,%r12 /* on borrow, restore the plain sum (branch-free select) */
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ .byte 0xf3,0xc3 /* encodes "rep ret" */
+.cfi_endproc
+.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
+
+.type __ecp_nistz256_sub_fromx,@function /* internal helper (register-based): 4-word subtract, adding the modulus words (-1, r14, 0, r15) back when the subtraction borrows */
+.align 32
+__ecp_nistz256_sub_fromx: /* in: (r12,r13,r8,r9) = a, rbx = pointer to b, r14/r15 = modulus words 1 and 3, rdi = result; out: a - b in r12,r13,r8,r9 and at (%rdi) */
+.cfi_startproc
+ xorq %r11,%r11 /* r11 = 0 and CF = 0, so the first sbbq is a plain subtract */
+ sbbq 0(%rbx),%r12 /* a -= four words at (%rbx) */
+ sbbq 8(%rbx),%r13
+ movq %r12,%rax /* keep the plain difference, limb 0 */
+ sbbq 16(%rbx),%r8
+ sbbq 24(%rbx),%r9
+ movq %r13,%rbp /* keep the plain difference, limb 1 */
+ sbbq $0,%r11 /* r11 = 0 - borrow: all ones iff the subtraction borrowed */
+
+ xorq %r10,%r10 /* clears CF; r10 itself is overwritten below */
+ adcq $-1,%r12 /* unconditionally add the modulus words (-1, r14, 0, r15) */
+ movq %r8,%rcx /* keep the plain difference, limb 2 */
+ adcq %r14,%r13
+ adcq $0,%r8
+ movq %r9,%r10 /* keep the plain difference, limb 3 */
+ adcq %r15,%r9
+
+ btq $0,%r11 /* CF = bit 0 of r11, i.e. the borrow recorded above */
+ cmovncq %rax,%r12 /* no borrow: discard the correction and keep the plain difference */
+ cmovncq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovncq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovncq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ .byte 0xf3,0xc3 /* encodes "rep ret" */
+.cfi_endproc
+.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
+
+.type __ecp_nistz256_subx,@function /* internal helper (register-only, no memory store): computes (rax,rbp,rcx,r10) - (r12,r13,r8,r9), adding the modulus words (-1, r14, 0, r15) back on borrow */
+.align 32
+__ecp_nistz256_subx: /* in: minuend in rax,rbp,rcx,r10; subtrahend in r12,r13,r8,r9; r14/r15 = modulus words 1 and 3; out: result in r12,r13,r8,r9 */
+.cfi_startproc
+ xorq %r11,%r11 /* r11 = 0 and CF = 0, so the first sbbq is a plain subtract */
+ sbbq %r12,%rax
+ sbbq %r13,%rbp
+ movq %rax,%r12 /* r12 = plain difference, limb 0 */
+ sbbq %r8,%rcx
+ sbbq %r9,%r10
+ movq %rbp,%r13 /* r13 = plain difference, limb 1 */
+ sbbq $0,%r11 /* r11 = 0 - borrow: all ones iff the subtraction borrowed */
+
+ xorq %r9,%r9 /* clears CF; r9 itself is overwritten below */
+ adcq $-1,%rax /* unconditionally add the modulus words (-1, r14, 0, r15) */
+ movq %rcx,%r8 /* r8 = plain difference, limb 2 */
+ adcq %r14,%rbp
+ adcq $0,%rcx
+ movq %r10,%r9 /* r9 = plain difference, limb 3 */
+ adcq %r15,%r10
+
+ btq $0,%r11 /* CF = bit 0 of r11, i.e. the borrow recorded above */
+ cmovcq %rax,%r12 /* borrow: take the modulus-corrected limbs instead */
+ cmovcq %rbp,%r13
+ cmovcq %rcx,%r8
+ cmovcq %r10,%r9
+
+ .byte 0xf3,0xc3 /* encodes "rep ret" */
+.cfi_endproc
+.size __ecp_nistz256_subx,.-__ecp_nistz256_subx
+
+.type __ecp_nistz256_mul_by_2x,@function /* internal helper (register-based): doubles a 4-word value, then conditionally subtracts the modulus words (-1, r14, 0, r15) */
+.align 32
+__ecp_nistz256_mul_by_2x: /* in: (r12,r13,r8,r9) = a, r14/r15 = modulus words 1 and 3, rdi = result; out: result in r12,r13,r8,r9 and at (%rdi) */
+.cfi_startproc
+ xorq %r11,%r11 /* r11 = 0 and CF = 0, so the first adcq is a plain add */
+ adcq %r12,%r12 /* a += a, carrying limb to limb */
+ adcq %r13,%r13
+ movq %r12,%rax /* keep pre-subtraction limb 0 */
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp /* keep pre-subtraction limb 1 */
+ adcq $0,%r11 /* r11 = carry out of the doubling */
+
+ xorq %r10,%r10 /* clears CF; r10 itself is overwritten below */
+ sbbq $-1,%r12 /* trial subtraction of (-1, r14, 0, r15) */
+ movq %r8,%rcx /* keep pre-subtraction limb 2 */
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10 /* keep pre-subtraction limb 3 */
+ sbbq %r15,%r9
+ sbbq $0,%r11 /* CF set iff the 5-word value was smaller than the modulus */
+
+ cmovcq %rax,%r12 /* on borrow, restore the plain doubled value (branch-free select) */
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ .byte 0xf3,0xc3 /* encodes "rep ret" */
+.cfi_endproc
+.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
+.type ecp_nistz256_point_doublex,@function /* MULX/ADX path of point doubling: reads three 32-byte coordinates at rsi (+0,+32,+64) and writes three at rdi (+0,+32,+64), using the *_x field helpers and 160 bytes of stack scratch */
+.align 32
+ecp_nistz256_point_doublex: /* in: rdi = output point, rsi = input point */
+.cfi_startproc
+.Lpoint_doublex: /* local entry: ecp_nistz256_point_double jumps here when (OPENSSL_ia32cap_P+8 & 0x80100) == 0x80100 */
+ pushq %rbp /* save the callee-saved registers used below */
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $160+8,%rsp /* five 32-byte temporaries at 0,32,64,96,128(%rsp); with ABI entry alignment the +8 leaves rsp 16-byte aligned for the movdqa stores */
+.cfi_adjust_cfa_offset 32*5+8
+.Lpoint_doublex_body:
+
+.Lpoint_double_shortcutx: /* also entered from .Ladd_doublex in ecp_nistz256_point_addx after it shrinks its frame */
+ movdqu 0(%rsi),%xmm0 /* xmm0:xmm1 = input words at +0..+31 */
+ movq %rsi,%rbx /* rbx = input pointer (rsi gets repurposed by the helpers) */
+ movdqu 16(%rsi),%xmm1
+ movq 32+0(%rsi),%r12 /* (r12,r13,r8,r9) = input words at +32..+63 */
+ movq 32+8(%rsi),%r13
+ movq 32+16(%rsi),%r8
+ movq 32+24(%rsi),%r9
+ movq .Lpoly+8(%rip),%r14 /* r14/r15 = modulus words expected by the add/sub/mul_by_2 helpers */
+ movq .Lpoly+24(%rip),%r15
+ movdqa %xmm0,96(%rsp) /* 96(%rsp) = copy of input +0..+31 */
+ movdqa %xmm1,96+16(%rsp)
+ leaq 32(%rdi),%r10
+ leaq 64(%rdi),%r11
+.byte 102,72,15,110,199 /* movq %rdi,%xmm0 : stash output+0 */
+.byte 102,73,15,110,202 /* movq %r10,%xmm1 : stash output+32 */
+.byte 102,73,15,110,211 /* movq %r11,%xmm2 : stash output+64 */
+
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2x /* 0(%rsp) = 2 * input[+32] */
+
+ movq 64+0(%rsi),%rdx /* load input[+64] in the register layout sqr_montx expects */
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ leaq 64-128(%rsi),%rsi /* sqr_montx re-reads its operand at N+128(%rsi) */
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx /* 64(%rsp) = input[+64]^2 */
+
+ movq 0+0(%rsp),%rdx
+ movq 8+0(%rsp),%r14
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx /* 0(%rsp) = (0(%rsp))^2 */
+
+ movq 32(%rbx),%rdx /* rdx = first word of input[+32] (operand b) */
+ movq 64+0(%rbx),%r9 /* r9..r12 = input[+64] (operand a) */
+ movq 64+8(%rbx),%r10
+ movq 64+16(%rbx),%r11
+ movq 64+24(%rbx),%r12
+ leaq 64-128(%rbx),%rsi
+ leaq 32(%rbx),%rbx /* rbx = pointer to operand b for mul_montx */
+.byte 102,72,15,126,215 /* movq %xmm2,%rdi : rdi = output+64 */
+ call __ecp_nistz256_mul_montx /* output[+64] = input[+64] * input[+32] */
+ call __ecp_nistz256_mul_by_2x /* output[+64] = 2 * that product (result registers and rdi carry over) */
+
+ movq 96+0(%rsp),%r12
+ movq 96+8(%rsp),%r13
+ leaq 64(%rsp),%rbx
+ movq 96+16(%rsp),%r8
+ movq 96+24(%rsp),%r9
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_add_tox /* 32(%rsp) = 96(%rsp) + 64(%rsp) */
+
+ movq 96+0(%rsp),%r12
+ movq 96+8(%rsp),%r13
+ leaq 64(%rsp),%rbx
+ movq 96+16(%rsp),%r8
+ movq 96+24(%rsp),%r9
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx /* 64(%rsp) = 96(%rsp) - 64(%rsp) */
+
+ movq 0+0(%rsp),%rdx
+ movq 8+0(%rsp),%r14
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+.byte 102,72,15,126,207 /* movq %xmm1,%rdi : rdi = output+32 */
+ call __ecp_nistz256_sqr_montx /* (r12..r15) = (0(%rsp))^2; sqr_montx leaves rsi = [.Lpoly+8], rbp = [.Lpoly+24] */
+ xorq %r9,%r9 /* halve mod p: r9 collects the carry of value + p */
+ movq %r12,%rax /* keep the original limbs in rax,r10,rcx,r8 */
+ addq $-1,%r12 /* r12..r15 += modulus words (-1, rsi, 0, rbp) */
+ movq %r13,%r10
+ adcq %rsi,%r13
+ movq %r14,%rcx
+ adcq $0,%r14
+ movq %r15,%r8
+ adcq %rbp,%r15
+ adcq $0,%r9
+ xorq %rsi,%rsi /* rsi = 0, the carry word to use when no modulus was added */
+ testq $1,%rax /* ZF set iff the original value was even */
+
+ cmovzq %rax,%r12 /* even: keep the original value; odd: keep value + modulus */
+ cmovzq %r10,%r13
+ cmovzq %rcx,%r14
+ cmovzq %r8,%r15
+ cmovzq %rsi,%r9
+
+ movq %r13,%rax /* 257-bit right shift by one across r9:r15:r14:r13:r12 */
+ shrq $1,%r12
+ shlq $63,%rax
+ movq %r14,%r10
+ shrq $1,%r13
+ orq %rax,%r12
+ shlq $63,%r10
+ movq %r15,%rcx
+ shrq $1,%r14
+ orq %r10,%r13
+ shlq $63,%rcx
+ movq %r12,0(%rdi) /* output[+32] = halved value */
+ shrq $1,%r15
+ movq %r13,8(%rdi)
+ shlq $63,%r9
+ orq %rcx,%r14
+ orq %r9,%r15
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ movq 64(%rsp),%rdx
+ leaq 64(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx /* 32(%rsp) = 32(%rsp) * 64(%rsp); mul_montx reloads r14/r15 with the modulus words */
+
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2x /* 128(%rsp) = 2 * product (still in r12,r13,r8,r9) */
+
+ leaq 32(%rsp),%rbx
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_add_tox /* 32(%rsp) = 2*product + product = 3 * product */
+
+ movq 96(%rsp),%rdx
+ leaq 96(%rsp),%rbx
+ movq 0+0(%rsp),%r9
+ movq 8+0(%rsp),%r10
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r11
+ movq 24+0(%rsp),%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx /* 0(%rsp) = 0(%rsp) * 96(%rsp) */
+
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2x /* 128(%rsp) = 2 * 0(%rsp) */
+
+ movq 0+32(%rsp),%rdx
+ movq 8+32(%rsp),%r14
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r15
+ movq 24+32(%rsp),%r8
+.byte 102,72,15,126,199 /* movq %xmm0,%rdi : rdi = output+0 */
+ call __ecp_nistz256_sqr_montx /* (r12..r15) = (32(%rsp))^2, also stored at output[+0] */
+
+ leaq 128(%rsp),%rbx
+ movq %r14,%r8 /* move the square into the (r12,r13,r8,r9) layout of sub_fromx */
+ movq %r15,%r9
+ movq %rsi,%r14 /* restore r14/r15 = modulus words left in rsi/rbp by sqr_montx */
+ movq %rbp,%r15
+ call __ecp_nistz256_sub_fromx /* output[+0] = square - 128(%rsp) */
+
+ movq 0+0(%rsp),%rax /* minuend for subx = 0(%rsp) */
+ movq 0+8(%rsp),%rbp
+ movq 0+16(%rsp),%rcx
+ movq 0+24(%rsp),%r10
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_subx /* (r12,r13,r8,r9) = 0(%rsp) - output[+0]; subx does not store */
+
+ movq 32(%rsp),%rdx
+ leaq 32(%rsp),%rbx
+ movq %r12,%r14 /* shuffle the difference into mul_montx operand registers r9..r12 while spilling it to 0(%rsp) */
+ xorl %ecx,%ecx /* sets ZF, so the two cmovz below always move */
+ movq %r12,0+0(%rsp)
+ movq %r13,%r10
+ movq %r13,0+8(%rsp)
+ cmovzq %r8,%r11
+ movq %r8,0+16(%rsp)
+ leaq 0-128(%rsp),%rsi
+ cmovzq %r9,%r12
+ movq %r9,0+24(%rsp)
+ movq %r14,%r9
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx /* 0(%rsp) = difference * 32(%rsp) */
+
+.byte 102,72,15,126,203 /* movq %xmm1,%rbx : rbx = output+32 */
+.byte 102,72,15,126,207 /* movq %xmm1,%rdi : rdi = output+32 */
+ call __ecp_nistz256_sub_fromx /* output[+32] = product - output[+32] */
+
+ leaq 160+56(%rsp),%rsi /* rsi = address of the return-address slot, just above the six saved registers */
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp /* release scratch area and saved slots in one step */
+.cfi_def_cfa_register %rsp
+.Lpoint_doublex_epilogue:
+ .byte 0xf3,0xc3 /* encodes "rep ret" */
+.cfi_endproc
+.size ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex
+.type ecp_nistz256_point_addx,@function
+.align 32
+ecp_nistz256_point_addx:
+.cfi_startproc
+.Lpoint_addx:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $576+8,%rsp
+.cfi_adjust_cfa_offset 32*18+8
+.Lpoint_addx_body:
+
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqu 64(%rsi),%xmm4
+ movdqu 80(%rsi),%xmm5
+ movq %rsi,%rbx
+ movq %rdx,%rsi
+ movdqa %xmm0,384(%rsp)
+ movdqa %xmm1,384+16(%rsp)
+ movdqa %xmm2,416(%rsp)
+ movdqa %xmm3,416+16(%rsp)
+ movdqa %xmm4,448(%rsp)
+ movdqa %xmm5,448+16(%rsp)
+ por %xmm4,%xmm5
+
+ movdqu 0(%rsi),%xmm0
+ pshufd $0xb1,%xmm5,%xmm3
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ por %xmm3,%xmm5
+ movdqu 48(%rsi),%xmm3
+ movq 64+0(%rsi),%rdx
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ movdqa %xmm0,480(%rsp)
+ pshufd $0x1e,%xmm5,%xmm4
+ movdqa %xmm1,480+16(%rsp)
+ movdqu 64(%rsi),%xmm0
+ movdqu 80(%rsi),%xmm1
+ movdqa %xmm2,512(%rsp)
+ movdqa %xmm3,512+16(%rsp)
+ por %xmm4,%xmm5
+ pxor %xmm4,%xmm4
+ por %xmm0,%xmm1
+.byte 102,72,15,110,199
+
+ leaq 64-128(%rsi),%rsi
+ movq %rdx,544+0(%rsp)
+ movq %r14,544+8(%rsp)
+ movq %r15,544+16(%rsp)
+ movq %r8,544+24(%rsp)
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ pcmpeqd %xmm4,%xmm5
+ pshufd $0xb1,%xmm1,%xmm4
+ por %xmm1,%xmm4
+ pshufd $0,%xmm5,%xmm5
+ pshufd $0x1e,%xmm4,%xmm3
+ por %xmm3,%xmm4
+ pxor %xmm3,%xmm3
+ pcmpeqd %xmm3,%xmm4
+ pshufd $0,%xmm4,%xmm4
+ movq 64+0(%rbx),%rdx
+ movq 64+8(%rbx),%r14
+ movq 64+16(%rbx),%r15
+ movq 64+24(%rbx),%r8
+.byte 102,72,15,110,203
+
+ leaq 64-128(%rbx),%rsi
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 544(%rsp),%rdx
+ leaq 544(%rsp),%rbx
+ movq 0+96(%rsp),%r9
+ movq 8+96(%rsp),%r10
+ leaq -128+96(%rsp),%rsi
+ movq 16+96(%rsp),%r11
+ movq 24+96(%rsp),%r12
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 448(%rsp),%rdx
+ leaq 448(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 416(%rsp),%rdx
+ leaq 416(%rsp),%rbx
+ movq 0+224(%rsp),%r9
+ movq 8+224(%rsp),%r10
+ leaq -128+224(%rsp),%rsi
+ movq 16+224(%rsp),%r11
+ movq 24+224(%rsp),%r12
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 512(%rsp),%rdx
+ leaq 512(%rsp),%rbx
+ movq 0+256(%rsp),%r9
+ movq 8+256(%rsp),%r10
+ leaq -128+256(%rsp),%rsi
+ movq 16+256(%rsp),%r11
+ movq 24+256(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 224(%rsp),%rbx
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ orq %r13,%r12
+ movdqa %xmm4,%xmm2
+ orq %r8,%r12
+ orq %r9,%r12
+ por %xmm5,%xmm2
+.byte 102,73,15,110,220
+
+ movq 384(%rsp),%rdx
+ leaq 384(%rsp),%rbx
+ movq 0+96(%rsp),%r9
+ movq 8+96(%rsp),%r10
+ leaq -128+96(%rsp),%rsi
+ movq 16+96(%rsp),%r11
+ movq 24+96(%rsp),%r12
+ leaq 160(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 480(%rsp),%rdx
+ leaq 480(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 160(%rsp),%rbx
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ orq %r13,%r12
+ orq %r8,%r12
+ orq %r9,%r12
+
+.byte 102,73,15,126,208
+.byte 102,73,15,126,217
+
+ orq %r8,%r12
+ orq %r9,%r12
+
+
+.byte 0x3e
+ jnz .Ladd_proceedx
+
+.Ladd_doublex:
+.byte 102,72,15,126,206
+.byte 102,72,15,126,199
+ addq $416,%rsp
+.cfi_adjust_cfa_offset -416
+ jmp .Lpoint_double_shortcutx
+.cfi_adjust_cfa_offset 416
+
+.align 32
+.Ladd_proceedx:
+ movq 0+64(%rsp),%rdx
+ movq 8+64(%rsp),%r14
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r15
+ movq 24+64(%rsp),%r8
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 448(%rsp),%rdx
+ leaq 448(%rsp),%rbx
+ movq 0+0(%rsp),%r9
+ movq 8+0(%rsp),%r10
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r11
+ movq 24+0(%rsp),%r12
+ leaq 352(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 0+0(%rsp),%rdx
+ movq 8+0(%rsp),%r14
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 544(%rsp),%rdx
+ leaq 544(%rsp),%rbx
+ movq 0+352(%rsp),%r9
+ movq 8+352(%rsp),%r10
+ leaq -128+352(%rsp),%rsi
+ movq 16+352(%rsp),%r11
+ movq 24+352(%rsp),%r12
+ leaq 352(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 0(%rsp),%rdx
+ leaq 0(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 160(%rsp),%rdx
+ leaq 160(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+
+
+
+ xorq %r11,%r11
+ addq %r12,%r12
+ leaq 96(%rsp),%rsi
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ movq 0(%rsi),%rax
+ cmovcq %rbp,%r13
+ movq 8(%rsi),%rbp
+ cmovcq %rcx,%r8
+ movq 16(%rsi),%rcx
+ cmovcq %r10,%r9
+ movq 24(%rsi),%r10
+
+ call __ecp_nistz256_subx
+
+ leaq 128(%rsp),%rbx
+ leaq 288(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 192+0(%rsp),%rax
+ movq 192+8(%rsp),%rbp
+ movq 192+16(%rsp),%rcx
+ movq 192+24(%rsp),%r10
+ leaq 320(%rsp),%rdi
+
+ call __ecp_nistz256_subx
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+ movq 128(%rsp),%rdx
+ leaq 128(%rsp),%rbx
+ movq 0+224(%rsp),%r9
+ movq 8+224(%rsp),%r10
+ leaq -128+224(%rsp),%rsi
+ movq 16+224(%rsp),%r11
+ movq 24+224(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 320(%rsp),%rdx
+ leaq 320(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 320(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 256(%rsp),%rbx
+ leaq 320(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+.byte 102,72,15,126,199
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 352(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 352+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 544(%rsp),%xmm2
+ pand 544+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 448(%rsp),%xmm2
+ pand 448+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,64(%rdi)
+ movdqu %xmm3,80(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 288(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 288+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 480(%rsp),%xmm2
+ pand 480+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 384(%rsp),%xmm2
+ pand 384+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 320(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 320+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 512(%rsp),%xmm2
+ pand 512+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 416(%rsp),%xmm2
+ pand 416+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+
+.Ladd_donex:
+ leaq 576+56(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpoint_addx_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_point_addx,.-ecp_nistz256_point_addx
+.type ecp_nistz256_point_add_affinex,@function
+.align 32
+ecp_nistz256_point_add_affinex: /* uses %rdi = output (96 bytes written), %rsi = first input (96 bytes read), %rdx = second input (64 bytes read); presumably P-256 Jacobian + affine point addition, judging by the name - confirm against the generator */
+.cfi_startproc
+.Lpoint_add_affinex:
+ pushq %rbp /* save the six callee-saved registers this function overwrites */
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $480+8,%rsp /* reserve 488 bytes of stack scratch */
+.cfi_adjust_cfa_offset 32*15+8
+.Ladd_affinex_body:
+
+ movdqu 0(%rsi),%xmm0 /* load the 96 bytes of the first input into %xmm0-%xmm5 */
+ movq %rdx,%rbx /* keep the second input pointer in %rbx; %rdx is reused as an operand register */
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqu 64(%rsi),%xmm4
+ movdqu 80(%rsi),%xmm5
+ movq 64+0(%rsi),%rdx /* bytes 64..95 of the first input also go to %rdx,%r14,%r15,%r8 for the squaring call below */
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ movdqa %xmm0,320(%rsp) /* spill the first input to 320..415(%rsp) */
+ movdqa %xmm1,320+16(%rsp)
+ movdqa %xmm2,352(%rsp)
+ movdqa %xmm3,352+16(%rsp)
+ movdqa %xmm4,384(%rsp)
+ movdqa %xmm5,384+16(%rsp)
+ por %xmm4,%xmm5 /* start OR-folding bytes 64..95 of the first input for the all-zero test */
+
+ movdqu 0(%rbx),%xmm0 /* load the 64 bytes of the second input */
+ pshufd $0xb1,%xmm5,%xmm3
+ movdqu 16(%rbx),%xmm1
+ movdqu 32(%rbx),%xmm2
+ por %xmm3,%xmm5
+ movdqu 48(%rbx),%xmm3
+ movdqa %xmm0,416(%rsp) /* spill the second input to 416..479(%rsp) */
+ pshufd $0x1e,%xmm5,%xmm4
+ movdqa %xmm1,416+16(%rsp)
+ por %xmm0,%xmm1 /* start OR-folding the second input for its all-zero test */
+.byte 102,72,15,110,199 /* encodes movq %rdi,%xmm0: park the output pointer in %xmm0 */
+ movdqa %xmm2,448(%rsp)
+ movdqa %xmm3,448+16(%rsp)
+ por %xmm2,%xmm3
+ por %xmm4,%xmm5 /* dword 0 of %xmm5 now holds the OR of all eight dwords of bytes 64..95 */
+ pxor %xmm4,%xmm4
+ por %xmm1,%xmm3 /* %xmm3 = OR of the four 16-byte quarters of the second input */
+
+ leaq 64-128(%rsi),%rsi /* every helper call in this function gets a source pointer biased by -128 in %rsi */
+ leaq 32(%rsp),%rdi /* destination: 32(%rsp) */
+ call __ecp_nistz256_sqr_montx
+
+ pcmpeqd %xmm4,%xmm5 /* dword 0 = all-ones iff bytes 64..95 of the first input were all zero */
+ pshufd $0xb1,%xmm3,%xmm4
+ movq 0(%rbx),%rdx /* first qword of the second input is the multiplier word */
+
+ movq %r12,%r9 /* move %r12-%r15, as left by the call above, into %r9-%r12 for the multiply */
+ por %xmm3,%xmm4
+ pshufd $0,%xmm5,%xmm5 /* broadcast the first-input mask to all four dwords */
+ pshufd $0x1e,%xmm4,%xmm3
+ movq %r13,%r10
+ por %xmm3,%xmm4
+ pxor %xmm3,%xmm3
+ movq %r14,%r11
+ pcmpeqd %xmm3,%xmm4 /* dword 0 = all-ones iff all 64 bytes of the second input were zero */
+ pshufd $0,%xmm4,%xmm4 /* broadcast the second-input mask */
+
+ leaq 32-128(%rsp),%rsi
+ movq %r15,%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 320(%rsp),%rbx /* %rbx -> spilled first input, first 32 bytes */
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 384(%rsp),%rdx /* multiply 32(%rsp) by the spilled bytes 64..95 of the first input, result back into 32(%rsp) */
+ leaq 384(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 384(%rsp),%rdx /* multiply 64(%rsp) by the same operand; this result at 288(%rsp) feeds the output bytes 64..95 */
+ leaq 384(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 288(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 448(%rsp),%rdx /* multiply 32(%rsp) by bytes 32..63 of the second input */
+ leaq 448(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 352(%rsp),%rbx /* %rbx -> spilled bytes 32..63 of the first input */
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+64(%rsp),%rdx /* square 64(%rsp) into 128(%rsp) */
+ movq 8+64(%rsp),%r14
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r15
+ movq 24+64(%rsp),%r8
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 0+96(%rsp),%rdx /* square 96(%rsp) into 192(%rsp) */
+ movq 8+96(%rsp),%r14
+ leaq -128+96(%rsp),%rsi
+ movq 16+96(%rsp),%r15
+ movq 24+96(%rsp),%r8
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 128(%rsp),%rdx /* 160(%rsp) = 64(%rsp) times 128(%rsp) */
+ leaq 128(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 160(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 320(%rsp),%rdx /* 0(%rsp) = 128(%rsp) times the first 32 bytes of the first input */
+ leaq 320(%rsp),%rbx
+ movq 0+128(%rsp),%r9
+ movq 8+128(%rsp),%r10
+ leaq -128+128(%rsp),%rsi
+ movq 16+128(%rsp),%r11
+ movq 24+128(%rsp),%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+
+
+
+ xorq %r11,%r11 /* %r11 collects the carry out of the doubling below */
+ addq %r12,%r12 /* double the four-limb value in %r12,%r13,%r8,%r9 (as left by the multiply above) */
+ leaq 192(%rsp),%rsi
+ adcq %r13,%r13
+ movq %r12,%rax /* keep the unreduced limbs in %rax,%rbp,%rcx,%r10 */
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12 /* trial subtraction of the limbs -1, %r14, 0, %r15 (%r14/%r15 as left by the helpers; presumably the field modulus - confirm) */
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12 /* on borrow, restore the unreduced limbs */
+ movq 0(%rsi),%rax /* interleaved: load the four limbs at 192(%rsp) into %rax,%rbp,%rcx,%r10 */
+ cmovcq %rbp,%r13
+ movq 8(%rsi),%rbp
+ cmovcq %rcx,%r8
+ movq 16(%rsi),%rcx
+ cmovcq %r10,%r9
+ movq 24(%rsi),%r10
+
+ call __ecp_nistz256_subx
+
+ leaq 160(%rsp),%rbx
+ leaq 224(%rsp),%rdi /* 224(%rsp) later feeds the output bytes 0..31 */
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+0(%rsp),%rax /* load the four limbs at 0(%rsp) for the register-operand subtract */
+ movq 0+8(%rsp),%rbp
+ movq 0+16(%rsp),%rcx
+ movq 0+24(%rsp),%r10
+ leaq 64(%rsp),%rdi
+
+ call __ecp_nistz256_subx
+
+ movq %r12,0(%rdi) /* store the result limbs %r12,%r13,%r8,%r9 to 64(%rsp) */
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+ movq 352(%rsp),%rdx /* 32(%rsp) = 160(%rsp) times bytes 32..63 of the first input */
+ leaq 352(%rsp),%rbx
+ movq 0+160(%rsp),%r9
+ movq 8+160(%rsp),%r10
+ leaq -128+160(%rsp),%rsi
+ movq 16+160(%rsp),%r11
+ movq 24+160(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 96(%rsp),%rdx /* 64(%rsp) = 64(%rsp) times 96(%rsp) */
+ leaq 96(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 32(%rsp),%rbx
+ leaq 256(%rsp),%rdi /* 256(%rsp) later feeds the output bytes 32..63 */
+ call __ecp_nistz256_sub_fromx
+
+.byte 102,72,15,126,199 /* encodes movq %xmm0,%rdi: recover the output pointer parked at entry */
+
+ movdqa %xmm5,%xmm0 /* branch-free select for output bytes 64..95: where mask %xmm5 is set take .LONE_mont, else 288(%rsp) */
+ movdqa %xmm5,%xmm1
+ pandn 288(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 288+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand .LONE_mont(%rip),%xmm2
+ pand .LONE_mont+16(%rip),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0 /* second select: where mask %xmm4 is set take the spilled first-input bytes at 384(%rsp) instead */
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 384(%rsp),%xmm2
+ pand 384+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,64(%rdi)
+ movdqu %xmm3,80(%rdi)
+
+ movdqa %xmm5,%xmm0 /* output bytes 0..31: 224(%rsp), or second-input bytes 0..31 under mask %xmm5 */
+ movdqa %xmm5,%xmm1
+ pandn 224(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 224+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 416(%rsp),%xmm2
+ pand 416+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0 /* ... or first-input bytes 0..31 under mask %xmm4 */
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 320(%rsp),%xmm2
+ pand 320+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+
+ movdqa %xmm5,%xmm0 /* output bytes 32..63: 256(%rsp), or second-input bytes 32..63 under mask %xmm5 */
+ movdqa %xmm5,%xmm1
+ pandn 256(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 256+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 448(%rsp),%xmm2
+ pand 448+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0 /* ... or first-input bytes 32..63 under mask %xmm4 */
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 352(%rsp),%xmm2
+ pand 352+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+
+ leaq 480+56(%rsp),%rsi /* %rsi = address just above the six saved registers (488 bytes scratch + 48 bytes of pushes) */
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15 /* reload the saved registers relative to %rsi instead of popping */
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp /* release the whole frame; %rsp now points at the return address */
+.cfi_def_cfa_register %rsp
+.Ladd_affinex_epilogue:
+ .byte 0xf3,0xc3 /* encodes rep ret */
+.cfi_endproc
+.size ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex
diff --git a/secure/lib/libcrypto/amd64/ghash-x86_64.S b/secure/lib/libcrypto/amd64/ghash-x86_64.S
index 55ad7db1f240..078353528d5f 100644
--- a/secure/lib/libcrypto/amd64/ghash-x86_64.S
+++ b/secure/lib/libcrypto/amd64/ghash-x86_64.S
@@ -1304,7 +1304,108 @@ gcm_ghash_clmul:
.align 32
 gcm_init_avx:
 .cfi_startproc
- jmp .L_init_clmul
+ vzeroupper /* replaces the former jump to .L_init_clmul; uses %rdi = table to fill, %rsi = 16-byte input (presumably the GHASH key H - confirm) */
+
+ vmovdqu (%rsi),%xmm2 /* load the 16 input bytes */
+ vpshufd $78,%xmm2,%xmm2 /* swap the two 64-bit halves */
+
+
+ vpshufd $255,%xmm2,%xmm4 /* broadcast the top dword; its sign bit is the bit the shift below drops */
+ vpsrlq $63,%xmm2,%xmm3 /* 128-bit left shift by one: carry bits of each qword ... */
+ vpsllq $1,%xmm2,%xmm2 /* ... each qword shifted left ... */
+ vpxor %xmm5,%xmm5,%xmm5
+ vpcmpgtd %xmm4,%xmm5,%xmm5 /* all-ones iff the top dword was negative, i.e. the dropped bit was set */
+ vpslldq $8,%xmm3,%xmm3 /* ... low-qword carry moved under the high qword ... */
+ vpor %xmm3,%xmm2,%xmm2 /* ... and merged */
+
+
+ vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 /* keep the constant only when the dropped bit was set */
+ vpxor %xmm5,%xmm2,%xmm2 /* conditional XOR with .L0x1c2_polynomial */
+
+ vpunpckhqdq %xmm2,%xmm2,%xmm6 /* low qword of %xmm6 = high half ^ low half of %xmm2, the third multiply operand */
+ vmovdqa %xmm2,%xmm0 /* %xmm0 = running value, %xmm2 stays the fixed multiplier */
+ vpxor %xmm2,%xmm6,%xmm6
+ movq $4,%r10 /* four passes through the loop body */
+ jmp .Linit_start_avx
+.align 32
+.Linit_loop_avx:
+ vpalignr $8,%xmm3,%xmm4,%xmm5 /* combine the two folded values computed at the end of the previous pass */
+ vmovdqu %xmm5,-16(%rdi) /* ... and store them in the third 16-byte slot of the previous 48-byte group */
+ vpunpckhqdq %xmm0,%xmm0,%xmm3 /* multiply %xmm0 by %xmm2 with three carry-less multiplies: */
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 /* high halves */
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 /* low halves */
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 /* XOR-folded halves */
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3 /* middle term = folded product ^ high ^ low */
+
+ vpslldq $8,%xmm3,%xmm4 /* spread the middle term across the low (%xmm0) and high (%xmm1) halves */
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3 /* shift-and-XOR folding of %xmm1:%xmm0 back to 128 bits (presumably reduction modulo the GHASH polynomial - confirm) */
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0 /* %xmm0 = folded product */
+.Linit_start_avx:
+ vmovdqa %xmm0,%xmm5 /* remember the current value; it is stored at 0(%rdi) below */
+ vpunpckhqdq %xmm0,%xmm0,%xmm3 /* same multiply-and-fold sequence as above, producing the next value in %xmm0 */
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+ vpshufd $78,%xmm5,%xmm3 /* %xmm3 = previous value with halves swapped ... */
+ vpshufd $78,%xmm0,%xmm4
+ vpxor %xmm5,%xmm3,%xmm3 /* ... XORed with itself: both qwords hold high ^ low */
+ vmovdqu %xmm5,0(%rdi) /* slot 0 of this 48-byte group: previous value */
+ vpxor %xmm0,%xmm4,%xmm4 /* same fold for the new value */
+ vmovdqu %xmm0,16(%rdi) /* slot 1: new value */
+ leaq 48(%rdi),%rdi /* advance one group; slot 2 is written via -16(%rdi) */
+ subq $1,%r10
+ jnz .Linit_loop_avx
+
+ vpalignr $8,%xmm4,%xmm3,%xmm5 /* last group: combine its folded halves (operand order differs from the in-loop vpalignr) */
+ vmovdqu %xmm5,-16(%rdi)
+
+ vzeroupper
+ .byte 0xf3,0xc3 /* encodes rep ret */
 .cfi_endproc
 .size gcm_init_avx,.-gcm_init_avx
.globl gcm_gmult_avx
@@ -1320,7 +1421,377 @@ gcm_gmult_avx:
.align 32
 gcm_ghash_avx:
 .cfi_startproc
- jmp .L_ghash_clmul
+ vzeroupper /* replaces the former jump to .L_ghash_clmul; uses %rdi = 16-byte running value (read and written), %rsi = precomputed table, %rdx = input, %rcx = input length in bytes */
+
+ vmovdqu (%rdi),%xmm10 /* load the running value */
+ leaq .L0x1c2_polynomial(%rip),%r10 /* %r10 -> constant used by the folding steps */
+ leaq 64(%rsi),%rsi /* bias the table pointer by 64; table offsets below are written as N-64 */
+ vmovdqu .Lbswap_mask(%rip),%xmm13 /* %xmm13 = byte-shuffle mask applied to every 16-byte block */
+ vpshufb %xmm13,%xmm10,%xmm10
+ cmpq $0x80,%rcx
+ jb .Lshort_avx /* fewer than 128 bytes: short path */
+ subq $0x80,%rcx
+
+ vmovdqu 112(%rdx),%xmm14 /* first 128-byte group: blocks are consumed from offset 112 down to 0, each paired with successive table entries */
+ vmovdqu 0-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vmovdqu 32-64(%rsi),%xmm7
+
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 /* per block: low*low, high*high and folded-halves products go to three accumulator chains */
+ vpxor %xmm14,%xmm9,%xmm9
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 80(%rdx),%xmm14
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 48-64(%rsi),%xmm6
+ vpxor %xmm14,%xmm9,%xmm9
+ vmovdqu 64(%rdx),%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 /* selector 0x10 uses the upper qword of the table entry in %xmm7 */
+ vmovdqu 80-64(%rsi),%xmm7
+
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 48(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 32(%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 16(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu (%rdx),%xmm15 /* block at offset 0 of the group; the running value is XORed into it before it is multiplied */
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+
+ leaq 128(%rdx),%rdx /* advance past the group just loaded */
+ cmpq $0x80,%rcx
+ jb .Ltail_avx /* no further full group: finish in the tail */
+
+ vpxor %xmm10,%xmm15,%xmm15 /* fold the running value into the pending block */
+ subq $0x80,%rcx
+ jmp .Loop8x_avx
+
+.align 32
+.Loop8x_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8 /* each pass finishes the previous group (into %xmm10,%xmm11,%xmm12) while loading and multiplying the next 128 bytes */
+ vmovdqu 112(%rdx),%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpxor %xmm15,%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
+ vmovdqu 0-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
+ vmovdqu 32-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm3,%xmm10,%xmm10
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vxorps %xmm4,%xmm11,%xmm11
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm5,%xmm12,%xmm12
+ vxorps %xmm15,%xmm8,%xmm8
+
+ vmovdqu 80(%rdx),%xmm14
+ vpxor %xmm10,%xmm12,%xmm12 /* middle term ^= low ^ high, then split across the two halves */
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm11,%xmm12,%xmm12
+ vpslldq $8,%xmm12,%xmm9
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vpsrldq $8,%xmm12,%xmm12
+ vpxor %xmm9,%xmm10,%xmm10
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vxorps %xmm12,%xmm11,%xmm11
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 64(%rdx),%xmm15
+ vpalignr $8,%xmm10,%xmm10,%xmm12 /* first folding step: swap halves of %xmm10 ... */
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vxorps %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vmovdqu 48(%rdx),%xmm14
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 /* ... and multiply its low qword by the upper qword of .L0x1c2_polynomial */
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 32(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+ vxorps %xmm12,%xmm10,%xmm10 /* complete the first folding step */
+
+ vmovdqu 16(%rdx),%xmm14
+ vpalignr $8,%xmm10,%xmm10,%xmm12 /* second folding step, same shape as the first */
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vxorps %xmm11,%xmm12,%xmm12
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu (%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm12,%xmm15,%xmm15 /* XOR the folded result of the previous group into the block at offset 0 */
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+ vpxor %xmm10,%xmm15,%xmm15
+
+ leaq 128(%rdx),%rdx
+ subq $0x80,%rcx
+ jnc .Loop8x_avx /* loop while at least another 128 bytes remained before the subtraction */
+
+ addq $0x80,%rcx /* undo the last subtraction: %rcx = leftover byte count */
+ jmp .Ltail_no_xor_avx /* the running value is already folded into %xmm15 */
+
+.align 32
+.Lshort_avx:
+ vmovdqu -16(%rdx,%rcx,1),%xmm14 /* short path: start from the last 16 bytes of the remaining input and walk backwards (each step subtracts 16 and tests for zero, so %rcx is presumably a multiple of 16 - confirm with callers) */
+ leaq (%rdx,%rcx,1),%rdx /* %rdx = end of input; blocks are addressed at negative offsets */
+ vmovdqu 0-64(%rsi),%xmm6
+ vmovdqu 32-64(%rsi),%xmm7
+ vpshufb %xmm13,%xmm14,%xmm15
+
+ vmovdqa %xmm0,%xmm3 /* copy %xmm0-%xmm2 so the next vpxor of each pair yields zero: clears the accumulators whatever they held */
+ vmovdqa %xmm1,%xmm4
+ vmovdqa %xmm2,%xmm5
+ subq $0x10,%rcx
+ jz .Ltail_avx /* that was the only block */
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -32(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7 /* move the upper qword of the table entry down for the next multiply */
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -48(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 80-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -64(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -80(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 96-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 128-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -96(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -112(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 144-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovq 184-64(%rsi),%xmm7 /* 8-byte load: just the upper qword of the table entry at 176-64 */
+ subq $0x10,%rcx
+ jmp .Ltail_avx
+
+.align 32
+.Ltail_avx:
+ vpxor %xmm10,%xmm15,%xmm15 /* fold the running value into the last pending block */
+.Ltail_no_xor_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8 /* multiply the last pending block and close the three accumulator chains */
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+
+ vmovdqu (%r10),%xmm12 /* %xmm12 = .L0x1c2_polynomial */
+
+ vpxor %xmm0,%xmm3,%xmm10 /* %xmm10 = low sum, %xmm11 = high sum, %xmm5 = middle sum */
+ vpxor %xmm1,%xmm4,%xmm11
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vpxor %xmm10,%xmm5,%xmm5 /* middle ^= low ^ high, then split across the two halves */
+ vpxor %xmm11,%xmm5,%xmm5
+ vpslldq $8,%xmm5,%xmm9
+ vpsrldq $8,%xmm5,%xmm5
+ vpxor %xmm9,%xmm10,%xmm10
+ vpxor %xmm5,%xmm11,%xmm11
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 /* first folding step (presumably reduction modulo the GHASH polynomial - confirm) */
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 /* second folding step, then merge the high half */
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm11,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ cmpq $0,%rcx
+ jne .Lshort_avx /* leftover bytes: run the short path on them */
+
+ vpshufb %xmm13,%xmm10,%xmm10 /* apply the byte shuffle again and store the running value back */
+ vmovdqu %xmm10,(%rdi)
+ vzeroupper
+ .byte 0xf3,0xc3 /* encodes rep ret */
 .cfi_endproc
 .size gcm_ghash_avx,.-gcm_ghash_avx
.align 64
diff --git a/secure/lib/libcrypto/amd64/poly1305-x86_64.S b/secure/lib/libcrypto/amd64/poly1305-x86_64.S
index d74ee9b45052..c5a1f45fc5de 100644
--- a/secure/lib/libcrypto/amd64/poly1305-x86_64.S
+++ b/secure/lib/libcrypto/amd64/poly1305-x86_64.S
@@ -25,6 +25,15 @@ poly1305_init:
leaq poly1305_blocks(%rip),%r10
leaq poly1305_emit(%rip),%r11
+ movq OPENSSL_ia32cap_P+4(%rip),%r9
+ leaq poly1305_blocks_avx(%rip),%rax
+ leaq poly1305_emit_avx(%rip),%rcx
+ btq $28,%r9
+ cmovcq %rax,%r10
+ cmovcq %rcx,%r11
+ leaq poly1305_blocks_avx2(%rip),%rax
+ btq $37,%r9
+ cmovcq %rax,%r10
movq $0x0ffffffc0fffffff,%rax
movq $0x0ffffffc0ffffffc,%rcx
andq 0(%rsi),%rax
@@ -180,6 +189,1782 @@ poly1305_emit:
.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_emit,.-poly1305_emit
+.type __poly1305_block,@function
+.align 32
+__poly1305_block: /* private register convention: in %r14,%rbx,%rbp (three-limb accumulator), %r11, %r13 and %rax (multiplier words); out %r14,%rbx,%rbp; overwrites %rax,%rdx,%r8,%r9,%r10 */
+.cfi_startproc
+ mulq %r14 /* %rdx:%rax = %rax * %r14 (visible callers pass the same value in %rax and %r12) */
+ movq %rax,%r9
+ movq %r11,%rax
+ movq %rdx,%r10
+
+ mulq %r14 /* %r11 * %r14 */
+ movq %rax,%r14
+ movq %r11,%rax
+ movq %rdx,%r8
+
+ mulq %rbx /* %r11 * %rbx */
+ addq %rax,%r9
+ movq %r13,%rax
+ adcq %rdx,%r10
+
+ mulq %rbx /* %r13 * %rbx (visible callers set %r13 = x + (x>>2), x being the second multiplier word) */
+ movq %rbp,%rbx
+ addq %rax,%r14
+ adcq %rdx,%r8
+
+ imulq %r13,%rbx /* %rbp * %r13, low 64 bits only */
+ addq %rbx,%r9
+ movq %r8,%rbx
+ adcq $0,%r10
+
+ imulq %r11,%rbp /* %rbp * %r11, low 64 bits only */
+ addq %r9,%rbx
+ movq $-4,%rax
+ adcq %rbp,%r10 /* %r10:%rbx:%r14 = accumulated product */
+
+ andq %r10,%rax /* %rax = top limb with its low two bits cleared */
+ movq %r10,%rbp
+ shrq $2,%r10
+ andq $3,%rbp /* keep only the low two bits as the new top limb */
+ addq %r10,%rax /* %rax = (top & ~3) + (top >> 2), i.e. 5 * (top >> 2); presumably the wrap-around modulo 2^130-5 - confirm */
+ addq %rax,%r14 /* add it back into the low limb and propagate the carry */
+ adcq $0,%rbx
+ adcq $0,%rbp
+ .byte 0xf3,0xc3 /* encodes rep ret */
+.cfi_endproc
+.size __poly1305_block,.-__poly1305_block
+
+.type __poly1305_init_avx,@function
+.align 32
+__poly1305_init_avx: /* in: %rdi = context, %r11/%r12 = two multiplier words, %r13 as __poly1305_block expects; fills a table of 32-bit pieces at 48..191(%rdi) and returns with %rdi restored */
+.cfi_startproc
+ movq %r11,%r14 /* seed the accumulator (%r14,%rbx,%rbp) with the multiplier itself */
+ movq %r12,%rbx
+ xorq %rbp,%rbp
+
+ leaq 48+64(%rdi),%rdi /* bias %rdi by 112 so the table is addressed at -64..+76 */
+
+ movq %r12,%rax
+ call __poly1305_block /* first multiply (presumably yields the square of the multiplier - confirm) */
+
+ movl $0x3ffffff,%eax /* 0x3ffffff masks 26 bits; pieces are cut from both the new result (%r14,%rbx,%rbp) and the original words (%r11,%r12) */
+ movl $0x3ffffff,%edx
+ movq %r14,%r8
+ andl %r14d,%eax
+ movq %r11,%r9
+ andl %r11d,%edx
+ movl %eax,-64(%rdi) /* piece 0: result at dword 0 of the row, original at dword 1 */
+ shrq $26,%r8
+ movl %edx,-60(%rdi)
+ shrq $26,%r9
+
+ movl $0x3ffffff,%eax
+ movl $0x3ffffff,%edx
+ andl %r8d,%eax
+ andl %r9d,%edx
+ movl %eax,-48(%rdi) /* piece 1 */
+ leal (%rax,%rax,4),%eax /* times 5 */
+ movl %edx,-44(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ movl %eax,-32(%rdi) /* 5 * piece 1 */
+ shrq $26,%r8
+ movl %edx,-28(%rdi)
+ shrq $26,%r9
+
+ movq %rbx,%rax
+ movq %r12,%rdx
+ shlq $12,%rax /* piece 2 straddles the two 64-bit words: top 12 bits of the low word plus low 14 bits of the next */
+ shlq $12,%rdx
+ orq %r8,%rax
+ orq %r9,%rdx
+ andl $0x3ffffff,%eax
+ andl $0x3ffffff,%edx
+ movl %eax,-16(%rdi) /* piece 2 */
+ leal (%rax,%rax,4),%eax
+ movl %edx,-12(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ movl %eax,0(%rdi) /* 5 * piece 2 */
+ movq %rbx,%r8
+ movl %edx,4(%rdi)
+ movq %r12,%r9
+
+ movl $0x3ffffff,%eax
+ movl $0x3ffffff,%edx
+ shrq $14,%r8
+ shrq $14,%r9
+ andl %r8d,%eax
+ andl %r9d,%edx
+ movl %eax,16(%rdi) /* piece 3 */
+ leal (%rax,%rax,4),%eax
+ movl %edx,20(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ movl %eax,32(%rdi) /* 5 * piece 3 */
+ shrq $26,%r8
+ movl %edx,36(%rdi)
+ shrq $26,%r9
+
+ movq %rbp,%rax
+ shlq $24,%rax /* piece 4 of the result also takes the top limb %rbp, shifted up by 24 */
+ orq %rax,%r8
+ movl %r8d,48(%rdi) /* piece 4 */
+ leaq (%r8,%r8,4),%r8
+ movl %r9d,52(%rdi)
+ leaq (%r9,%r9,4),%r9
+ movl %r8d,64(%rdi) /* 5 * piece 4 */
+ movl %r9d,68(%rdi)
+
+ movq %r12,%rax
+ call __poly1305_block /* second multiply; its pieces go to dword 3 of each row (offsets ending in +12) */
+
+ movl $0x3ffffff,%eax
+ movq %r14,%r8
+ andl %r14d,%eax
+ shrq $26,%r8
+ movl %eax,-52(%rdi)
+
+ movl $0x3ffffff,%edx
+ andl %r8d,%edx
+ movl %edx,-36(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ shrq $26,%r8
+ movl %edx,-20(%rdi)
+
+ movq %rbx,%rax
+ shlq $12,%rax
+ orq %r8,%rax
+ andl $0x3ffffff,%eax
+ movl %eax,-4(%rdi)
+ leal (%rax,%rax,4),%eax
+ movq %rbx,%r8
+ movl %eax,12(%rdi)
+
+ movl $0x3ffffff,%edx
+ shrq $14,%r8
+ andl %r8d,%edx
+ movl %edx,28(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ shrq $26,%r8
+ movl %edx,44(%rdi)
+
+ movq %rbp,%rax
+ shlq $24,%rax
+ orq %rax,%r8
+ movl %r8d,60(%rdi)
+ leaq (%r8,%r8,4),%r8
+ movl %r8d,76(%rdi)
+
+ movq %r12,%rax
+ call __poly1305_block /* third multiply; its pieces go to dword 2 of each row (offsets ending in +8) */
+
+ movl $0x3ffffff,%eax
+ movq %r14,%r8
+ andl %r14d,%eax
+ shrq $26,%r8
+ movl %eax,-56(%rdi)
+
+ movl $0x3ffffff,%edx
+ andl %r8d,%edx
+ movl %edx,-40(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ shrq $26,%r8
+ movl %edx,-24(%rdi)
+
+ movq %rbx,%rax
+ shlq $12,%rax
+ orq %r8,%rax
+ andl $0x3ffffff,%eax
+ movl %eax,-8(%rdi)
+ leal (%rax,%rax,4),%eax
+ movq %rbx,%r8
+ movl %eax,8(%rdi)
+
+ movl $0x3ffffff,%edx
+ shrq $14,%r8
+ andl %r8d,%edx
+ movl %edx,24(%rdi)
+ leal (%rdx,%rdx,4),%edx
+ shrq $26,%r8
+ movl %edx,40(%rdi)
+
+ movq %rbp,%rax
+ shlq $24,%rax
+ orq %rax,%r8
+ movl %r8d,56(%rdi)
+ leaq (%r8,%r8,4),%r8
+ movl %r8d,72(%rdi)
+
+ leaq -48-64(%rdi),%rdi /* undo the 112-byte bias */
+ .byte 0xf3,0xc3 /* encodes rep ret */
+.cfi_endproc
+.size __poly1305_init_avx,.-__poly1305_init_avx
+
+.type poly1305_blocks_avx,@function
+.align 32
+poly1305_blocks_avx:
+.cfi_startproc
+ movl 20(%rdi),%r8d
+ cmpq $128,%rdx
+ jae .Lblocks_avx
+ testl %r8d,%r8d
+ jz .Lblocks
+
+.Lblocks_avx:
+ andq $-16,%rdx
+ jz .Lno_data_avx
+
+ vzeroupper
+
+ testl %r8d,%r8d
+ jz .Lbase2_64_avx
+
+ testq $31,%rdx
+ jz .Leven_avx
+
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lblocks_avx_body:
+
+ movq %rdx,%r15
+
+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movl 16(%rdi),%ebp
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+
+ movl %r8d,%r14d
+ andq $-2147483648,%r8
+ movq %r9,%r12
+ movl %r9d,%ebx
+ andq $-2147483648,%r9
+
+ shrq $6,%r8
+ shlq $52,%r12
+ addq %r8,%r14
+ shrq $12,%rbx
+ shrq $18,%r9
+ addq %r12,%r14
+ adcq %r9,%rbx
+
+ movq %rbp,%r8
+ shlq $40,%r8
+ shrq $24,%rbp
+ addq %r8,%rbx
+ adcq $0,%rbp
+
+ movq $-4,%r9
+ movq %rbp,%r8
+ andq %rbp,%r9
+ shrq $2,%r8
+ andq $3,%rbp
+ addq %r9,%r8
+ addq %r8,%r14
+ adcq $0,%rbx
+ adcq $0,%rbp
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%rbp
+
+ call __poly1305_block
+
+ testq %rcx,%rcx
+ jz .Lstore_base2_64_avx
+
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r11
+ movq %rbx,%r12
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r11
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r11,%r14
+ shlq $24,%rbp
+ andq $0x3ffffff,%r14
+ shrq $40,%r12
+ andq $0x3ffffff,%rbx
+ orq %r12,%rbp
+
+ subq $16,%r15
+ jz .Lstore_base2_26_avx
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %ebp,%xmm4
+ jmp .Lproceed_avx
+
+.align 32
+.Lstore_base2_64_avx:
+ movq %r14,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rbp,16(%rdi)
+ jmp .Ldone_avx
+
+.align 16
+.Lstore_base2_26_avx:
+ movl %eax,0(%rdi)
+ movl %edx,4(%rdi)
+ movl %r14d,8(%rdi)
+ movl %ebx,12(%rdi)
+ movl %ebp,16(%rdi)
+.align 16
+.Ldone_avx:
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbp
+.cfi_restore %rbp
+ movq 40(%rsp),%rbx
+.cfi_restore %rbx
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lno_data_avx:
+.Lblocks_avx_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+
+.align 32
+.Lbase2_64_avx:
+.cfi_startproc
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lbase2_64_avx_body:
+
+ movq %rdx,%r15
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+ movq 0(%rdi),%r14
+ movq 8(%rdi),%rbx
+ movl 16(%rdi),%ebp
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+ testq $31,%rdx
+ jz .Linit_avx
+
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%rbp
+ subq $16,%r15
+
+ call __poly1305_block
+
+.Linit_avx:
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r8
+ movq %rbx,%r9
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r8
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r8,%r14
+ shlq $24,%rbp
+ andq $0x3ffffff,%r14
+ shrq $40,%r9
+ andq $0x3ffffff,%rbx
+ orq %r9,%rbp
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %ebp,%xmm4
+ movl $1,20(%rdi)
+
+ call __poly1305_init_avx
+
+.Lproceed_avx:
+ movq %r15,%rdx
+
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbp
+.cfi_restore %rbp
+ movq 40(%rsp),%rbx
+.cfi_restore %rbx
+ leaq 48(%rsp),%rax
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lbase2_64_avx_epilogue:
+ jmp .Ldo_avx
+.cfi_endproc
+
+.align 32
+.Leven_avx:
+.cfi_startproc
+ vmovd 0(%rdi),%xmm0
+ vmovd 4(%rdi),%xmm1
+ vmovd 8(%rdi),%xmm2
+ vmovd 12(%rdi),%xmm3
+ vmovd 16(%rdi),%xmm4
+
+.Ldo_avx:
+ leaq -88(%rsp),%r11
+.cfi_def_cfa %r11,0x60
+ subq $0x178,%rsp
+ subq $64,%rdx
+ leaq -32(%rsi),%rax
+ cmovcq %rax,%rsi
+
+ vmovdqu 48(%rdi),%xmm14
+ leaq 112(%rdi),%rdi
+ leaq .Lconst(%rip),%rcx
+
+
+
+ vmovdqu 32(%rsi),%xmm5
+ vmovdqu 48(%rsi),%xmm6
+ vmovdqa 64(%rcx),%xmm15
+
+ vpsrldq $6,%xmm5,%xmm7
+ vpsrldq $6,%xmm6,%xmm8
+ vpunpckhqdq %xmm6,%xmm5,%xmm9
+ vpunpcklqdq %xmm6,%xmm5,%xmm5
+ vpunpcklqdq %xmm8,%xmm7,%xmm8
+
+ vpsrlq $40,%xmm9,%xmm9
+ vpsrlq $26,%xmm5,%xmm6
+ vpand %xmm15,%xmm5,%xmm5
+ vpsrlq $4,%xmm8,%xmm7
+ vpand %xmm15,%xmm6,%xmm6
+ vpsrlq $30,%xmm8,%xmm8
+ vpand %xmm15,%xmm7,%xmm7
+ vpand %xmm15,%xmm8,%xmm8
+ vpor 32(%rcx),%xmm9,%xmm9
+
+ jbe .Lskip_loop_avx
+
+
+ vmovdqu -48(%rdi),%xmm11
+ vmovdqu -32(%rdi),%xmm12
+ vpshufd $0xEE,%xmm14,%xmm13
+ vpshufd $0x44,%xmm14,%xmm10
+ vmovdqa %xmm13,-144(%r11)
+ vmovdqa %xmm10,0(%rsp)
+ vpshufd $0xEE,%xmm11,%xmm14
+ vmovdqu -16(%rdi),%xmm10
+ vpshufd $0x44,%xmm11,%xmm11
+ vmovdqa %xmm14,-128(%r11)
+ vmovdqa %xmm11,16(%rsp)
+ vpshufd $0xEE,%xmm12,%xmm13
+ vmovdqu 0(%rdi),%xmm11
+ vpshufd $0x44,%xmm12,%xmm12
+ vmovdqa %xmm13,-112(%r11)
+ vmovdqa %xmm12,32(%rsp)
+ vpshufd $0xEE,%xmm10,%xmm14
+ vmovdqu 16(%rdi),%xmm12
+ vpshufd $0x44,%xmm10,%xmm10
+ vmovdqa %xmm14,-96(%r11)
+ vmovdqa %xmm10,48(%rsp)
+ vpshufd $0xEE,%xmm11,%xmm13
+ vmovdqu 32(%rdi),%xmm10
+ vpshufd $0x44,%xmm11,%xmm11
+ vmovdqa %xmm13,-80(%r11)
+ vmovdqa %xmm11,64(%rsp)
+ vpshufd $0xEE,%xmm12,%xmm14
+ vmovdqu 48(%rdi),%xmm11
+ vpshufd $0x44,%xmm12,%xmm12
+ vmovdqa %xmm14,-64(%r11)
+ vmovdqa %xmm12,80(%rsp)
+ vpshufd $0xEE,%xmm10,%xmm13
+ vmovdqu 64(%rdi),%xmm12
+ vpshufd $0x44,%xmm10,%xmm10
+ vmovdqa %xmm13,-48(%r11)
+ vmovdqa %xmm10,96(%rsp)
+ vpshufd $0xEE,%xmm11,%xmm14
+ vpshufd $0x44,%xmm11,%xmm11
+ vmovdqa %xmm14,-32(%r11)
+ vmovdqa %xmm11,112(%rsp)
+ vpshufd $0xEE,%xmm12,%xmm13
+ vmovdqa 0(%rsp),%xmm14
+ vpshufd $0x44,%xmm12,%xmm12
+ vmovdqa %xmm13,-16(%r11)
+ vmovdqa %xmm12,128(%rsp)
+
+ jmp .Loop_avx
+
+.align 32
+.Loop_avx:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ vpmuludq %xmm5,%xmm14,%xmm10
+ vpmuludq %xmm6,%xmm14,%xmm11
+ vmovdqa %xmm2,32(%r11)
+ vpmuludq %xmm7,%xmm14,%xmm12
+ vmovdqa 16(%rsp),%xmm2
+ vpmuludq %xmm8,%xmm14,%xmm13
+ vpmuludq %xmm9,%xmm14,%xmm14
+
+ vmovdqa %xmm0,0(%r11)
+ vpmuludq 32(%rsp),%xmm9,%xmm0
+ vmovdqa %xmm1,16(%r11)
+ vpmuludq %xmm8,%xmm2,%xmm1
+ vpaddq %xmm0,%xmm10,%xmm10
+ vpaddq %xmm1,%xmm14,%xmm14
+ vmovdqa %xmm3,48(%r11)
+ vpmuludq %xmm7,%xmm2,%xmm0
+ vpmuludq %xmm6,%xmm2,%xmm1
+ vpaddq %xmm0,%xmm13,%xmm13
+ vmovdqa 48(%rsp),%xmm3
+ vpaddq %xmm1,%xmm12,%xmm12
+ vmovdqa %xmm4,64(%r11)
+ vpmuludq %xmm5,%xmm2,%xmm2
+ vpmuludq %xmm7,%xmm3,%xmm0
+ vpaddq %xmm2,%xmm11,%xmm11
+
+ vmovdqa 64(%rsp),%xmm4
+ vpaddq %xmm0,%xmm14,%xmm14
+ vpmuludq %xmm6,%xmm3,%xmm1
+ vpmuludq %xmm5,%xmm3,%xmm3
+ vpaddq %xmm1,%xmm13,%xmm13
+ vmovdqa 80(%rsp),%xmm2
+ vpaddq %xmm3,%xmm12,%xmm12
+ vpmuludq %xmm9,%xmm4,%xmm0
+ vpmuludq %xmm8,%xmm4,%xmm4
+ vpaddq %xmm0,%xmm11,%xmm11
+ vmovdqa 96(%rsp),%xmm3
+ vpaddq %xmm4,%xmm10,%xmm10
+
+ vmovdqa 128(%rsp),%xmm4
+ vpmuludq %xmm6,%xmm2,%xmm1
+ vpmuludq %xmm5,%xmm2,%xmm2
+ vpaddq %xmm1,%xmm14,%xmm14
+ vpaddq %xmm2,%xmm13,%xmm13
+ vpmuludq %xmm9,%xmm3,%xmm0
+ vpmuludq %xmm8,%xmm3,%xmm1
+ vpaddq %xmm0,%xmm12,%xmm12
+ vmovdqu 0(%rsi),%xmm0
+ vpaddq %xmm1,%xmm11,%xmm11
+ vpmuludq %xmm7,%xmm3,%xmm3
+ vpmuludq %xmm7,%xmm4,%xmm7
+ vpaddq %xmm3,%xmm10,%xmm10
+
+ vmovdqu 16(%rsi),%xmm1
+ vpaddq %xmm7,%xmm11,%xmm11
+ vpmuludq %xmm8,%xmm4,%xmm8
+ vpmuludq %xmm9,%xmm4,%xmm9
+ vpsrldq $6,%xmm0,%xmm2
+ vpaddq %xmm8,%xmm12,%xmm12
+ vpaddq %xmm9,%xmm13,%xmm13
+ vpsrldq $6,%xmm1,%xmm3
+ vpmuludq 112(%rsp),%xmm5,%xmm9
+ vpmuludq %xmm6,%xmm4,%xmm5
+ vpunpckhqdq %xmm1,%xmm0,%xmm4
+ vpaddq %xmm9,%xmm14,%xmm14
+ vmovdqa -144(%r11),%xmm9
+ vpaddq %xmm5,%xmm10,%xmm10
+
+ vpunpcklqdq %xmm1,%xmm0,%xmm0
+ vpunpcklqdq %xmm3,%xmm2,%xmm3
+
+
+ vpsrldq $5,%xmm4,%xmm4
+ vpsrlq $26,%xmm0,%xmm1
+ vpand %xmm15,%xmm0,%xmm0
+ vpsrlq $4,%xmm3,%xmm2
+ vpand %xmm15,%xmm1,%xmm1
+ vpand 0(%rcx),%xmm4,%xmm4
+ vpsrlq $30,%xmm3,%xmm3
+ vpand %xmm15,%xmm2,%xmm2
+ vpand %xmm15,%xmm3,%xmm3
+ vpor 32(%rcx),%xmm4,%xmm4
+
+ vpaddq 0(%r11),%xmm0,%xmm0
+ vpaddq 16(%r11),%xmm1,%xmm1
+ vpaddq 32(%r11),%xmm2,%xmm2
+ vpaddq 48(%r11),%xmm3,%xmm3
+ vpaddq 64(%r11),%xmm4,%xmm4
+
+ leaq 32(%rsi),%rax
+ leaq 64(%rsi),%rsi
+ subq $64,%rdx
+ cmovcq %rax,%rsi
+
+
+
+
+
+
+
+
+
+
+ vpmuludq %xmm0,%xmm9,%xmm5
+ vpmuludq %xmm1,%xmm9,%xmm6
+ vpaddq %xmm5,%xmm10,%xmm10
+ vpaddq %xmm6,%xmm11,%xmm11
+ vmovdqa -128(%r11),%xmm7
+ vpmuludq %xmm2,%xmm9,%xmm5
+ vpmuludq %xmm3,%xmm9,%xmm6
+ vpaddq %xmm5,%xmm12,%xmm12
+ vpaddq %xmm6,%xmm13,%xmm13
+ vpmuludq %xmm4,%xmm9,%xmm9
+ vpmuludq -112(%r11),%xmm4,%xmm5
+ vpaddq %xmm9,%xmm14,%xmm14
+
+ vpaddq %xmm5,%xmm10,%xmm10
+ vpmuludq %xmm2,%xmm7,%xmm6
+ vpmuludq %xmm3,%xmm7,%xmm5
+ vpaddq %xmm6,%xmm13,%xmm13
+ vmovdqa -96(%r11),%xmm8
+ vpaddq %xmm5,%xmm14,%xmm14
+ vpmuludq %xmm1,%xmm7,%xmm6
+ vpmuludq %xmm0,%xmm7,%xmm7
+ vpaddq %xmm6,%xmm12,%xmm12
+ vpaddq %xmm7,%xmm11,%xmm11
+
+ vmovdqa -80(%r11),%xmm9
+ vpmuludq %xmm2,%xmm8,%xmm5
+ vpmuludq %xmm1,%xmm8,%xmm6
+ vpaddq %xmm5,%xmm14,%xmm14
+ vpaddq %xmm6,%xmm13,%xmm13
+ vmovdqa -64(%r11),%xmm7
+ vpmuludq %xmm0,%xmm8,%xmm8
+ vpmuludq %xmm4,%xmm9,%xmm5
+ vpaddq %xmm8,%xmm12,%xmm12
+ vpaddq %xmm5,%xmm11,%xmm11
+ vmovdqa -48(%r11),%xmm8
+ vpmuludq %xmm3,%xmm9,%xmm9
+ vpmuludq %xmm1,%xmm7,%xmm6
+ vpaddq %xmm9,%xmm10,%xmm10
+
+ vmovdqa -16(%r11),%xmm9
+ vpaddq %xmm6,%xmm14,%xmm14
+ vpmuludq %xmm0,%xmm7,%xmm7
+ vpmuludq %xmm4,%xmm8,%xmm5
+ vpaddq %xmm7,%xmm13,%xmm13
+ vpaddq %xmm5,%xmm12,%xmm12
+ vmovdqu 32(%rsi),%xmm5
+ vpmuludq %xmm3,%xmm8,%xmm7
+ vpmuludq %xmm2,%xmm8,%xmm8
+ vpaddq %xmm7,%xmm11,%xmm11
+ vmovdqu 48(%rsi),%xmm6
+ vpaddq %xmm8,%xmm10,%xmm10
+
+ vpmuludq %xmm2,%xmm9,%xmm2
+ vpmuludq %xmm3,%xmm9,%xmm3
+ vpsrldq $6,%xmm5,%xmm7
+ vpaddq %xmm2,%xmm11,%xmm11
+ vpmuludq %xmm4,%xmm9,%xmm4
+ vpsrldq $6,%xmm6,%xmm8
+ vpaddq %xmm3,%xmm12,%xmm2
+ vpaddq %xmm4,%xmm13,%xmm3
+ vpmuludq -32(%r11),%xmm0,%xmm4
+ vpmuludq %xmm1,%xmm9,%xmm0
+ vpunpckhqdq %xmm6,%xmm5,%xmm9
+ vpaddq %xmm4,%xmm14,%xmm4
+ vpaddq %xmm0,%xmm10,%xmm0
+
+ vpunpcklqdq %xmm6,%xmm5,%xmm5
+ vpunpcklqdq %xmm8,%xmm7,%xmm8
+
+
+ vpsrldq $5,%xmm9,%xmm9
+ vpsrlq $26,%xmm5,%xmm6
+ vmovdqa 0(%rsp),%xmm14
+ vpand %xmm15,%xmm5,%xmm5
+ vpsrlq $4,%xmm8,%xmm7
+ vpand %xmm15,%xmm6,%xmm6
+ vpand 0(%rcx),%xmm9,%xmm9
+ vpsrlq $30,%xmm8,%xmm8
+ vpand %xmm15,%xmm7,%xmm7
+ vpand %xmm15,%xmm8,%xmm8
+ vpor 32(%rcx),%xmm9,%xmm9
+
+
+
+
+
+ vpsrlq $26,%xmm3,%xmm13
+ vpand %xmm15,%xmm3,%xmm3
+ vpaddq %xmm13,%xmm4,%xmm4
+
+ vpsrlq $26,%xmm0,%xmm10
+ vpand %xmm15,%xmm0,%xmm0
+ vpaddq %xmm10,%xmm11,%xmm1
+
+ vpsrlq $26,%xmm4,%xmm10
+ vpand %xmm15,%xmm4,%xmm4
+
+ vpsrlq $26,%xmm1,%xmm11
+ vpand %xmm15,%xmm1,%xmm1
+ vpaddq %xmm11,%xmm2,%xmm2
+
+ vpaddq %xmm10,%xmm0,%xmm0
+ vpsllq $2,%xmm10,%xmm10
+ vpaddq %xmm10,%xmm0,%xmm0
+
+ vpsrlq $26,%xmm2,%xmm12
+ vpand %xmm15,%xmm2,%xmm2
+ vpaddq %xmm12,%xmm3,%xmm3
+
+ vpsrlq $26,%xmm0,%xmm10
+ vpand %xmm15,%xmm0,%xmm0
+ vpaddq %xmm10,%xmm1,%xmm1
+
+ vpsrlq $26,%xmm3,%xmm13
+ vpand %xmm15,%xmm3,%xmm3
+ vpaddq %xmm13,%xmm4,%xmm4
+
+ ja .Loop_avx
+
+.Lskip_loop_avx:
+
+
+
+ vpshufd $0x10,%xmm14,%xmm14
+ addq $32,%rdx
+ jnz .Long_tail_avx
+
+ vpaddq %xmm2,%xmm7,%xmm7
+ vpaddq %xmm0,%xmm5,%xmm5
+ vpaddq %xmm1,%xmm6,%xmm6
+ vpaddq %xmm3,%xmm8,%xmm8
+ vpaddq %xmm4,%xmm9,%xmm9
+
+.Long_tail_avx:
+ vmovdqa %xmm2,32(%r11)
+ vmovdqa %xmm0,0(%r11)
+ vmovdqa %xmm1,16(%r11)
+ vmovdqa %xmm3,48(%r11)
+ vmovdqa %xmm4,64(%r11)
+
+
+
+
+
+
+
+ vpmuludq %xmm7,%xmm14,%xmm12
+ vpmuludq %xmm5,%xmm14,%xmm10
+ vpshufd $0x10,-48(%rdi),%xmm2
+ vpmuludq %xmm6,%xmm14,%xmm11
+ vpmuludq %xmm8,%xmm14,%xmm13
+ vpmuludq %xmm9,%xmm14,%xmm14
+
+ vpmuludq %xmm8,%xmm2,%xmm0
+ vpaddq %xmm0,%xmm14,%xmm14
+ vpshufd $0x10,-32(%rdi),%xmm3
+ vpmuludq %xmm7,%xmm2,%xmm1
+ vpaddq %xmm1,%xmm13,%xmm13
+ vpshufd $0x10,-16(%rdi),%xmm4
+ vpmuludq %xmm6,%xmm2,%xmm0
+ vpaddq %xmm0,%xmm12,%xmm12
+ vpmuludq %xmm5,%xmm2,%xmm2
+ vpaddq %xmm2,%xmm11,%xmm11
+ vpmuludq %xmm9,%xmm3,%xmm3
+ vpaddq %xmm3,%xmm10,%xmm10
+
+ vpshufd $0x10,0(%rdi),%xmm2
+ vpmuludq %xmm7,%xmm4,%xmm1
+ vpaddq %xmm1,%xmm14,%xmm14
+ vpmuludq %xmm6,%xmm4,%xmm0
+ vpaddq %xmm0,%xmm13,%xmm13
+ vpshufd $0x10,16(%rdi),%xmm3
+ vpmuludq %xmm5,%xmm4,%xmm4
+ vpaddq %xmm4,%xmm12,%xmm12
+ vpmuludq %xmm9,%xmm2,%xmm1
+ vpaddq %xmm1,%xmm11,%xmm11
+ vpshufd $0x10,32(%rdi),%xmm4
+ vpmuludq %xmm8,%xmm2,%xmm2
+ vpaddq %xmm2,%xmm10,%xmm10
+
+ vpmuludq %xmm6,%xmm3,%xmm0
+ vpaddq %xmm0,%xmm14,%xmm14
+ vpmuludq %xmm5,%xmm3,%xmm3
+ vpaddq %xmm3,%xmm13,%xmm13
+ vpshufd $0x10,48(%rdi),%xmm2
+ vpmuludq %xmm9,%xmm4,%xmm1
+ vpaddq %xmm1,%xmm12,%xmm12
+ vpshufd $0x10,64(%rdi),%xmm3
+ vpmuludq %xmm8,%xmm4,%xmm0
+ vpaddq %xmm0,%xmm11,%xmm11
+ vpmuludq %xmm7,%xmm4,%xmm4
+ vpaddq %xmm4,%xmm10,%xmm10
+
+ vpmuludq %xmm5,%xmm2,%xmm2
+ vpaddq %xmm2,%xmm14,%xmm14
+ vpmuludq %xmm9,%xmm3,%xmm1
+ vpaddq %xmm1,%xmm13,%xmm13
+ vpmuludq %xmm8,%xmm3,%xmm0
+ vpaddq %xmm0,%xmm12,%xmm12
+ vpmuludq %xmm7,%xmm3,%xmm1
+ vpaddq %xmm1,%xmm11,%xmm11
+ vpmuludq %xmm6,%xmm3,%xmm3
+ vpaddq %xmm3,%xmm10,%xmm10
+
+ jz .Lshort_tail_avx
+
+ vmovdqu 0(%rsi),%xmm0
+ vmovdqu 16(%rsi),%xmm1
+
+ vpsrldq $6,%xmm0,%xmm2
+ vpsrldq $6,%xmm1,%xmm3
+ vpunpckhqdq %xmm1,%xmm0,%xmm4
+ vpunpcklqdq %xmm1,%xmm0,%xmm0
+ vpunpcklqdq %xmm3,%xmm2,%xmm3
+
+ vpsrlq $40,%xmm4,%xmm4
+ vpsrlq $26,%xmm0,%xmm1
+ vpand %xmm15,%xmm0,%xmm0
+ vpsrlq $4,%xmm3,%xmm2
+ vpand %xmm15,%xmm1,%xmm1
+ vpsrlq $30,%xmm3,%xmm3
+ vpand %xmm15,%xmm2,%xmm2
+ vpand %xmm15,%xmm3,%xmm3
+ vpor 32(%rcx),%xmm4,%xmm4
+
+ vpshufd $0x32,-64(%rdi),%xmm9
+ vpaddq 0(%r11),%xmm0,%xmm0
+ vpaddq 16(%r11),%xmm1,%xmm1
+ vpaddq 32(%r11),%xmm2,%xmm2
+ vpaddq 48(%r11),%xmm3,%xmm3
+ vpaddq 64(%r11),%xmm4,%xmm4
+
+
+
+
+ vpmuludq %xmm0,%xmm9,%xmm5
+ vpaddq %xmm5,%xmm10,%xmm10
+ vpmuludq %xmm1,%xmm9,%xmm6
+ vpaddq %xmm6,%xmm11,%xmm11
+ vpmuludq %xmm2,%xmm9,%xmm5
+ vpaddq %xmm5,%xmm12,%xmm12
+ vpshufd $0x32,-48(%rdi),%xmm7
+ vpmuludq %xmm3,%xmm9,%xmm6
+ vpaddq %xmm6,%xmm13,%xmm13
+ vpmuludq %xmm4,%xmm9,%xmm9
+ vpaddq %xmm9,%xmm14,%xmm14
+
+ vpmuludq %xmm3,%xmm7,%xmm5
+ vpaddq %xmm5,%xmm14,%xmm14
+ vpshufd $0x32,-32(%rdi),%xmm8
+ vpmuludq %xmm2,%xmm7,%xmm6
+ vpaddq %xmm6,%xmm13,%xmm13
+ vpshufd $0x32,-16(%rdi),%xmm9
+ vpmuludq %xmm1,%xmm7,%xmm5
+ vpaddq %xmm5,%xmm12,%xmm12
+ vpmuludq %xmm0,%xmm7,%xmm7
+ vpaddq %xmm7,%xmm11,%xmm11
+ vpmuludq %xmm4,%xmm8,%xmm8
+ vpaddq %xmm8,%xmm10,%xmm10
+
+ vpshufd $0x32,0(%rdi),%xmm7
+ vpmuludq %xmm2,%xmm9,%xmm6
+ vpaddq %xmm6,%xmm14,%xmm14
+ vpmuludq %xmm1,%xmm9,%xmm5
+ vpaddq %xmm5,%xmm13,%xmm13
+ vpshufd $0x32,16(%rdi),%xmm8
+ vpmuludq %xmm0,%xmm9,%xmm9
+ vpaddq %xmm9,%xmm12,%xmm12
+ vpmuludq %xmm4,%xmm7,%xmm6
+ vpaddq %xmm6,%xmm11,%xmm11
+ vpshufd $0x32,32(%rdi),%xmm9
+ vpmuludq %xmm3,%xmm7,%xmm7
+ vpaddq %xmm7,%xmm10,%xmm10
+
+ vpmuludq %xmm1,%xmm8,%xmm5
+ vpaddq %xmm5,%xmm14,%xmm14
+ vpmuludq %xmm0,%xmm8,%xmm8
+ vpaddq %xmm8,%xmm13,%xmm13
+ vpshufd $0x32,48(%rdi),%xmm7
+ vpmuludq %xmm4,%xmm9,%xmm6
+ vpaddq %xmm6,%xmm12,%xmm12
+ vpshufd $0x32,64(%rdi),%xmm8
+ vpmuludq %xmm3,%xmm9,%xmm5
+ vpaddq %xmm5,%xmm11,%xmm11
+ vpmuludq %xmm2,%xmm9,%xmm9
+ vpaddq %xmm9,%xmm10,%xmm10
+
+ vpmuludq %xmm0,%xmm7,%xmm7
+ vpaddq %xmm7,%xmm14,%xmm14
+ vpmuludq %xmm4,%xmm8,%xmm6
+ vpaddq %xmm6,%xmm13,%xmm13
+ vpmuludq %xmm3,%xmm8,%xmm5
+ vpaddq %xmm5,%xmm12,%xmm12
+ vpmuludq %xmm2,%xmm8,%xmm6
+ vpaddq %xmm6,%xmm11,%xmm11
+ vpmuludq %xmm1,%xmm8,%xmm8
+ vpaddq %xmm8,%xmm10,%xmm10
+
+.Lshort_tail_avx:
+
+
+
+ vpsrldq $8,%xmm14,%xmm9
+ vpsrldq $8,%xmm13,%xmm8
+ vpsrldq $8,%xmm11,%xmm6
+ vpsrldq $8,%xmm10,%xmm5
+ vpsrldq $8,%xmm12,%xmm7
+ vpaddq %xmm8,%xmm13,%xmm13
+ vpaddq %xmm9,%xmm14,%xmm14
+ vpaddq %xmm5,%xmm10,%xmm10
+ vpaddq %xmm6,%xmm11,%xmm11
+ vpaddq %xmm7,%xmm12,%xmm12
+
+
+
+
+ vpsrlq $26,%xmm13,%xmm3
+ vpand %xmm15,%xmm13,%xmm13
+ vpaddq %xmm3,%xmm14,%xmm14
+
+ vpsrlq $26,%xmm10,%xmm0
+ vpand %xmm15,%xmm10,%xmm10
+ vpaddq %xmm0,%xmm11,%xmm11
+
+ vpsrlq $26,%xmm14,%xmm4
+ vpand %xmm15,%xmm14,%xmm14
+
+ vpsrlq $26,%xmm11,%xmm1
+ vpand %xmm15,%xmm11,%xmm11
+ vpaddq %xmm1,%xmm12,%xmm12
+
+ vpaddq %xmm4,%xmm10,%xmm10
+ vpsllq $2,%xmm4,%xmm4
+ vpaddq %xmm4,%xmm10,%xmm10
+
+ vpsrlq $26,%xmm12,%xmm2
+ vpand %xmm15,%xmm12,%xmm12
+ vpaddq %xmm2,%xmm13,%xmm13
+
+ vpsrlq $26,%xmm10,%xmm0
+ vpand %xmm15,%xmm10,%xmm10
+ vpaddq %xmm0,%xmm11,%xmm11
+
+ vpsrlq $26,%xmm13,%xmm3
+ vpand %xmm15,%xmm13,%xmm13
+ vpaddq %xmm3,%xmm14,%xmm14
+
+ vmovd %xmm10,-112(%rdi)
+ vmovd %xmm11,-108(%rdi)
+ vmovd %xmm12,-104(%rdi)
+ vmovd %xmm13,-100(%rdi)
+ vmovd %xmm14,-96(%rdi)
+ leaq 88(%r11),%rsp
+.cfi_def_cfa %rsp,8
+ vzeroupper
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size poly1305_blocks_avx,.-poly1305_blocks_avx
+
+.type poly1305_emit_avx,@function
+.align 32
+poly1305_emit_avx: /* In: rdi = state, rsi = 16-byte output, rdx = 16-byte addend. The path below uses rax, rcx, r8-r11 as scratch. */
+.cfi_startproc
+ cmpl $0,20(%rdi) /* dword flag at offset 20; zero means jump to .Lemit, which is defined outside this function */
+ je .Lemit
+
+ movl 0(%rdi),%eax /* load five 32-bit limbs l0..l4; 32-bit moves zero-extend into the 64-bit registers */
+ movl 4(%rdi),%ecx
+ movl 8(%rdi),%r8d
+ movl 12(%rdi),%r11d
+ movl 16(%rdi),%r10d
+
+ shlq $26,%rcx /* recombine as l0 + l1*2^26 + l2*2^52 + l3*2^78 + l4*2^104 into r10:r9:r8 */
+ movq %r8,%r9
+ shlq $52,%r8
+ addq %rcx,%rax
+ shrq $12,%r9 /* bits of l2 that spill past bit 63 */
+ addq %rax,%r8 /* r8 = low 64 bits */
+ adcq $0,%r9
+
+ shlq $14,%r11 /* l3*2^78 lands at bit 14 of the middle word */
+ movq %r10,%rax
+ shrq $24,%r10 /* bits of l4 that spill past bit 127 */
+ addq %r11,%r9
+ shlq $40,%rax /* l4*2^104 lands at bit 40 of the middle word */
+ addq %rax,%r9 /* r9 = middle 64 bits */
+ adcq $0,%r10 /* r10 = top word */
+
+ movq %r10,%rax /* fold everything at or above bit 130 back into the low word, multiplied by 5 */
+ movq %r10,%rcx
+ andq $3,%r10 /* keep bits 128..129 in the top word */
+ shrq $2,%rax
+ andq $-4,%rcx
+ addq %rcx,%rax /* (top >> 2) + (top with its low two bits cleared) = 5 * (top >> 2) */
+ addq %rax,%r8
+ adcq $0,%r9
+ adcq $0,%r10
+
+ movq %r8,%rax /* keep the unmodified low and middle words in rax and rcx */
+ addq $5,%r8 /* compute value + 5 across all three words */
+ movq %r9,%rcx
+ adcq $0,%r9
+ adcq $0,%r10
+ shrq $2,%r10 /* nonzero iff value + 5 reached bit 130 */
+ cmovnzq %r8,%rax /* in that case select the low 128 bits of value + 5 instead */
+ cmovnzq %r9,%rcx
+
+ addq 0(%rdx),%rax /* 128-bit add of the 16 bytes at (%rdx); the final carry is discarded */
+ adcq 8(%rdx),%rcx
+ movq %rax,0(%rsi) /* store the 16-byte result */
+ movq %rcx,8(%rsi)
+
+ .byte 0xf3,0xc3 /* encodes rep ret */
+.cfi_endproc
+.size poly1305_emit_avx,.-poly1305_emit_avx
+.type poly1305_blocks_avx2,@function
+.align 32
+poly1305_blocks_avx2: /* In: rdi = state, rsi = input, rdx = length in bytes, rcx = value added into the top word with each block (presumably the pad bit - confirm with caller). */
+.cfi_startproc
+ movl 20(%rdi),%r8d /* r8d = flag dword at offset 20; nonzero means the state is held as five 32-bit limbs */
+ cmpq $128,%rdx
+ jae .Lblocks_avx2 /* 128 bytes or more: always take the vector path */
+ testl %r8d,%r8d
+ jz .Lblocks /* short input and flag clear: jump to .Lblocks, defined outside this function */
+
+.Lblocks_avx2:
+ andq $-16,%rdx /* round the length down to a multiple of 16 */
+ jz .Lno_data_avx2
+
+ vzeroupper
+
+ testl %r8d,%r8d
+ jz .Lbase2_64_avx2 /* flag clear: state is three 64-bit words, handled by the second entry path */
+
+ testq $63,%rdx
+ jz .Leven_avx2 /* length already a multiple of 64: no scalar pre-processing needed */
+
+ pushq %rbx /* save the six callee-saved registers used below */
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lblocks_avx2_body:
+
+ movq %rdx,%r15 /* r15 = remaining length; rdx is reused as scratch below */
+
+ movq 0(%rdi),%r8 /* r8 = limbs l0 (low dword) and l1 (high dword) */
+ movq 8(%rdi),%r9 /* r9 = limbs l2 (low dword) and l3 (high dword) */
+ movl 16(%rdi),%ebp /* ebp = limb l4 */
+
+ movq 24(%rdi),%r11 /* qwords at offsets 24 and 32; presumably the key words consumed by __poly1305_block - confirm */
+ movq 32(%rdi),%r13
+
+
+ movl %r8d,%r14d /* r14 = l0; start of the 26-bit-limb to 64-bit-word conversion */
+ andq $-2147483648,%r8 /* clear the low 31 bits, leaving the l1 half */
+ movq %r9,%r12
+ movl %r9d,%ebx /* rbx = l2 */
+ andq $-2147483648,%r9 /* clear the low 31 bits, leaving the l3 half */
+
+ shrq $6,%r8 /* l1 moves from bit 32 to bit 26 */
+ shlq $52,%r12 /* low bits of l2 at bit 52 */
+ addq %r8,%r14
+ shrq $12,%rbx /* bits of l2 that spill past bit 63 */
+ shrq $18,%r9 /* l3 moves from bit 32 to bit 14 of the middle word */
+ addq %r12,%r14 /* r14 = low 64 bits */
+ adcq %r9,%rbx /* rbx = middle word so far, plus carry */
+
+ movq %rbp,%r8
+ shlq $40,%r8 /* l4 at bit 40 of the middle word */
+ shrq $24,%rbp /* bits of l4 that spill past bit 127 */
+ addq %r8,%rbx /* rbx = middle 64 bits */
+ adcq $0,%rbp /* rbp = top word */
+
+ movq $-4,%r9 /* fold bits at or above 130 back into the low word, multiplied by 5 */
+ movq %rbp,%r8
+ andq %rbp,%r9
+ shrq $2,%r8
+ andq $3,%rbp
+ addq %r9,%r8 /* (top >> 2) + (top with its low two bits cleared) = 5 * (top >> 2) */
+ addq %r8,%r14
+ adcq $0,%rbx
+ adcq $0,%rbp
+
+ movq %r13,%r12 /* r12 and rax keep the original qword from offset 32 */
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13 /* r13 = value + (value >> 2) */
+
+.Lbase2_26_pre_avx2:
+ addq 0(%rsi),%r14 /* add the next 16 input bytes into r14:rbx and rcx into the top word */
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%rbp
+ subq $16,%r15
+
+ call __poly1305_block /* helper defined outside this chunk; NOTE(review): assumed to preserve rcx, rsi, rdi, r12, r13, r15 */
+ movq %r12,%rax
+
+ testq $63,%r15
+ jnz .Lbase2_26_pre_avx2 /* repeat until the remaining length is a multiple of 64 */
+
+ testq %rcx,%rcx
+ jz .Lstore_base2_64_avx2 /* rcx zero: write the state back as three 64-bit words */
+
+
+ movq %r14,%rax /* split rbp:rbx:r14 into five 26-bit limbs */
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r11
+ movq %rbx,%r12
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax /* rax = bits 0..25 */
+ shlq $12,%r11
+ andq $0x3ffffff,%rdx /* rdx = bits 26..51 */
+ shrq $14,%rbx
+ orq %r11,%r14
+ shlq $24,%rbp
+ andq $0x3ffffff,%r14 /* r14 = bits 52..77 */
+ shrq $40,%r12
+ andq $0x3ffffff,%rbx /* rbx = bits 78..103 */
+ orq %r12,%rbp /* rbp = bits 104 and up */
+
+ testq %r15,%r15
+ jz .Lstore_base2_26_avx2 /* nothing left for the vector code: store the limbs and return */
+
+ vmovd %eax,%xmm0 /* hand the five limbs to the vector code in xmm0..xmm4 */
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %ebp,%xmm4
+ jmp .Lproceed_avx2
+
+.align 32
+.Lstore_base2_64_avx2:
+ movq %r14,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rbp,16(%rdi) /* 8-byte store: also overwrites the flag dword at offset 20 with the upper half of rbp */
+ jmp .Ldone_avx2
+
+.align 16
+.Lstore_base2_26_avx2:
+ movl %eax,0(%rdi) /* store the five limbs as dwords at offsets 0..16 */
+ movl %edx,4(%rdi)
+ movl %r14d,8(%rdi)
+ movl %ebx,12(%rdi)
+ movl %ebp,16(%rdi)
+.align 16
+.Ldone_avx2:
+ movq 0(%rsp),%r15 /* reload the saved registers in reverse push order */
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbp
+.cfi_restore %rbp
+ movq 40(%rsp),%rbx
+.cfi_restore %rbx
+ leaq 48(%rsp),%rsp /* drop the 48 bytes of saved registers */
+.cfi_adjust_cfa_offset -48
+.Lno_data_avx2:
+.Lblocks_avx2_epilogue:
+ .byte 0xf3,0xc3 /* encodes rep ret */
+.cfi_endproc
+
+.align 32
+.Lbase2_64_avx2: /* Entry path taken when the flag at 20(%rdi) is zero: state is three 64-bit words at offsets 0, 8, 16. */
+.cfi_startproc
+ pushq %rbx /* save the six callee-saved registers used below */
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lbase2_64_avx2_body:
+
+ movq %rdx,%r15 /* r15 = remaining length; rdx is reused as scratch below */
+
+ movq 24(%rdi),%r11 /* qwords at offsets 24 and 32; presumably the key words consumed by __poly1305_block - confirm */
+ movq 32(%rdi),%r13
+
+ movq 0(%rdi),%r14 /* rbp:rbx:r14 = current state, top word loaded as 32 bits */
+ movq 8(%rdi),%rbx
+ movl 16(%rdi),%ebp
+
+ movq %r13,%r12 /* r12 and rax keep the original qword from offset 32 */
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13 /* r13 = value + (value >> 2) */
+
+ testq $63,%rdx
+ jz .Linit_avx2 /* length already a multiple of 64: skip the scalar pre-processing loop */
+
+.Lbase2_64_pre_avx2:
+ addq 0(%rsi),%r14 /* add the next 16 input bytes into r14:rbx and rcx into the top word */
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%rbp
+ subq $16,%r15
+
+ call __poly1305_block /* helper defined outside this chunk; NOTE(review): assumed to preserve rcx, rsi, rdi, r12, r13, r15 */
+ movq %r12,%rax
+
+ testq $63,%r15
+ jnz .Lbase2_64_pre_avx2 /* repeat until the remaining length is a multiple of 64 */
+
+.Linit_avx2:
+
+ movq %r14,%rax /* split rbp:rbx:r14 into five 26-bit limbs */
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r8
+ movq %rbx,%r9
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax /* rax = bits 0..25 */
+ shlq $12,%r8
+ andq $0x3ffffff,%rdx /* rdx = bits 26..51 */
+ shrq $14,%rbx
+ orq %r8,%r14
+ shlq $24,%rbp
+ andq $0x3ffffff,%r14 /* r14 = bits 52..77 */
+ shrq $40,%r9
+ andq $0x3ffffff,%rbx /* rbx = bits 78..103 */
+ orq %r9,%rbp /* rbp = bits 104 and up */
+
+ vmovd %eax,%xmm0 /* hand the five limbs to the vector code in xmm0..xmm4 */
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %ebp,%xmm4
+ movl $1,20(%rdi) /* set the flag: state is now kept as 26-bit limbs */
+
+ call __poly1305_init_avx /* helper defined outside this chunk; presumably fills the table read at .Ldo_avx2 - confirm */
+
+.Lproceed_avx2:
+ movq %r15,%rdx /* rdx = remaining length again */
+ movl OPENSSL_ia32cap_P+8(%rip),%r10d /* r10d is not read by any instruction visible in this function */
+ movl $3221291008,%r11d /* 0xC0010000 (bits 31, 30, 16); r11 is overwritten at .Ldo_avx2 before any read */
+
+ movq 0(%rsp),%r15 /* reload the saved registers in reverse push order */
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbp
+.cfi_restore %rbp
+ movq 40(%rsp),%rbx
+.cfi_restore %rbx
+ leaq 48(%rsp),%rax
+ leaq 48(%rsp),%rsp /* drop the 48 bytes of saved registers */
+.cfi_adjust_cfa_offset -48
+.Lbase2_64_avx2_epilogue:
+ jmp .Ldo_avx2 /* continue in the vector code with the limbs already in xmm0..xmm4 */
+.cfi_endproc
+
+.align 32
+.Leven_avx2: /* Entry path taken when the state is already five 26-bit limbs and the length is a multiple of 64. */
+.cfi_startproc
+ movl OPENSSL_ia32cap_P+8(%rip),%r10d /* r10d is not read by any instruction visible in this function */
+ vmovd 0(%rdi),%xmm0 /* load the five state limbs into xmm0..xmm4 */
+ vmovd 4(%rdi),%xmm1
+ vmovd 8(%rdi),%xmm2
+ vmovd 12(%rdi),%xmm3
+ vmovd 16(%rdi),%xmm4
+
+.Ldo_avx2:
+ leaq -8(%rsp),%r11 /* r11 anchors the frame: the return path restores rsp as 8(%r11) */
+.cfi_def_cfa %r11,16
+ subq $0x128,%rsp /* reserve stack for the nine 32-byte table slots */
+ leaq .Lconst(%rip),%rcx /* rcx = base of the constant table */
+ leaq 48+64(%rdi),%rdi /* bias rdi by 112 so table entries are addressed as -64..64(%rdi) */
+ vmovdqa 96(%rcx),%ymm7 /* ymm7 = vpermd indices at .Lconst+96 */
+
+
+ vmovdqu -64(%rdi),%xmm9 /* load nine 16-byte table entries; presumably precomputed key powers - confirm */
+ andq $-512,%rsp /* align the frame down to 512 bytes */
+ vmovdqu -48(%rdi),%xmm10
+ vmovdqu -32(%rdi),%xmm6
+ vmovdqu -16(%rdi),%xmm11
+ vmovdqu 0(%rdi),%xmm12
+ vmovdqu 16(%rdi),%xmm13
+ leaq 144(%rsp),%rax /* rax = rsp+144, so slot k sits at 32*k-144(%rax) */
+ vmovdqu 32(%rdi),%xmm14
+ vpermd %ymm9,%ymm7,%ymm9 /* spread each entry across a 32-byte vector using the index pattern */
+ vmovdqu 48(%rdi),%xmm15
+ vpermd %ymm10,%ymm7,%ymm10
+ vmovdqu 64(%rdi),%xmm5
+ vpermd %ymm6,%ymm7,%ymm6
+ vmovdqa %ymm9,0(%rsp) /* slot 0 */
+ vpermd %ymm11,%ymm7,%ymm11
+ vmovdqa %ymm10,32-144(%rax) /* slot 1 */
+ vpermd %ymm12,%ymm7,%ymm12
+ vmovdqa %ymm6,64-144(%rax) /* slot 2 */
+ vpermd %ymm13,%ymm7,%ymm13
+ vmovdqa %ymm11,96-144(%rax) /* slot 3 */
+ vpermd %ymm14,%ymm7,%ymm14
+ vmovdqa %ymm12,128-144(%rax) /* slot 4 */
+ vpermd %ymm15,%ymm7,%ymm15
+ vmovdqa %ymm13,160-144(%rax) /* slot 5 */
+ vpermd %ymm5,%ymm7,%ymm5
+ vmovdqa %ymm14,192-144(%rax) /* slot 6 */
+ vmovdqa %ymm15,224-144(%rax) /* slot 7 */
+ vmovdqa %ymm5,256-144(%rax) /* slot 8 */
+ vmovdqa 64(%rcx),%ymm5 /* ymm5 = 26-bit mask (0x3ffffff in each qword) */
+
+
+
+ vmovdqu 0(%rsi),%xmm7 /* load 64 input bytes: bytes 0-15 and 16-31 in the low lanes */
+ vmovdqu 16(%rsi),%xmm8
+ vinserti128 $1,32(%rsi),%ymm7,%ymm7 /* bytes 32-47 and 48-63 in the high lanes */
+ vinserti128 $1,48(%rsi),%ymm8,%ymm8
+ leaq 64(%rsi),%rsi
+
+ vpsrldq $6,%ymm7,%ymm9 /* byte-shift by 6 so bits 48..111 of each block land in a low qword */
+ vpsrldq $6,%ymm8,%ymm10
+ vpunpckhqdq %ymm8,%ymm7,%ymm6 /* ymm6 = high qwords of the four blocks */
+ vpunpcklqdq %ymm10,%ymm9,%ymm9 /* ymm9 = bits 48..111 of the four blocks */
+ vpunpcklqdq %ymm8,%ymm7,%ymm7 /* ymm7 = low qwords of the four blocks */
+
+ vpsrlq $30,%ymm9,%ymm10 /* limb 3: block bits 78..103 after masking */
+ vpsrlq $4,%ymm9,%ymm9 /* limb 2: block bits 52..77 after masking */
+ vpsrlq $26,%ymm7,%ymm8 /* limb 1: block bits 26..51 after masking */
+ vpsrlq $40,%ymm6,%ymm6 /* limb 4: block bits 104..127 */
+ vpand %ymm5,%ymm9,%ymm9
+ vpand %ymm5,%ymm7,%ymm7 /* limb 0: block bits 0..25 */
+ vpand %ymm5,%ymm8,%ymm8
+ vpand %ymm5,%ymm10,%ymm10
+ vpor 32(%rcx),%ymm6,%ymm6 /* OR in 2^24 from .Lconst+32, i.e. bit 128 of every block, unconditionally */
+
+ vpaddq %ymm2,%ymm9,%ymm2 /* add input limb 2 to the accumulator now; the other limbs are added at the top of the loop or tail */
+ subq $64,%rdx
+ jz .Ltail_avx2 /* exactly 64 bytes: go straight to the final multiplication */
+ jmp .Loop_avx2
+
+.align 32
+.Loop_avx2: /* One iteration: multiply the five accumulator limbs (ymm0..ymm4) by table values while parsing the next 64 input bytes. */
+
+
+
+
+
+
+
+
+ vpaddq %ymm0,%ymm7,%ymm0 /* add pending input limbs 0, 1, 3, 4 to the accumulator; limb 2 was added earlier */
+ vmovdqa 0(%rsp),%ymm7 /* table slot 0 */
+ vpaddq %ymm1,%ymm8,%ymm1
+ vmovdqa 32(%rsp),%ymm8 /* table slot 1 */
+ vpaddq %ymm3,%ymm10,%ymm3
+ vmovdqa 96(%rsp),%ymm9 /* table slot 3 */
+ vpaddq %ymm4,%ymm6,%ymm4
+ vmovdqa 48(%rax),%ymm10 /* table slot 6 (rax = rsp+144) */
+ vmovdqa 112(%rax),%ymm5 /* table slot 8; ymm5 stops being the mask until it is reloaded below */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ vpmuludq %ymm2,%ymm7,%ymm13 /* partial products accumulate in ymm11..ymm15 */
+ vpmuludq %ymm2,%ymm8,%ymm14
+ vpmuludq %ymm2,%ymm9,%ymm15
+ vpmuludq %ymm2,%ymm10,%ymm11
+ vpmuludq %ymm2,%ymm5,%ymm12
+
+ vpmuludq %ymm0,%ymm8,%ymm6
+ vpmuludq %ymm1,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq 64(%rsp),%ymm4,%ymm2 /* table slot 2 used directly from the stack */
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm11,%ymm11
+ vmovdqa -16(%rax),%ymm8 /* table slot 4 */
+
+ vpmuludq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm1,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vpmuludq %ymm3,%ymm7,%ymm6
+ vpmuludq %ymm4,%ymm7,%ymm2
+ vmovdqu 0(%rsi),%xmm7 /* start loading the next 64 input bytes, interleaved with the multiplications */
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm2,%ymm15,%ymm15
+ vinserti128 $1,32(%rsi),%ymm7,%ymm7
+
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq %ymm4,%ymm8,%ymm2
+ vmovdqu 16(%rsi),%xmm8
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vmovdqa 16(%rax),%ymm2 /* table slot 5 */
+ vpmuludq %ymm1,%ymm9,%ymm6
+ vpmuludq %ymm0,%ymm9,%ymm9
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm9,%ymm13,%ymm13
+ vinserti128 $1,48(%rsi),%ymm8,%ymm8
+ leaq 64(%rsi),%rsi
+
+ vpmuludq %ymm1,%ymm2,%ymm6
+ vpmuludq %ymm0,%ymm2,%ymm2
+ vpsrldq $6,%ymm7,%ymm9 /* begin splitting the new input into limbs */
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm14,%ymm14
+ vpmuludq %ymm3,%ymm10,%ymm6
+ vpmuludq %ymm4,%ymm10,%ymm2
+ vpsrldq $6,%ymm8,%ymm10
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpunpckhqdq %ymm8,%ymm7,%ymm6 /* ymm6 = high qwords of the new blocks */
+
+ vpmuludq %ymm3,%ymm5,%ymm3
+ vpmuludq %ymm4,%ymm5,%ymm4
+ vpunpcklqdq %ymm8,%ymm7,%ymm7 /* ymm7 = low qwords of the new blocks */
+ vpaddq %ymm3,%ymm13,%ymm2 /* product limb 2 */
+ vpaddq %ymm4,%ymm14,%ymm3 /* product limb 3 */
+ vpunpcklqdq %ymm10,%ymm9,%ymm10 /* ymm10 = bits 48..111 of the new blocks */
+ vpmuludq 80(%rax),%ymm0,%ymm4 /* table slot 7 used directly from the stack */
+ vpmuludq %ymm1,%ymm5,%ymm0
+ vmovdqa 64(%rcx),%ymm5 /* reload the 26-bit mask */
+ vpaddq %ymm4,%ymm15,%ymm4 /* product limb 4 */
+ vpaddq %ymm0,%ymm11,%ymm0 /* product limb 0 */
+
+
+
+
+ vpsrlq $26,%ymm3,%ymm14 /* carry propagation: limb 3 into limb 4 */
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm0,%ymm11 /* limb 0 into limb 1 (ymm12 holds the unreduced limb 1) */
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm12,%ymm1
+
+ vpsrlq $26,%ymm4,%ymm15 /* carry out of limb 4, folded into limb 0 below */
+ vpand %ymm5,%ymm4,%ymm4
+
+ vpsrlq $4,%ymm10,%ymm9 /* new input limb 2, before masking */
+
+ vpsrlq $26,%ymm1,%ymm12 /* limb 1 into limb 2 */
+ vpand %ymm5,%ymm1,%ymm1
+ vpaddq %ymm12,%ymm2,%ymm2
+
+ vpaddq %ymm15,%ymm0,%ymm0 /* limb 0 += carry + 4*carry, i.e. 5 times the carry out of limb 4 */
+ vpsllq $2,%ymm15,%ymm15
+ vpaddq %ymm15,%ymm0,%ymm0
+
+ vpand %ymm5,%ymm9,%ymm9
+ vpsrlq $26,%ymm7,%ymm8 /* new input limb 1, before masking */
+
+ vpsrlq $26,%ymm2,%ymm13 /* limb 2 into limb 3 */
+ vpand %ymm5,%ymm2,%ymm2
+ vpaddq %ymm13,%ymm3,%ymm3
+
+ vpaddq %ymm9,%ymm2,%ymm2 /* add new input limb 2 to the accumulator ahead of the next iteration */
+ vpsrlq $30,%ymm10,%ymm10 /* new input limb 3, before masking */
+
+ vpsrlq $26,%ymm0,%ymm11 /* limb 0 into limb 1 again */
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $40,%ymm6,%ymm6 /* new input limb 4 */
+
+ vpsrlq $26,%ymm3,%ymm14 /* limb 3 into limb 4 again */
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpand %ymm5,%ymm7,%ymm7 /* mask new input limbs 0, 1, 3 */
+ vpand %ymm5,%ymm8,%ymm8
+ vpand %ymm5,%ymm10,%ymm10
+ vpor 32(%rcx),%ymm6,%ymm6 /* OR in 2^24 from .Lconst+32, i.e. bit 128 of every block */
+
+ subq $64,%rdx
+ jnz .Loop_avx2 /* loop while input remains; falls through to the tail when rdx reaches zero */
+
+.byte 0x66,0x90
+.Ltail_avx2: /* Final multiplication, horizontal sum of the four lanes, carry propagation, state store and return. */
+
+
+
+
+
+
+
+ vpaddq %ymm0,%ymm7,%ymm0 /* add pending input limbs 0, 1, 3, 4 to the accumulator; limb 2 was added earlier */
+ vmovdqu 4(%rsp),%ymm7 /* same table slots as the loop, read 4 bytes further in with unaligned loads; presumably selects a different key power per lane - confirm */
+ vpaddq %ymm1,%ymm8,%ymm1
+ vmovdqu 36(%rsp),%ymm8
+ vpaddq %ymm3,%ymm10,%ymm3
+ vmovdqu 100(%rsp),%ymm9
+ vpaddq %ymm4,%ymm6,%ymm4
+ vmovdqu 52(%rax),%ymm10
+ vmovdqu 116(%rax),%ymm5 /* ymm5 stops being the mask until it is reloaded below */
+
+ vpmuludq %ymm2,%ymm7,%ymm13 /* partial products accumulate in ymm11..ymm15, as in the loop */
+ vpmuludq %ymm2,%ymm8,%ymm14
+ vpmuludq %ymm2,%ymm9,%ymm15
+ vpmuludq %ymm2,%ymm10,%ymm11
+ vpmuludq %ymm2,%ymm5,%ymm12
+
+ vpmuludq %ymm0,%ymm8,%ymm6
+ vpmuludq %ymm1,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq 68(%rsp),%ymm4,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm11,%ymm11
+
+ vpmuludq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm1,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vmovdqu -12(%rax),%ymm8
+ vpaddq %ymm2,%ymm12,%ymm12
+ vpmuludq %ymm3,%ymm7,%ymm6
+ vpmuludq %ymm4,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm2,%ymm15,%ymm15
+
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq %ymm4,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vmovdqu 20(%rax),%ymm2
+ vpmuludq %ymm1,%ymm9,%ymm6
+ vpmuludq %ymm0,%ymm9,%ymm9
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm9,%ymm13,%ymm13
+
+ vpmuludq %ymm1,%ymm2,%ymm6
+ vpmuludq %ymm0,%ymm2,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm14,%ymm14
+ vpmuludq %ymm3,%ymm10,%ymm6
+ vpmuludq %ymm4,%ymm10,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+
+ vpmuludq %ymm3,%ymm5,%ymm3
+ vpmuludq %ymm4,%ymm5,%ymm4
+ vpaddq %ymm3,%ymm13,%ymm2 /* product limb 2 */
+ vpaddq %ymm4,%ymm14,%ymm3 /* product limb 3 */
+ vpmuludq 84(%rax),%ymm0,%ymm4
+ vpmuludq %ymm1,%ymm5,%ymm0
+ vmovdqa 64(%rcx),%ymm5 /* reload the 26-bit mask */
+ vpaddq %ymm4,%ymm15,%ymm4 /* product limb 4 */
+ vpaddq %ymm0,%ymm11,%ymm0 /* product limb 0; limb 1 stays in ymm12 */
+
+
+
+
+ vpsrldq $8,%ymm12,%ymm8 /* horizontal add, step 1: add the odd qword of each 128-bit lane to the even one */
+ vpsrldq $8,%ymm2,%ymm9
+ vpsrldq $8,%ymm3,%ymm10
+ vpsrldq $8,%ymm4,%ymm6
+ vpsrldq $8,%ymm0,%ymm7
+ vpaddq %ymm8,%ymm12,%ymm12
+ vpaddq %ymm9,%ymm2,%ymm2
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpaddq %ymm7,%ymm0,%ymm0
+
+ vpermq $0x2,%ymm3,%ymm10 /* step 2: bring qword 2 down to qword 0 and add, so qword 0 holds the sum of all four */
+ vpermq $0x2,%ymm4,%ymm6
+ vpermq $0x2,%ymm0,%ymm7
+ vpermq $0x2,%ymm12,%ymm8
+ vpermq $0x2,%ymm2,%ymm9
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpaddq %ymm7,%ymm0,%ymm0
+ vpaddq %ymm8,%ymm12,%ymm12
+ vpaddq %ymm9,%ymm2,%ymm2
+
+
+
+
+ vpsrlq $26,%ymm3,%ymm14 /* carry propagation: limb 3 into limb 4 */
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm0,%ymm11 /* limb 0 into limb 1 (ymm12 holds the unreduced limb 1) */
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm12,%ymm1
+
+ vpsrlq $26,%ymm4,%ymm15 /* carry out of limb 4, folded into limb 0 below */
+ vpand %ymm5,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm1,%ymm12 /* limb 1 into limb 2 */
+ vpand %ymm5,%ymm1,%ymm1
+ vpaddq %ymm12,%ymm2,%ymm2
+
+ vpaddq %ymm15,%ymm0,%ymm0 /* limb 0 += carry + 4*carry, i.e. 5 times the carry out of limb 4 */
+ vpsllq $2,%ymm15,%ymm15
+ vpaddq %ymm15,%ymm0,%ymm0
+
+ vpsrlq $26,%ymm2,%ymm13 /* limb 2 into limb 3 */
+ vpand %ymm5,%ymm2,%ymm2
+ vpaddq %ymm13,%ymm3,%ymm3
+
+ vpsrlq $26,%ymm0,%ymm11 /* limb 0 into limb 1 again */
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $26,%ymm3,%ymm14 /* limb 3 into limb 4 again */
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vmovd %xmm0,-112(%rdi) /* store the low dword of each limb; rdi was biased by 112, so these are state offsets 0..16 */
+ vmovd %xmm1,-108(%rdi)
+ vmovd %xmm2,-104(%rdi)
+ vmovd %xmm3,-100(%rdi)
+ vmovd %xmm4,-96(%rdi)
+ leaq 8(%r11),%rsp /* r11 was set to the pre-frame rsp minus 8 at .Ldo_avx2, so this restores that rsp */
+.cfi_def_cfa %rsp,8
+ vzeroupper
+ .byte 0xf3,0xc3 /* encodes rep ret */
+.cfi_endproc
+.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
+.align 64
+.Lconst: /* Base of the constant table; code addresses entries as byte offsets from this label. */
+.Lmask24: /* offset 0: 0x0ffffff (2^24 - 1) in each qword */
+.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
+.L129: /* offset 32: 16777216 = 2^24 in each qword; ORed into the top input limb, which starts at block bit 104, so it sets bit 128 */
+.long 16777216,0,16777216,0,16777216,0,16777216,0
+.Lmask26: /* offset 64: 0x3ffffff (2^26 - 1) in each qword; the 26-bit limb mask */
+.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
+.Lpermd_avx2: /* offset 96: dword indices loaded for vpermd at .Ldo_avx2 */
+.long 2,2,2,3,2,0,2,1
+.Lpermd_avx512: /* offset 128: sixteen dword indices; not referenced in this part of the file */
+.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
+
+.L2_44_inp_permd: /* the .L2_44_* entries are not referenced in this part of the file; presumably for a base 2^44 code path - confirm */
+.long 0,1,1,2,2,3,7,7
+.L2_44_inp_shift:
+.quad 0,12,24,64
+.L2_44_mask: /* 2^44 - 1, 2^44 - 1, 2^42 - 1, all ones */
+.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
+.L2_44_shift_rgt:
+.quad 44,44,42,64
+.L2_44_shift_lft:
+.quad 8,8,10,64
+
+.align 64
+.Lx_mask44: /* 2^44 - 1 in each of eight qwords */
+.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.Lx_mask42: /* 2^42 - 1 in each of eight qwords */
+.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 16
.globl xor128_encrypt_n_pad
diff --git a/secure/lib/libcrypto/amd64/rsaz-avx2.S b/secure/lib/libcrypto/amd64/rsaz-avx2.S
index e957915a7d81..3075a52d2eec 100644
--- a/secure/lib/libcrypto/amd64/rsaz-avx2.S
+++ b/secure/lib/libcrypto/amd64/rsaz-avx2.S
@@ -2,26 +2,1745 @@
/* Do not modify. This file is auto-generated from rsaz-avx2.pl. */
.text
-.globl rsaz_avx2_eligible
-.type rsaz_avx2_eligible,@function
-rsaz_avx2_eligible:
- xorl %eax,%eax
- .byte 0xf3,0xc3
-.size rsaz_avx2_eligible,.-rsaz_avx2_eligible
-
.globl rsaz_1024_sqr_avx2
-.globl rsaz_1024_mul_avx2
-.globl rsaz_1024_norm2red_avx2
-.globl rsaz_1024_red2norm_avx2
-.globl rsaz_1024_scatter5_avx2
-.globl rsaz_1024_gather5_avx2
.type rsaz_1024_sqr_avx2,@function
+.align 64
 rsaz_1024_sqr_avx2:
+.cfi_startproc /* Register use visible below: rdi = output buffer, rsi = input, rdx = table of multiplicands used in the reduction (copied to r13), ecx = 32-bit factor for the per-step multiplier, r8d = number of outer passes. NOTE(review): presumably repeated modular (Montgomery-style) squaring on 29-bit limbs - confirm against the generator script. */
+ leaq (%rsp),%rax /* rax = stack pointer at entry; the epilogue restores registers relative to it */
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ vzeroupper
+ movq %rax,%rbp /* rbp keeps the entry stack pointer for the whole body */
+.cfi_def_cfa_register %rbp
+ movq %rdx,%r13
+ subq $832,%rsp /* reserve 832 bytes of scratch */
+ movq %r13,%r15
+ subq $-128,%rdi /* rdi += 128; all later accesses use N-128 displacements */
+ subq $-128,%rsi /* rsi += 128, same biasing */
+ subq $-128,%r13 /* r13 += 128, same biasing */
+
+ andq $4095,%r15
+ addq $320,%r15
+ shrq $12,%r15 /* r15 = ((rdx & 4095) + 320) >> 12; ZF set when the 320 bytes at rdx stay inside one 4096-byte window */
+ vpxor %ymm9,%ymm9,%ymm9 /* ymm9 = 0; does not modify the flags tested by the jz below */
+ jz .Lsqr_1024_no_n_copy
+
+
+
+
+
+ subq $320,%rsp /* window crossed: make room for a private copy of the table */
+ vmovdqu 0-128(%r13),%ymm0
+ andq $-2048,%rsp /* align the stack pointer down to 2048 bytes */
+ vmovdqu 32-128(%r13),%ymm1
+ vmovdqu 64-128(%r13),%ymm2
+ vmovdqu 96-128(%r13),%ymm3
+ vmovdqu 128-128(%r13),%ymm4
+ vmovdqu 160-128(%r13),%ymm5
+ vmovdqu 192-128(%r13),%ymm6
+ vmovdqu 224-128(%r13),%ymm7
+ vmovdqu 256-128(%r13),%ymm8
+ leaq 832+128(%rsp),%r13 /* r13 now points (with the same +128 bias) at the stack copy */
+ vmovdqu %ymm0,0-128(%r13)
+ vmovdqu %ymm1,32-128(%r13)
+ vmovdqu %ymm2,64-128(%r13)
+ vmovdqu %ymm3,96-128(%r13)
+ vmovdqu %ymm4,128-128(%r13)
+ vmovdqu %ymm5,160-128(%r13)
+ vmovdqu %ymm6,192-128(%r13)
+ vmovdqu %ymm7,224-128(%r13)
+ vmovdqu %ymm8,256-128(%r13)
+ vmovdqu %ymm9,288-128(%r13) /* zero the 32 bytes following the 288 copied bytes */
+
+.Lsqr_1024_no_n_copy:
+ andq $-1024,%rsp /* align scratch to 1024 bytes; the vmovdqa stores below rely on 32-byte alignment */
+
+ vmovdqu 32-128(%rsi),%ymm1
+ vmovdqu 64-128(%rsi),%ymm2
+ vmovdqu 96-128(%rsi),%ymm3
+ vmovdqu 128-128(%rsi),%ymm4
+ vmovdqu 160-128(%rsi),%ymm5
+ vmovdqu 192-128(%rsi),%ymm6
+ vmovdqu 224-128(%rsi),%ymm7
+ vmovdqu 256-128(%rsi),%ymm8
+
+ leaq 192(%rsp),%rbx /* rbx = rsp+192; accumulator accesses use N-192 displacements */
+ vmovdqu .Land_mask(%rip),%ymm15 /* ymm15 = mask from .Land_mask (defined outside this block); NOTE(review): presumably 2^29-1 per lane, matching the $29 shifts - confirm */
+ jmp .LOOP_GRANDE_SQR_1024
+
+.align 32
+.LOOP_GRANDE_SQR_1024: /* outer loop: one pass per unit of r8d; each pass reads from rsi and writes to rdi */
+ leaq 576+128(%rsp),%r9 /* r9 = biased pointer to the scratch holding the doubled input vectors (rsp+576) */
+ leaq 448(%rsp),%r12 /* r12 = rsp+448; upper accumulator accesses use N-448 displacements */
+
+
+
+
+ vpaddq %ymm1,%ymm1,%ymm1 /* double input vectors 1..8 and park them at r9 */
+ vpbroadcastq 0-128(%rsi),%ymm10
+ vpaddq %ymm2,%ymm2,%ymm2
+ vmovdqa %ymm1,0-128(%r9)
+ vpaddq %ymm3,%ymm3,%ymm3
+ vmovdqa %ymm2,32-128(%r9)
+ vpaddq %ymm4,%ymm4,%ymm4
+ vmovdqa %ymm3,64-128(%r9)
+ vpaddq %ymm5,%ymm5,%ymm5
+ vmovdqa %ymm4,96-128(%r9)
+ vpaddq %ymm6,%ymm6,%ymm6
+ vmovdqa %ymm5,128-128(%r9)
+ vpaddq %ymm7,%ymm7,%ymm7
+ vmovdqa %ymm6,160-128(%r9)
+ vpaddq %ymm8,%ymm8,%ymm8
+ vmovdqa %ymm7,192-128(%r9)
+ vpxor %ymm9,%ymm9,%ymm9
+ vmovdqa %ymm8,224-128(%r9)
+
+ vpmuludq 0-128(%rsi),%ymm10,%ymm0
+ vpbroadcastq 32-128(%rsi),%ymm11
+ vmovdqu %ymm9,288-192(%rbx) /* this and the following ymm9 stores zero rsp+288 .. rsp+575 */
+ vpmuludq %ymm10,%ymm1,%ymm1
+ vmovdqu %ymm9,320-448(%r12)
+ vpmuludq %ymm10,%ymm2,%ymm2
+ vmovdqu %ymm9,352-448(%r12)
+ vpmuludq %ymm10,%ymm3,%ymm3
+ vmovdqu %ymm9,384-448(%r12)
+ vpmuludq %ymm10,%ymm4,%ymm4
+ vmovdqu %ymm9,416-448(%r12)
+ vpmuludq %ymm10,%ymm5,%ymm5
+ vmovdqu %ymm9,448-448(%r12)
+ vpmuludq %ymm10,%ymm6,%ymm6
+ vmovdqu %ymm9,480-448(%r12)
+ vpmuludq %ymm10,%ymm7,%ymm7
+ vmovdqu %ymm9,512-448(%r12)
+ vpmuludq %ymm10,%ymm8,%ymm8
+ vpbroadcastq 64-128(%rsi),%ymm10
+ vmovdqu %ymm9,544-448(%r12)
+
+ movq %rsi,%r15 /* r15 walks the input 8 bytes per iteration for the broadcasts */
+ movl $4,%r14d /* inner loop runs 4 times */
+ jmp .Lsqr_entry_1024 /* first iteration enters mid-loop: ymm0..ymm8 were already computed above */
+.align 32
+.LOOP_SQR_1024:
+ vpbroadcastq 32-128(%r15),%ymm11
+ vpmuludq 0-128(%rsi),%ymm10,%ymm0
+ vpaddq 0-192(%rbx),%ymm0,%ymm0
+ vpmuludq 0-128(%r9),%ymm10,%ymm1
+ vpaddq 32-192(%rbx),%ymm1,%ymm1
+ vpmuludq 32-128(%r9),%ymm10,%ymm2
+ vpaddq 64-192(%rbx),%ymm2,%ymm2
+ vpmuludq 64-128(%r9),%ymm10,%ymm3
+ vpaddq 96-192(%rbx),%ymm3,%ymm3
+ vpmuludq 96-128(%r9),%ymm10,%ymm4
+ vpaddq 128-192(%rbx),%ymm4,%ymm4
+ vpmuludq 128-128(%r9),%ymm10,%ymm5
+ vpaddq 160-192(%rbx),%ymm5,%ymm5
+ vpmuludq 160-128(%r9),%ymm10,%ymm6
+ vpaddq 192-192(%rbx),%ymm6,%ymm6
+ vpmuludq 192-128(%r9),%ymm10,%ymm7
+ vpaddq 224-192(%rbx),%ymm7,%ymm7
+ vpmuludq 224-128(%r9),%ymm10,%ymm8
+ vpbroadcastq 64-128(%r15),%ymm10
+ vpaddq 256-192(%rbx),%ymm8,%ymm8
+.Lsqr_entry_1024:
+ vmovdqu %ymm0,0-192(%rbx)
+ vmovdqu %ymm1,32-192(%rbx)
+
+ vpmuludq 32-128(%rsi),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq 32-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm3,%ymm3
+ vpmuludq 64-128(%r9),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm4,%ymm4
+ vpmuludq 96-128(%r9),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq 128-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm6,%ymm6
+ vpmuludq 160-128(%r9),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm7,%ymm7
+ vpmuludq 192-128(%r9),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq 224-128(%r9),%ymm11,%ymm0
+ vpbroadcastq 96-128(%r15),%ymm11
+ vpaddq 288-192(%rbx),%ymm0,%ymm0
+
+ vmovdqu %ymm2,64-192(%rbx)
+ vmovdqu %ymm3,96-192(%rbx)
+
+ vpmuludq 64-128(%rsi),%ymm10,%ymm13
+ vpaddq %ymm13,%ymm4,%ymm4
+ vpmuludq 64-128(%r9),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq 96-128(%r9),%ymm10,%ymm14
+ vpaddq %ymm14,%ymm6,%ymm6
+ vpmuludq 128-128(%r9),%ymm10,%ymm13
+ vpaddq %ymm13,%ymm7,%ymm7
+ vpmuludq 160-128(%r9),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq 192-128(%r9),%ymm10,%ymm14
+ vpaddq %ymm14,%ymm0,%ymm0
+ vpmuludq 224-128(%r9),%ymm10,%ymm1
+ vpbroadcastq 128-128(%r15),%ymm10
+ vpaddq 320-448(%r12),%ymm1,%ymm1
+
+ vmovdqu %ymm4,128-192(%rbx)
+ vmovdqu %ymm5,160-192(%rbx)
+
+ vpmuludq 96-128(%rsi),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm6,%ymm6
+ vpmuludq 96-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm7,%ymm7
+ vpmuludq 128-128(%r9),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm8,%ymm8
+ vpmuludq 160-128(%r9),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm0,%ymm0
+ vpmuludq 192-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm1,%ymm1
+ vpmuludq 224-128(%r9),%ymm11,%ymm2
+ vpbroadcastq 160-128(%r15),%ymm11
+ vpaddq 352-448(%r12),%ymm2,%ymm2
+
+ vmovdqu %ymm6,192-192(%rbx)
+ vmovdqu %ymm7,224-192(%rbx)
+
+ vpmuludq 128-128(%rsi),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq 128-128(%r9),%ymm10,%ymm14
+ vpaddq %ymm14,%ymm0,%ymm0
+ vpmuludq 160-128(%r9),%ymm10,%ymm13
+ vpaddq %ymm13,%ymm1,%ymm1
+ vpmuludq 192-128(%r9),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq 224-128(%r9),%ymm10,%ymm3
+ vpbroadcastq 192-128(%r15),%ymm10
+ vpaddq 384-448(%r12),%ymm3,%ymm3
+
+ vmovdqu %ymm8,256-192(%rbx)
+ vmovdqu %ymm0,288-192(%rbx)
+ leaq 8(%rbx),%rbx /* slide the lower accumulator window by one qword */
+
+ vpmuludq 160-128(%rsi),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm1,%ymm1
+ vpmuludq 160-128(%r9),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq 192-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm3,%ymm3
+ vpmuludq 224-128(%r9),%ymm11,%ymm4
+ vpbroadcastq 224-128(%r15),%ymm11
+ vpaddq 416-448(%r12),%ymm4,%ymm4
+
+ vmovdqu %ymm1,320-448(%r12)
+ vmovdqu %ymm2,352-448(%r12)
+
+ vpmuludq 192-128(%rsi),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm3,%ymm3
+ vpmuludq 192-128(%r9),%ymm10,%ymm14
+ vpbroadcastq 256-128(%r15),%ymm0
+ vpaddq %ymm14,%ymm4,%ymm4
+ vpmuludq 224-128(%r9),%ymm10,%ymm5
+ vpbroadcastq 0+8-128(%r15),%ymm10 /* preload the broadcast for the next iteration (next qword of the input) */
+ vpaddq 448-448(%r12),%ymm5,%ymm5
+
+ vmovdqu %ymm3,384-448(%r12)
+ vmovdqu %ymm4,416-448(%r12)
+ leaq 8(%r15),%r15 /* advance the broadcast source by one qword */
+
+ vpmuludq 224-128(%rsi),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq 224-128(%r9),%ymm11,%ymm6
+ vpaddq 480-448(%r12),%ymm6,%ymm6
+
+ vpmuludq 256-128(%rsi),%ymm0,%ymm7
+ vmovdqu %ymm5,448-448(%r12)
+ vpaddq 512-448(%r12),%ymm7,%ymm7
+ vmovdqu %ymm6,480-448(%r12)
+ vmovdqu %ymm7,512-448(%r12)
+ leaq 8(%r12),%r12 /* slide the upper accumulator window by one qword */
+
+ decl %r14d
+ jnz .LOOP_SQR_1024
+
+ vmovdqu 256(%rsp),%ymm8 /* reload accumulator vectors at rsp+256/288/320 for a carry pass */
+ vmovdqu 288(%rsp),%ymm1
+ vmovdqu 320(%rsp),%ymm2
+ leaq 192(%rsp),%rbx /* reset rbx to rsp+192 after the four 8-byte steps */
+
+ vpsrlq $29,%ymm8,%ymm14 /* carries = bits 29 and up of each lane */
+ vpand %ymm15,%ymm8,%ymm8 /* keep the bits selected by the ymm15 mask */
+ vpsrlq $29,%ymm1,%ymm11
+ vpand %ymm15,%ymm1,%ymm1
+
+ vpermq $0x93,%ymm14,%ymm14 /* imm 0x93 rotates qwords up one lane: dst = {src3,src0,src1,src2} */
+ vpxor %ymm9,%ymm9,%ymm9
+ vpermq $0x93,%ymm11,%ymm11
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10 /* low qword from ymm9 (zero), upper three from the rotated carries */
+ vpblendd $3,%ymm14,%ymm11,%ymm14 /* low qword = carry wrapped out of the previous vector */
+ vpaddq %ymm10,%ymm8,%ymm8
+ vpblendd $3,%ymm11,%ymm9,%ymm11
+ vpaddq %ymm14,%ymm1,%ymm1
+ vpaddq %ymm11,%ymm2,%ymm2
+ vmovdqu %ymm1,288-192(%rbx)
+ vmovdqu %ymm2,320-192(%rbx)
+
+ movq (%rsp),%rax /* lowest four accumulator qwords go to scalar registers rax, r10, r11, r12 */
+ movq 8(%rsp),%r10
+ movq 16(%rsp),%r11
+ movq 24(%rsp),%r12
+ vmovdqu 32(%rsp),%ymm1
+ vmovdqu 64-192(%rbx),%ymm2
+ vmovdqu 96-192(%rbx),%ymm3
+ vmovdqu 128-192(%rbx),%ymm4
+ vmovdqu 160-192(%rbx),%ymm5
+ vmovdqu 192-192(%rbx),%ymm6
+ vmovdqu 224-192(%rbx),%ymm7
+
+ movq %rax,%r9
+ imull %ecx,%eax /* multiplier = low 32 bits of (word * ecx) ... */
+ andl $0x1fffffff,%eax /* ... truncated to 29 bits */
+ vmovd %eax,%xmm12
+
+ movq %rax,%rdx
+ imulq -128(%r13),%rax
+ vpbroadcastq %xmm12,%ymm12
+ addq %rax,%r9
+ movq %rdx,%rax
+ imulq 8-128(%r13),%rax
+ shrq $29,%r9 /* drop the low 29 bits, keep the carry */
+ addq %rax,%r10
+ movq %rdx,%rax
+ imulq 16-128(%r13),%rax
+ addq %r9,%r10
+ addq %rax,%r11
+ imulq 24-128(%r13),%rdx
+ addq %rdx,%r12
+
+ movq %r10,%rax
+ imull %ecx,%eax
+ andl $0x1fffffff,%eax
+
+ movl $9,%r14d /* reduction loop runs 9 times */
+ jmp .LOOP_REDUCE_1024
+
+.align 32
+.LOOP_REDUCE_1024: /* each iteration derives four 29-bit multipliers (ymm12, ymm13, ymm12 again, ymm0) and folds multiplier * table at r13 into ymm1..ymm9 */
+ vmovd %eax,%xmm13
+ vpbroadcastq %xmm13,%ymm13
+
+ vpmuludq 32-128(%r13),%ymm12,%ymm10
+ movq %rax,%rdx
+ imulq -128(%r13),%rax
+ vpaddq %ymm10,%ymm1,%ymm1
+ addq %rax,%r10
+ vpmuludq 64-128(%r13),%ymm12,%ymm14
+ movq %rdx,%rax
+ imulq 8-128(%r13),%rax
+ vpaddq %ymm14,%ymm2,%ymm2
+ vpmuludq 96-128(%r13),%ymm12,%ymm11
+.byte 0x67 /* 0x67 = address-size override prefix on the next instruction; NOTE(review): presumably code-size padding only - confirm */
+ addq %rax,%r11
+.byte 0x67
+ movq %rdx,%rax
+ imulq 16-128(%r13),%rax
+ shrq $29,%r10
+ vpaddq %ymm11,%ymm3,%ymm3
+ vpmuludq 128-128(%r13),%ymm12,%ymm10
+ addq %rax,%r12
+ addq %r10,%r11
+ vpaddq %ymm10,%ymm4,%ymm4
+ vpmuludq 160-128(%r13),%ymm12,%ymm14
+ movq %r11,%rax
+ imull %ecx,%eax
+ vpaddq %ymm14,%ymm5,%ymm5
+ vpmuludq 192-128(%r13),%ymm12,%ymm11
+ andl $0x1fffffff,%eax
+ vpaddq %ymm11,%ymm6,%ymm6
+ vpmuludq 224-128(%r13),%ymm12,%ymm10
+ vpaddq %ymm10,%ymm7,%ymm7
+ vpmuludq 256-128(%r13),%ymm12,%ymm14
+ vmovd %eax,%xmm12
+
+ vpaddq %ymm14,%ymm8,%ymm8
+
+ vpbroadcastq %xmm12,%ymm12
+
+ vpmuludq 32-8-128(%r13),%ymm13,%ymm11 /* the -8 displacement reads the table shifted by one qword */
+ vmovdqu 96-8-128(%r13),%ymm14
+ movq %rax,%rdx
+ imulq -128(%r13),%rax
+ vpaddq %ymm11,%ymm1,%ymm1
+ vpmuludq 64-8-128(%r13),%ymm13,%ymm10
+ vmovdqu 128-8-128(%r13),%ymm11
+ addq %rax,%r11
+ movq %rdx,%rax
+ imulq 8-128(%r13),%rax
+ vpaddq %ymm10,%ymm2,%ymm2
+ addq %r12,%rax
+ shrq $29,%r11
+ vpmuludq %ymm13,%ymm14,%ymm14
+ vmovdqu 160-8-128(%r13),%ymm10
+ addq %r11,%rax
+ vpaddq %ymm14,%ymm3,%ymm3
+ vpmuludq %ymm13,%ymm11,%ymm11
+ vmovdqu 192-8-128(%r13),%ymm14
+.byte 0x67
+ movq %rax,%r12
+ imull %ecx,%eax
+ vpaddq %ymm11,%ymm4,%ymm4
+ vpmuludq %ymm13,%ymm10,%ymm10
+.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 /* raw encoding of vmovdqu 224-8-128(%r13),%ymm11 with a 32-bit displacement (0x58 = 88) */
+ andl $0x1fffffff,%eax
+ vpaddq %ymm10,%ymm5,%ymm5
+ vpmuludq %ymm13,%ymm14,%ymm14
+ vmovdqu 256-8-128(%r13),%ymm10
+ vpaddq %ymm14,%ymm6,%ymm6
+ vpmuludq %ymm13,%ymm11,%ymm11
+ vmovdqu 288-8-128(%r13),%ymm9
+ vmovd %eax,%xmm0
+ imulq -128(%r13),%rax
+ vpaddq %ymm11,%ymm7,%ymm7
+ vpmuludq %ymm13,%ymm10,%ymm10
+ vmovdqu 32-16-128(%r13),%ymm14
+ vpbroadcastq %xmm0,%ymm0
+ vpaddq %ymm10,%ymm8,%ymm8
+ vpmuludq %ymm13,%ymm9,%ymm9
+ vmovdqu 64-16-128(%r13),%ymm11
+ addq %rax,%r12
+
+ vmovdqu 32-24-128(%r13),%ymm13
+ vpmuludq %ymm12,%ymm14,%ymm14
+ vmovdqu 96-16-128(%r13),%ymm10
+ vpaddq %ymm14,%ymm1,%ymm1
+ vpmuludq %ymm0,%ymm13,%ymm13
+ vpmuludq %ymm12,%ymm11,%ymm11
+.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff /* raw encoding of vmovdqu 128-16-128(%r13),%ymm14 with a 32-bit displacement (0xfffffff0 = -16) */
+ vpaddq %ymm1,%ymm13,%ymm13
+ vpaddq %ymm11,%ymm2,%ymm2
+ vpmuludq %ymm12,%ymm10,%ymm10
+ vmovdqu 160-16-128(%r13),%ymm11
+.byte 0x67
+ vmovq %xmm13,%rax
+ vmovdqu %ymm13,(%rsp) /* spill ymm13 so its upper qwords can be read back as scalars from 8/16/24(%rsp) */
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpmuludq %ymm12,%ymm14,%ymm14
+ vmovdqu 192-16-128(%r13),%ymm10
+ vpaddq %ymm14,%ymm4,%ymm4
+ vpmuludq %ymm12,%ymm11,%ymm11
+ vmovdqu 224-16-128(%r13),%ymm14
+ vpaddq %ymm11,%ymm5,%ymm5
+ vpmuludq %ymm12,%ymm10,%ymm10
+ vmovdqu 256-16-128(%r13),%ymm11
+ vpaddq %ymm10,%ymm6,%ymm6
+ vpmuludq %ymm12,%ymm14,%ymm14
+ shrq $29,%r12
+ vmovdqu 288-16-128(%r13),%ymm10
+ addq %r12,%rax
+ vpaddq %ymm14,%ymm7,%ymm7
+ vpmuludq %ymm12,%ymm11,%ymm11
+
+ movq %rax,%r9
+ imull %ecx,%eax
+ vpaddq %ymm11,%ymm8,%ymm8
+ vpmuludq %ymm12,%ymm10,%ymm10
+ andl $0x1fffffff,%eax
+ vmovd %eax,%xmm12
+ vmovdqu 96-24-128(%r13),%ymm11
+.byte 0x67
+ vpaddq %ymm10,%ymm9,%ymm9
+ vpbroadcastq %xmm12,%ymm12
+
+ vpmuludq 64-24-128(%r13),%ymm0,%ymm14
+ vmovdqu 128-24-128(%r13),%ymm10
+ movq %rax,%rdx
+ imulq -128(%r13),%rax
+ movq 8(%rsp),%r10
+ vpaddq %ymm14,%ymm2,%ymm1 /* from here the sums land one register lower (ymm2 -> ymm1, ...), shifting the accumulator down one vector */
+ vpmuludq %ymm0,%ymm11,%ymm11
+ vmovdqu 160-24-128(%r13),%ymm14
+ addq %rax,%r9
+ movq %rdx,%rax
+ imulq 8-128(%r13),%rax
+.byte 0x67
+ shrq $29,%r9
+ movq 16(%rsp),%r11
+ vpaddq %ymm11,%ymm3,%ymm2
+ vpmuludq %ymm0,%ymm10,%ymm10
+ vmovdqu 192-24-128(%r13),%ymm11
+ addq %rax,%r10
+ movq %rdx,%rax
+ imulq 16-128(%r13),%rax
+ vpaddq %ymm10,%ymm4,%ymm3
+ vpmuludq %ymm0,%ymm14,%ymm14
+ vmovdqu 224-24-128(%r13),%ymm10
+ imulq 24-128(%r13),%rdx
+ addq %rax,%r11
+ leaq (%r9,%r10,1),%rax
+ vpaddq %ymm14,%ymm5,%ymm4
+ vpmuludq %ymm0,%ymm11,%ymm11
+ vmovdqu 256-24-128(%r13),%ymm14
+ movq %rax,%r10
+ imull %ecx,%eax
+ vpmuludq %ymm0,%ymm10,%ymm10
+ vpaddq %ymm11,%ymm6,%ymm5
+ vmovdqu 288-24-128(%r13),%ymm11
+ andl $0x1fffffff,%eax
+ vpaddq %ymm10,%ymm7,%ymm6
+ vpmuludq %ymm0,%ymm14,%ymm14
+ addq 24(%rsp),%rdx
+ vpaddq %ymm14,%ymm8,%ymm7
+ vpmuludq %ymm0,%ymm11,%ymm11
+ vpaddq %ymm11,%ymm9,%ymm8
+ vmovq %r12,%xmm9 /* scalar carry in r12 moves into the low lane of ymm9 (upper lanes zeroed by vmovq) */
+ movq %rdx,%r12
+
+ decl %r14d
+ jnz .LOOP_REDUCE_1024
+ leaq 448(%rsp),%r12
+ vpaddq %ymm9,%ymm13,%ymm0 /* fold the final scalar carry into the lowest vector */
+ vpxor %ymm9,%ymm9,%ymm9
+
+ vpaddq 288-192(%rbx),%ymm0,%ymm0 /* add the stored accumulator vectors at rsp+288 .. rsp+544 */
+ vpaddq 320-448(%r12),%ymm1,%ymm1
+ vpaddq 352-448(%r12),%ymm2,%ymm2
+ vpaddq 384-448(%r12),%ymm3,%ymm3
+ vpaddq 416-448(%r12),%ymm4,%ymm4
+ vpaddq 448-448(%r12),%ymm5,%ymm5
+ vpaddq 480-448(%r12),%ymm6,%ymm6
+ vpaddq 512-448(%r12),%ymm7,%ymm7
+ vpaddq 544-448(%r12),%ymm8,%ymm8
+
+ vpsrlq $29,%ymm0,%ymm14 /* carry pass 1 over ymm0..ymm3 (spilling into ymm4): shift out, mask, rotate lanes, blend, add */
+ vpand %ymm15,%ymm0,%ymm0
+ vpsrlq $29,%ymm1,%ymm11
+ vpand %ymm15,%ymm1,%ymm1
+ vpsrlq $29,%ymm2,%ymm12
+ vpermq $0x93,%ymm14,%ymm14
+ vpand %ymm15,%ymm2,%ymm2
+ vpsrlq $29,%ymm3,%ymm13
+ vpermq $0x93,%ymm11,%ymm11
+ vpand %ymm15,%ymm3,%ymm3
+ vpermq $0x93,%ymm12,%ymm12
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpermq $0x93,%ymm13,%ymm13
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm0,%ymm0
+ vpblendd $3,%ymm11,%ymm12,%ymm11
+ vpaddq %ymm14,%ymm1,%ymm1
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm11,%ymm2,%ymm2
+ vpblendd $3,%ymm13,%ymm9,%ymm13
+ vpaddq %ymm12,%ymm3,%ymm3
+ vpaddq %ymm13,%ymm4,%ymm4
+
+ vpsrlq $29,%ymm0,%ymm14 /* carry pass 2 over ymm0..ymm3, then store them to the output */
+ vpand %ymm15,%ymm0,%ymm0
+ vpsrlq $29,%ymm1,%ymm11
+ vpand %ymm15,%ymm1,%ymm1
+ vpsrlq $29,%ymm2,%ymm12
+ vpermq $0x93,%ymm14,%ymm14
+ vpand %ymm15,%ymm2,%ymm2
+ vpsrlq $29,%ymm3,%ymm13
+ vpermq $0x93,%ymm11,%ymm11
+ vpand %ymm15,%ymm3,%ymm3
+ vpermq $0x93,%ymm12,%ymm12
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpermq $0x93,%ymm13,%ymm13
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm0,%ymm0
+ vpblendd $3,%ymm11,%ymm12,%ymm11
+ vpaddq %ymm14,%ymm1,%ymm1
+ vmovdqu %ymm0,0-128(%rdi)
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm11,%ymm2,%ymm2
+ vmovdqu %ymm1,32-128(%rdi)
+ vpblendd $3,%ymm13,%ymm9,%ymm13
+ vpaddq %ymm12,%ymm3,%ymm3
+ vmovdqu %ymm2,64-128(%rdi)
+ vpaddq %ymm13,%ymm4,%ymm4
+ vmovdqu %ymm3,96-128(%rdi)
+ vpsrlq $29,%ymm4,%ymm14 /* carry pass 1 over ymm4..ymm8 */
+ vpand %ymm15,%ymm4,%ymm4
+ vpsrlq $29,%ymm5,%ymm11
+ vpand %ymm15,%ymm5,%ymm5
+ vpsrlq $29,%ymm6,%ymm12
+ vpermq $0x93,%ymm14,%ymm14
+ vpand %ymm15,%ymm6,%ymm6
+ vpsrlq $29,%ymm7,%ymm13
+ vpermq $0x93,%ymm11,%ymm11
+ vpand %ymm15,%ymm7,%ymm7
+ vpsrlq $29,%ymm8,%ymm0
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm8,%ymm8
+ vpermq $0x93,%ymm13,%ymm13
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpermq $0x93,%ymm0,%ymm0
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm4,%ymm4
+ vpblendd $3,%ymm11,%ymm12,%ymm11
+ vpaddq %ymm14,%ymm5,%ymm5
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm11,%ymm6,%ymm6
+ vpblendd $3,%ymm13,%ymm0,%ymm13
+ vpaddq %ymm12,%ymm7,%ymm7
+ vpaddq %ymm13,%ymm8,%ymm8
+
+ vpsrlq $29,%ymm4,%ymm14 /* carry pass 2 over ymm4..ymm8, then store them to the output */
+ vpand %ymm15,%ymm4,%ymm4
+ vpsrlq $29,%ymm5,%ymm11
+ vpand %ymm15,%ymm5,%ymm5
+ vpsrlq $29,%ymm6,%ymm12
+ vpermq $0x93,%ymm14,%ymm14
+ vpand %ymm15,%ymm6,%ymm6
+ vpsrlq $29,%ymm7,%ymm13
+ vpermq $0x93,%ymm11,%ymm11
+ vpand %ymm15,%ymm7,%ymm7
+ vpsrlq $29,%ymm8,%ymm0
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm8,%ymm8
+ vpermq $0x93,%ymm13,%ymm13
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpermq $0x93,%ymm0,%ymm0
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm4,%ymm4
+ vpblendd $3,%ymm11,%ymm12,%ymm11
+ vpaddq %ymm14,%ymm5,%ymm5
+ vmovdqu %ymm4,128-128(%rdi)
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm11,%ymm6,%ymm6
+ vmovdqu %ymm5,160-128(%rdi)
+ vpblendd $3,%ymm13,%ymm0,%ymm13
+ vpaddq %ymm12,%ymm7,%ymm7
+ vmovdqu %ymm6,192-128(%rdi)
+ vpaddq %ymm13,%ymm8,%ymm8
+ vmovdqu %ymm7,224-128(%rdi)
+ vmovdqu %ymm8,256-128(%rdi)
+
+ movq %rdi,%rsi /* the next pass reads the result just written; ymm1..ymm8 already hold output vectors 1..8 */
+ decl %r8d
+ jne .LOOP_GRANDE_SQR_1024
+
+ vzeroall /* clear every ymm register before returning */
+ movq %rbp,%rax /* rax = entry stack pointer; saved registers sit at -8 .. -48 below it */
+.cfi_def_cfa_register %rax
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp /* discard all scratch, including any realignment */
+.cfi_def_cfa_register %rsp
+.Lsqr_1024_epilogue:
+ .byte 0xf3,0xc3 /* encodes "rep ret" */
+.cfi_endproc
+.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
+.globl rsaz_1024_mul_avx2
+.type rsaz_1024_mul_avx2,@function
+.align 64
 rsaz_1024_mul_avx2:
-rsaz_1024_norm2red_avx2:
+.cfi_startproc /* Register use visible below: rdi = output buffer, rsi and rdx = the two operands (possibly swapped, see the cmovnz pair), rcx = table of multiplicands used in the reduction, r8d = 32-bit factor for the per-step multiplier. NOTE(review): presumably a modular (Montgomery-style) multiplication on 29-bit limbs - confirm against the generator script. */
+ leaq (%rsp),%rax /* rax = stack pointer at entry; the epilogue restores registers relative to it */
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ movq %rax,%rbp /* rbp keeps the entry stack pointer for the whole body */
+.cfi_def_cfa_register %rbp
+ vzeroall /* all ymm registers start at zero; ymm14 is only ever read below, so it stays zero */
+ movq %rdx,%r13
+ subq $64,%rsp
+
+
+
+
+
+
+.byte 0x67,0x67 /* two 0x67 address-size override prefixes on the next instruction; NOTE(review): presumably code-size padding only - confirm */
+ movq %rsi,%r15
+ andq $4095,%r15
+ addq $320,%r15
+ shrq $12,%r15 /* ZF clear when the 320 bytes at rsi cross a 4096-byte window */
+ movq %rsi,%r15 /* mov leaves the flags from shrq intact for the cmovnz pair */
+ cmovnzq %r13,%rsi /* on a crossing, swap the operands: rsi <- rdx ... */
+ cmovnzq %r15,%r13 /* ... and r13 <- original rsi */
+
+ movq %rcx,%r15
+ subq $-128,%rsi /* rsi += 128; later accesses use N-128 displacements */
+ subq $-128,%rcx /* rcx += 128, same biasing */
+ subq $-128,%rdi /* rdi += 128, same biasing */
+
+ andq $4095,%r15
+ addq $320,%r15
+.byte 0x67,0x67
+ shrq $12,%r15 /* same window test for the table at rcx; ZF set means no copy is needed */
+ jz .Lmul_1024_no_n_copy
+
+
+
+
+
+ subq $320,%rsp /* window crossed: make room for a private copy of the table */
+ vmovdqu 0-128(%rcx),%ymm0
+ andq $-512,%rsp /* align the stack pointer down to 512 bytes */
+ vmovdqu 32-128(%rcx),%ymm1
+ vmovdqu 64-128(%rcx),%ymm2
+ vmovdqu 96-128(%rcx),%ymm3
+ vmovdqu 128-128(%rcx),%ymm4
+ vmovdqu 160-128(%rcx),%ymm5
+ vmovdqu 192-128(%rcx),%ymm6
+ vmovdqu 224-128(%rcx),%ymm7
+ vmovdqu 256-128(%rcx),%ymm8
+ leaq 64+128(%rsp),%rcx /* rcx now points (with the same +128 bias) at the stack copy at rsp+64 */
+ vmovdqu %ymm0,0-128(%rcx)
+ vpxor %ymm0,%ymm0,%ymm0 /* each register is zeroed again right after it is stored, restoring the post-vzeroall state */
+ vmovdqu %ymm1,32-128(%rcx)
+ vpxor %ymm1,%ymm1,%ymm1
+ vmovdqu %ymm2,64-128(%rcx)
+ vpxor %ymm2,%ymm2,%ymm2
+ vmovdqu %ymm3,96-128(%rcx)
+ vpxor %ymm3,%ymm3,%ymm3
+ vmovdqu %ymm4,128-128(%rcx)
+ vpxor %ymm4,%ymm4,%ymm4
+ vmovdqu %ymm5,160-128(%rcx)
+ vpxor %ymm5,%ymm5,%ymm5
+ vmovdqu %ymm6,192-128(%rcx)
+ vpxor %ymm6,%ymm6,%ymm6
+ vmovdqu %ymm7,224-128(%rcx)
+ vpxor %ymm7,%ymm7,%ymm7
+ vmovdqu %ymm8,256-128(%rcx)
+ vmovdqa %ymm0,%ymm8 /* ymm8 = 0 (register copy of the zeroed ymm0) */
+ vmovdqu %ymm9,288-128(%rcx) /* ymm9 is still zero: clear the 32 bytes after the 288 copied bytes */
+.Lmul_1024_no_n_copy:
+ andq $-64,%rsp /* align scratch to 64 bytes */
+
+ movq (%r13),%rbx /* rbx = first qword of the operand walked through r13 */
+ vpbroadcastq (%r13),%ymm10 /* ymm10 = the same qword in all four lanes */
+ vmovdqu %ymm0,(%rsp) /* zero the 32-byte slot at (%rsp) that carries the low accumulator words between iterations */
+ xorq %r9,%r9
+.byte 0x67
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %r12,%r12
+
+ vmovdqu .Land_mask(%rip),%ymm15 /* ymm15 = mask from .Land_mask (defined outside this block); NOTE(review): presumably 2^29-1 per lane, matching the $29 shifts - confirm */
+ movl $9,%r14d /* main loop runs 9 times, consuming 4 qwords from r13 per iteration */
+ vmovdqu %ymm9,288-128(%rdi) /* zero the 32 bytes at output offset 288 */
+ jmp .Loop_mul_1024
+
+.align 32
+.Loop_mul_1024: /* each iteration handles four operand qwords; every step multiplies by rsi, then folds in (29-bit multiplier) * table at rcx */
+ vpsrlq $29,%ymm3,%ymm9 /* pull the carries out of ymm3 early; they are re-added via the blends below */
+ movq %rbx,%rax
+ imulq -128(%rsi),%rax
+ addq %r9,%rax
+ movq %rbx,%r10
+ imulq 8-128(%rsi),%r10
+ addq 8(%rsp),%r10
+
+ movq %rax,%r9
+ imull %r8d,%eax /* multiplier = low 32 bits of (word * r8d) ... */
+ andl $0x1fffffff,%eax /* ... truncated to 29 bits */
+
+ movq %rbx,%r11
+ imulq 16-128(%rsi),%r11
+ addq 16(%rsp),%r11
+
+ movq %rbx,%r12
+ imulq 24-128(%rsi),%r12
+ addq 24(%rsp),%r12
+ vpmuludq 32-128(%rsi),%ymm10,%ymm0
+ vmovd %eax,%xmm11
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq 64-128(%rsi),%ymm10,%ymm12
+ vpbroadcastq %xmm11,%ymm11
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq 96-128(%rsi),%ymm10,%ymm13
+ vpand %ymm15,%ymm3,%ymm3
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq 128-128(%rsi),%ymm10,%ymm0
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq 160-128(%rsi),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq 192-128(%rsi),%ymm10,%ymm13
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq 224-128(%rsi),%ymm10,%ymm0
+ vpermq $0x93,%ymm9,%ymm9 /* imm 0x93 rotates qwords up one lane: dst = {src3,src0,src1,src2} */
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq 256-128(%rsi),%ymm10,%ymm12
+ vpbroadcastq 8(%r13),%ymm10 /* next operand qword */
+ vpaddq %ymm12,%ymm8,%ymm8
+
+ movq %rax,%rdx
+ imulq -128(%rcx),%rax
+ addq %rax,%r9
+ movq %rdx,%rax
+ imulq 8-128(%rcx),%rax
+ addq %rax,%r10
+ movq %rdx,%rax
+ imulq 16-128(%rcx),%rax
+ addq %rax,%r11
+ shrq $29,%r9 /* drop the low 29 bits, keep the carry */
+ imulq 24-128(%rcx),%rdx
+ addq %rdx,%r12
+ addq %r9,%r10
+
+ vpmuludq 32-128(%rcx),%ymm11,%ymm13
+ vmovq %xmm10,%rbx
+ vpaddq %ymm13,%ymm1,%ymm1
+ vpmuludq 64-128(%rcx),%ymm11,%ymm0
+ vpaddq %ymm0,%ymm2,%ymm2
+ vpmuludq 96-128(%rcx),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm3,%ymm3
+ vpmuludq 128-128(%rcx),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm4,%ymm4
+ vpmuludq 160-128(%rcx),%ymm11,%ymm0
+ vpaddq %ymm0,%ymm5,%ymm5
+ vpmuludq 192-128(%rcx),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm6,%ymm6
+ vpmuludq 224-128(%rcx),%ymm11,%ymm13
+ vpblendd $3,%ymm14,%ymm9,%ymm12 /* low qword from ymm14 (zero), upper three lanes = rotated ymm3 carries */
+ vpaddq %ymm13,%ymm7,%ymm7
+ vpmuludq 256-128(%rcx),%ymm11,%ymm0
+ vpaddq %ymm12,%ymm3,%ymm3
+ vpaddq %ymm0,%ymm8,%ymm8
+
+ movq %rbx,%rax
+ imulq -128(%rsi),%rax
+ addq %rax,%r10
+ vmovdqu -8+32-128(%rsi),%ymm12 /* the -8 displacement reads the operand shifted by one qword */
+ movq %rbx,%rax
+ imulq 8-128(%rsi),%rax
+ addq %rax,%r11
+ vmovdqu -8+64-128(%rsi),%ymm13
+
+ movq %r10,%rax
+ vpblendd $0xfc,%ymm14,%ymm9,%ymm9 /* keep only the low qword of ymm9 (the carry wrapped out of the top lane); the rest comes from zero ymm14 */
+ imull %r8d,%eax
+ vpaddq %ymm9,%ymm4,%ymm4
+ andl $0x1fffffff,%eax
+
+ imulq 16-128(%rsi),%rbx
+ addq %rbx,%r12
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovd %eax,%xmm11
+ vmovdqu -8+96-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm1,%ymm1
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vpbroadcastq %xmm11,%ymm11
+ vmovdqu -8+128-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm2,%ymm2
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -8+160-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm3,%ymm3
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovdqu -8+192-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm4,%ymm4
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -8+224-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm5,%ymm5
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -8+256-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm6,%ymm6
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovdqu -8+288-128(%rsi),%ymm9
+ vpaddq %ymm12,%ymm7,%ymm7
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vpaddq %ymm13,%ymm8,%ymm8
+ vpmuludq %ymm10,%ymm9,%ymm9
+ vpbroadcastq 16(%r13),%ymm10
+
+ movq %rax,%rdx
+ imulq -128(%rcx),%rax
+ addq %rax,%r10
+ vmovdqu -8+32-128(%rcx),%ymm0
+ movq %rdx,%rax
+ imulq 8-128(%rcx),%rax
+ addq %rax,%r11
+ vmovdqu -8+64-128(%rcx),%ymm12
+ shrq $29,%r10
+ imulq 16-128(%rcx),%rdx
+ addq %rdx,%r12
+ addq %r10,%r11
+
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovq %xmm10,%rbx
+ vmovdqu -8+96-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -8+128-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -8+160-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -8+192-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -8+224-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -8+256-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -8+288-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vpaddq %ymm13,%ymm9,%ymm9
+
+ vmovdqu -16+32-128(%rsi),%ymm0 /* third step: operand and table read shifted by two qwords */
+ movq %rbx,%rax
+ imulq -128(%rsi),%rax
+ addq %r11,%rax
+
+ vmovdqu -16+64-128(%rsi),%ymm12
+ movq %rax,%r11
+ imull %r8d,%eax
+ andl $0x1fffffff,%eax
+
+ imulq 8-128(%rsi),%rbx
+ addq %rbx,%r12
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovd %eax,%xmm11
+ vmovdqu -16+96-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vpbroadcastq %xmm11,%ymm11
+ vmovdqu -16+128-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -16+160-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -16+192-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovdqu -16+224-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -16+256-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -16+288-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vpbroadcastq 24(%r13),%ymm10
+ vpaddq %ymm13,%ymm9,%ymm9
+
+ vmovdqu -16+32-128(%rcx),%ymm0
+ movq %rax,%rdx
+ imulq -128(%rcx),%rax
+ addq %rax,%r11
+ vmovdqu -16+64-128(%rcx),%ymm12
+ imulq 8-128(%rcx),%rdx
+ addq %rdx,%r12
+ shrq $29,%r11
+
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovq %xmm10,%rbx
+ vmovdqu -16+96-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -16+128-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -16+160-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -16+192-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -16+224-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -16+256-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -16+288-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -24+32-128(%rsi),%ymm0 /* fourth step: operand and table read shifted by three qwords */
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -24+64-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm9,%ymm9
+
+ addq %r11,%r12
+ imulq -128(%rsi),%rbx
+ addq %rbx,%r12
+
+ movq %r12,%rax
+ imull %r8d,%eax
+ andl $0x1fffffff,%eax
+
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovd %eax,%xmm11
+ vmovdqu -24+96-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vpbroadcastq %xmm11,%ymm11
+ vmovdqu -24+128-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -24+160-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -24+192-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovdqu -24+224-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -24+256-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -24+288-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vpbroadcastq 32(%r13),%ymm10 /* first operand qword for the next iteration */
+ vpaddq %ymm13,%ymm9,%ymm9
+ addq $32,%r13 /* advance the walked operand by four qwords */
+
+ vmovdqu -24+32-128(%rcx),%ymm0
+ imulq -128(%rcx),%rax
+ addq %rax,%r12
+ shrq $29,%r12
+
+ vmovdqu -24+64-128(%rcx),%ymm12
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovq %xmm10,%rbx
+ vmovdqu -24+96-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm0 /* from here the sums land one register lower (ymm1 -> ymm0, ...), shifting the accumulator down one vector */
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu %ymm0,(%rsp) /* spill the lowest vector; its qwords are read back as scalars from 0/8/16/24(%rsp) */
+ vpaddq %ymm12,%ymm2,%ymm1
+ vmovdqu -24+128-128(%rcx),%ymm0
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -24+160-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm2
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -24+192-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm3
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -24+224-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm4
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -24+256-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm5
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -24+288-128(%rcx),%ymm13
+ movq %r12,%r9
+ vpaddq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm11,%ymm12,%ymm12
+ addq (%rsp),%r9 /* r9 = carry + lowest spilled qword, consumed at the top of the next iteration */
+ vpaddq %ymm12,%ymm8,%ymm7
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovq %r12,%xmm12 /* scalar carry into the low lane of ymm12 (upper lanes zeroed by vmovq) */
+ vpaddq %ymm13,%ymm9,%ymm8
+
+ decl %r14d
+ jnz .Loop_mul_1024
+ vpaddq (%rsp),%ymm12,%ymm0 /* lowest vector = spilled value + final scalar carry */
+
+ vpsrlq $29,%ymm0,%ymm12 /* carry pass 1 over ymm0..ymm3 (spilling into ymm4): shift out, mask, rotate lanes, blend, add */
+ vpand %ymm15,%ymm0,%ymm0
+ vpsrlq $29,%ymm1,%ymm13
+ vpand %ymm15,%ymm1,%ymm1
+ vpsrlq $29,%ymm2,%ymm10
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm2,%ymm2
+ vpsrlq $29,%ymm3,%ymm11
+ vpermq $0x93,%ymm13,%ymm13
+ vpand %ymm15,%ymm3,%ymm3
+
+ vpblendd $3,%ymm14,%ymm12,%ymm9
+ vpermq $0x93,%ymm10,%ymm10
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpermq $0x93,%ymm11,%ymm11
+ vpaddq %ymm9,%ymm0,%ymm0
+ vpblendd $3,%ymm13,%ymm10,%ymm13
+ vpaddq %ymm12,%ymm1,%ymm1
+ vpblendd $3,%ymm10,%ymm11,%ymm10
+ vpaddq %ymm13,%ymm2,%ymm2
+ vpblendd $3,%ymm11,%ymm14,%ymm11
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm11,%ymm4,%ymm4
+
+ vpsrlq $29,%ymm0,%ymm12 /* carry pass 2 over ymm0..ymm3 */
+ vpand %ymm15,%ymm0,%ymm0
+ vpsrlq $29,%ymm1,%ymm13
+ vpand %ymm15,%ymm1,%ymm1
+ vpsrlq $29,%ymm2,%ymm10
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm2,%ymm2
+ vpsrlq $29,%ymm3,%ymm11
+ vpermq $0x93,%ymm13,%ymm13
+ vpand %ymm15,%ymm3,%ymm3
+ vpermq $0x93,%ymm10,%ymm10
+
+ vpblendd $3,%ymm14,%ymm12,%ymm9
+ vpermq $0x93,%ymm11,%ymm11
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm9,%ymm0,%ymm0
+ vpblendd $3,%ymm13,%ymm10,%ymm13
+ vpaddq %ymm12,%ymm1,%ymm1
+ vpblendd $3,%ymm10,%ymm11,%ymm10
+ vpaddq %ymm13,%ymm2,%ymm2
+ vpblendd $3,%ymm11,%ymm14,%ymm11
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm11,%ymm4,%ymm4
+
+ vmovdqu %ymm0,0-128(%rdi) /* store output vectors 0..3 */
+ vmovdqu %ymm1,32-128(%rdi)
+ vmovdqu %ymm2,64-128(%rdi)
+ vmovdqu %ymm3,96-128(%rdi)
+ vpsrlq $29,%ymm4,%ymm12 /* carry pass 1 over ymm4..ymm8 */
+ vpand %ymm15,%ymm4,%ymm4
+ vpsrlq $29,%ymm5,%ymm13
+ vpand %ymm15,%ymm5,%ymm5
+ vpsrlq $29,%ymm6,%ymm10
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm6,%ymm6
+ vpsrlq $29,%ymm7,%ymm11
+ vpermq $0x93,%ymm13,%ymm13
+ vpand %ymm15,%ymm7,%ymm7
+ vpsrlq $29,%ymm8,%ymm0
+ vpermq $0x93,%ymm10,%ymm10
+ vpand %ymm15,%ymm8,%ymm8
+ vpermq $0x93,%ymm11,%ymm11
+
+ vpblendd $3,%ymm14,%ymm12,%ymm9
+ vpermq $0x93,%ymm0,%ymm0
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm9,%ymm4,%ymm4
+ vpblendd $3,%ymm13,%ymm10,%ymm13
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpblendd $3,%ymm10,%ymm11,%ymm10
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpblendd $3,%ymm11,%ymm0,%ymm11
+ vpaddq %ymm10,%ymm7,%ymm7
+ vpaddq %ymm11,%ymm8,%ymm8
+
+ vpsrlq $29,%ymm4,%ymm12 /* carry pass 2 over ymm4..ymm8 */
+ vpand %ymm15,%ymm4,%ymm4
+ vpsrlq $29,%ymm5,%ymm13
+ vpand %ymm15,%ymm5,%ymm5
+ vpsrlq $29,%ymm6,%ymm10
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm6,%ymm6
+ vpsrlq $29,%ymm7,%ymm11
+ vpermq $0x93,%ymm13,%ymm13
+ vpand %ymm15,%ymm7,%ymm7
+ vpsrlq $29,%ymm8,%ymm0
+ vpermq $0x93,%ymm10,%ymm10
+ vpand %ymm15,%ymm8,%ymm8
+ vpermq $0x93,%ymm11,%ymm11
+
+ vpblendd $3,%ymm14,%ymm12,%ymm9
+ vpermq $0x93,%ymm0,%ymm0
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm9,%ymm4,%ymm4
+ vpblendd $3,%ymm13,%ymm10,%ymm13
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpblendd $3,%ymm10,%ymm11,%ymm10
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpblendd $3,%ymm11,%ymm0,%ymm11
+ vpaddq %ymm10,%ymm7,%ymm7
+ vpaddq %ymm11,%ymm8,%ymm8
+
+ vmovdqu %ymm4,128-128(%rdi) /* store output vectors 4..8 */
+ vmovdqu %ymm5,160-128(%rdi)
+ vmovdqu %ymm6,192-128(%rdi)
+ vmovdqu %ymm7,224-128(%rdi)
+ vmovdqu %ymm8,256-128(%rdi)
+ vzeroupper
+
+ movq %rbp,%rax /* rax = entry stack pointer; saved registers sit at -8 .. -48 below it */
+.cfi_def_cfa_register %rax
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp /* discard all scratch, including any realignment */
+.cfi_def_cfa_register %rsp
+.Lmul_1024_epilogue:
+ .byte 0xf3,0xc3 /* encodes "rep ret" */
+.cfi_endproc
+.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
+.globl rsaz_1024_red2norm_avx2 /* Repack 36 radix-2^29 digits (one per qword at %rsi) into 16 packed 64-bit words at %rdi. */
+.type rsaz_1024_red2norm_avx2,@function
+.align 32
rsaz_1024_red2norm_avx2:
+.cfi_startproc
+ subq $-128,%rsi /* bias source pointer by +128: input qwords are then at -128..152(%rsi) */
+ xorq %rax,%rax /* rax accumulates the output word being assembled */
+ movq -128(%rsi),%r8 /* digit 0 */
+ movq -120(%rsi),%r9 /* digit 1 */
+ movq -112(%rsi),%r10 /* digit 2, straddles output words 0 and 1 */
+ shlq $0,%r8 /* digit k is placed at bit offset 29*k; here 0 (no-op shift kept by the generator) */
+ shlq $29,%r9
+ movq %r10,%r11
+ shlq $58,%r10 /* low 6 bits of digit 2 land in word 0 */
+ shrq $6,%r11 /* remaining bits spill into word 1 */
+ addq %r8,%rax
+ addq %r9,%rax
+ addq %r10,%rax
+ adcq $0,%r11 /* fold the carry of the last addq into the spill */
+ movq %rax,0(%rdi) /* output word 0 */
+ movq %r11,%rax /* next word starts from the spill */
+ movq -104(%rsi),%r8 /* digit 3 */
+ movq -96(%rsi),%r9 /* digit 4, straddles words 1 and 2 */
+ shlq $23,%r8
+ movq %r9,%r10
+ shlq $52,%r9
+ shrq $12,%r10
+ addq %r8,%rax
+ addq %r9,%rax
+ adcq $0,%r10
+ movq %rax,8(%rdi) /* output word 1 */
+ movq %r10,%rax
+ movq -88(%rsi),%r11 /* digit 5 */
+ movq -80(%rsi),%r8 /* digit 6, straddles words 2 and 3 */
+ shlq $17,%r11
+ movq %r8,%r9
+ shlq $46,%r8
+ shrq $18,%r9
+ addq %r11,%rax
+ addq %r8,%rax
+ adcq $0,%r9
+ movq %rax,16(%rdi) /* output word 2 */
+ movq %r9,%rax
+ movq -72(%rsi),%r10 /* digit 7 */
+ movq -64(%rsi),%r11 /* digit 8, straddles words 3 and 4 */
+ shlq $11,%r10
+ movq %r11,%r8
+ shlq $40,%r11
+ shrq $24,%r8
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,24(%rdi) /* output word 3 */
+ movq %r8,%rax
+ movq -56(%rsi),%r9 /* digit 9 */
+ movq -48(%rsi),%r10 /* digit 10 */
+ movq -40(%rsi),%r11 /* digit 11, straddles words 4 and 5 */
+ shlq $5,%r9
+ shlq $34,%r10
+ movq %r11,%r8
+ shlq $63,%r11
+ shrq $1,%r8
+ addq %r9,%rax
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,32(%rdi) /* output word 4 */
+ movq %r8,%rax
+ movq -32(%rsi),%r9 /* digit 12 */
+ movq -24(%rsi),%r10 /* digit 13, straddles words 5 and 6 */
+ shlq $28,%r9
+ movq %r10,%r11
+ shlq $57,%r10
+ shrq $7,%r11
+ addq %r9,%rax
+ addq %r10,%rax
+ adcq $0,%r11
+ movq %rax,40(%rdi) /* output word 5 */
+ movq %r11,%rax
+ movq -16(%rsi),%r8 /* digit 14 */
+ movq -8(%rsi),%r9 /* digit 15, straddles words 6 and 7 */
+ shlq $22,%r8
+ movq %r9,%r10
+ shlq $51,%r9
+ shrq $13,%r10
+ addq %r8,%rax
+ addq %r9,%rax
+ adcq $0,%r10
+ movq %rax,48(%rdi) /* output word 6 */
+ movq %r10,%rax
+ movq 0(%rsi),%r11 /* digit 16 */
+ movq 8(%rsi),%r8 /* digit 17, straddles words 7 and 8 */
+ shlq $16,%r11
+ movq %r8,%r9
+ shlq $45,%r8
+ shrq $19,%r9
+ addq %r11,%rax
+ addq %r8,%rax
+ adcq $0,%r9
+ movq %rax,56(%rdi) /* output word 7 */
+ movq %r9,%rax
+ movq 16(%rsi),%r10 /* digit 18 */
+ movq 24(%rsi),%r11 /* digit 19, straddles words 8 and 9 */
+ shlq $10,%r10
+ movq %r11,%r8
+ shlq $39,%r11
+ shrq $25,%r8
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,64(%rdi) /* output word 8 */
+ movq %r8,%rax
+ movq 32(%rsi),%r9 /* digit 20 */
+ movq 40(%rsi),%r10 /* digit 21 */
+ movq 48(%rsi),%r11 /* digit 22, straddles words 9 and 10 */
+ shlq $4,%r9
+ shlq $33,%r10
+ movq %r11,%r8
+ shlq $62,%r11
+ shrq $2,%r8
+ addq %r9,%rax
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,72(%rdi) /* output word 9 */
+ movq %r8,%rax
+ movq 56(%rsi),%r9 /* digit 23 */
+ movq 64(%rsi),%r10 /* digit 24, straddles words 10 and 11 */
+ shlq $27,%r9
+ movq %r10,%r11
+ shlq $56,%r10
+ shrq $8,%r11
+ addq %r9,%rax
+ addq %r10,%rax
+ adcq $0,%r11
+ movq %rax,80(%rdi) /* output word 10 */
+ movq %r11,%rax
+ movq 72(%rsi),%r8 /* digit 25 */
+ movq 80(%rsi),%r9 /* digit 26, straddles words 11 and 12 */
+ shlq $21,%r8
+ movq %r9,%r10
+ shlq $50,%r9
+ shrq $14,%r10
+ addq %r8,%rax
+ addq %r9,%rax
+ adcq $0,%r10
+ movq %rax,88(%rdi) /* output word 11 */
+ movq %r10,%rax
+ movq 88(%rsi),%r11 /* digit 27 */
+ movq 96(%rsi),%r8 /* digit 28, straddles words 12 and 13 */
+ shlq $15,%r11
+ movq %r8,%r9
+ shlq $44,%r8
+ shrq $20,%r9
+ addq %r11,%rax
+ addq %r8,%rax
+ adcq $0,%r9
+ movq %rax,96(%rdi) /* output word 12 */
+ movq %r9,%rax
+ movq 104(%rsi),%r10 /* digit 29 */
+ movq 112(%rsi),%r11 /* digit 30, straddles words 13 and 14 */
+ shlq $9,%r10
+ movq %r11,%r8
+ shlq $38,%r11
+ shrq $26,%r8
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,104(%rdi) /* output word 13 */
+ movq %r8,%rax
+ movq 120(%rsi),%r9 /* digit 31 */
+ movq 128(%rsi),%r10 /* digit 32 */
+ movq 136(%rsi),%r11 /* digit 33, straddles words 14 and 15 */
+ shlq $3,%r9
+ shlq $32,%r10
+ movq %r11,%r8
+ shlq $61,%r11
+ shrq $3,%r8
+ addq %r9,%rax
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,112(%rdi) /* output word 14 */
+ movq %r8,%rax
+ movq 144(%rsi),%r9 /* digit 34 */
+ movq 152(%rsi),%r10 /* digit 35, the last input digit */
+ shlq $26,%r9
+ movq %r10,%r11
+ shlq $55,%r10
+ shrq $9,%r11
+ addq %r9,%rax
+ addq %r10,%rax
+ adcq $0,%r11
+ movq %rax,120(%rdi) /* output word 15 (last) */
+ movq %r11,%rax /* whatever spilled past bit 1023 is left in rax */
+ .byte 0xf3,0xc3 /* rep ret */
+.cfi_endproc
+.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
+
+.globl rsaz_1024_norm2red_avx2 /* Split 16 packed 64-bit words at %rsi into 36 29-bit digits, one per qword at %rdi, followed by 4 zero qwords. */
+.type rsaz_1024_norm2red_avx2,@function
+.align 32
+rsaz_1024_norm2red_avx2:
+.cfi_startproc
+ subq $-128,%rdi /* bias destination pointer by +128: digits are stored at -128..152(%rdi) */
+ movq (%rsi),%r8 /* input word 0 */
+ movl $0x1fffffff,%eax /* rax = 29-bit digit mask (32-bit move zero-extends) */
+ movq 8(%rsi),%r9 /* input word 1 */
+ movq %r8,%r11
+ shrq $0,%r11 /* digit k is taken from bit offset 29*k; here 0 (no-op shift kept by the generator) */
+ andq %rax,%r11
+ movq %r11,-128(%rdi) /* digit 0 */
+ movq %r8,%r10
+ shrq $29,%r10
+ andq %rax,%r10
+ movq %r10,-120(%rdi) /* digit 1 */
+ shrdq $58,%r9,%r8 /* digit straddles words 0 and 1: shift in low bits of the next word */
+ andq %rax,%r8
+ movq %r8,-112(%rdi) /* digit 2 */
+ movq 16(%rsi),%r10 /* input word 2 */
+ movq %r9,%r8
+ shrq $23,%r8
+ andq %rax,%r8
+ movq %r8,-104(%rdi) /* digit 3 */
+ shrdq $52,%r10,%r9
+ andq %rax,%r9
+ movq %r9,-96(%rdi) /* digit 4 */
+ movq 24(%rsi),%r11 /* input word 3 */
+ movq %r10,%r9
+ shrq $17,%r9
+ andq %rax,%r9
+ movq %r9,-88(%rdi) /* digit 5 */
+ shrdq $46,%r11,%r10
+ andq %rax,%r10
+ movq %r10,-80(%rdi) /* digit 6 */
+ movq 32(%rsi),%r8 /* input word 4 */
+ movq %r11,%r10
+ shrq $11,%r10
+ andq %rax,%r10
+ movq %r10,-72(%rdi) /* digit 7 */
+ shrdq $40,%r8,%r11
+ andq %rax,%r11
+ movq %r11,-64(%rdi) /* digit 8 */
+ movq 40(%rsi),%r9 /* input word 5 */
+ movq %r8,%r11
+ shrq $5,%r11
+ andq %rax,%r11
+ movq %r11,-56(%rdi) /* digit 9 */
+ movq %r8,%r10
+ shrq $34,%r10
+ andq %rax,%r10
+ movq %r10,-48(%rdi) /* digit 10 */
+ shrdq $63,%r9,%r8
+ andq %rax,%r8
+ movq %r8,-40(%rdi) /* digit 11 */
+ movq 48(%rsi),%r10 /* input word 6 */
+ movq %r9,%r8
+ shrq $28,%r8
+ andq %rax,%r8
+ movq %r8,-32(%rdi) /* digit 12 */
+ shrdq $57,%r10,%r9
+ andq %rax,%r9
+ movq %r9,-24(%rdi) /* digit 13 */
+ movq 56(%rsi),%r11 /* input word 7 */
+ movq %r10,%r9
+ shrq $22,%r9
+ andq %rax,%r9
+ movq %r9,-16(%rdi) /* digit 14 */
+ shrdq $51,%r11,%r10
+ andq %rax,%r10
+ movq %r10,-8(%rdi) /* digit 15 */
+ movq 64(%rsi),%r8 /* input word 8 */
+ movq %r11,%r10
+ shrq $16,%r10
+ andq %rax,%r10
+ movq %r10,0(%rdi) /* digit 16 */
+ shrdq $45,%r8,%r11
+ andq %rax,%r11
+ movq %r11,8(%rdi) /* digit 17 */
+ movq 72(%rsi),%r9 /* input word 9 */
+ movq %r8,%r11
+ shrq $10,%r11
+ andq %rax,%r11
+ movq %r11,16(%rdi) /* digit 18 */
+ shrdq $39,%r9,%r8
+ andq %rax,%r8
+ movq %r8,24(%rdi) /* digit 19 */
+ movq 80(%rsi),%r10 /* input word 10 */
+ movq %r9,%r8
+ shrq $4,%r8
+ andq %rax,%r8
+ movq %r8,32(%rdi) /* digit 20 */
+ movq %r9,%r11
+ shrq $33,%r11
+ andq %rax,%r11
+ movq %r11,40(%rdi) /* digit 21 */
+ shrdq $62,%r10,%r9
+ andq %rax,%r9
+ movq %r9,48(%rdi) /* digit 22 */
+ movq 88(%rsi),%r11 /* input word 11 */
+ movq %r10,%r9
+ shrq $27,%r9
+ andq %rax,%r9
+ movq %r9,56(%rdi) /* digit 23 */
+ shrdq $56,%r11,%r10
+ andq %rax,%r10
+ movq %r10,64(%rdi) /* digit 24 */
+ movq 96(%rsi),%r8 /* input word 12 */
+ movq %r11,%r10
+ shrq $21,%r10
+ andq %rax,%r10
+ movq %r10,72(%rdi) /* digit 25 */
+ shrdq $50,%r8,%r11
+ andq %rax,%r11
+ movq %r11,80(%rdi) /* digit 26 */
+ movq 104(%rsi),%r9 /* input word 13 */
+ movq %r8,%r11
+ shrq $15,%r11
+ andq %rax,%r11
+ movq %r11,88(%rdi) /* digit 27 */
+ shrdq $44,%r9,%r8
+ andq %rax,%r8
+ movq %r8,96(%rdi) /* digit 28 */
+ movq 112(%rsi),%r10 /* input word 14 */
+ movq %r9,%r8
+ shrq $9,%r8
+ andq %rax,%r8
+ movq %r8,104(%rdi) /* digit 29 */
+ shrdq $38,%r10,%r9
+ andq %rax,%r9
+ movq %r9,112(%rdi) /* digit 30 */
+ movq 120(%rsi),%r11 /* input word 15 (last) */
+ movq %r10,%r9
+ shrq $3,%r9
+ andq %rax,%r9
+ movq %r9,120(%rdi) /* digit 31 */
+ movq %r10,%r8
+ shrq $32,%r8
+ andq %rax,%r8
+ movq %r8,128(%rdi) /* digit 32 */
+ shrdq $61,%r11,%r10
+ andq %rax,%r10
+ movq %r10,136(%rdi) /* digit 33 */
+ xorq %r8,%r8 /* r8 = 0: stands in for the nonexistent word 16 and is the zero stored below */
+ movq %r11,%r10
+ shrq $26,%r10
+ andq %rax,%r10
+ movq %r10,144(%rdi) /* digit 34 */
+ shrdq $55,%r8,%r11 /* top 9 bits of word 15, zero-filled above */
+ andq %rax,%r11
+ movq %r11,152(%rdi) /* digit 35 */
+ movq %r8,160(%rdi) /* zero the four qwords after the last digit */
+ movq %r8,168(%rdi)
+ movq %r8,176(%rdi)
+ movq %r8,184(%rdi)
+ .byte 0xf3,0xc3 /* rep ret */
+.cfi_endproc
+.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
+.globl rsaz_1024_scatter5_avx2 /* Store 36 digits from %rsi into slot %edx of the table at %rdi: 16 bytes per 512-byte row, 9 rows. */
+.type rsaz_1024_scatter5_avx2,@function
+.align 32
rsaz_1024_scatter5_avx2:
+.cfi_startproc
+ vzeroupper
+ vmovdqu .Lscatter_permd(%rip),%ymm5 /* dword indices 0,2,4,6,...: selects the low half of each qword */
+ shll $4,%edx /* slot index -> byte offset (16 bytes per slot); 32-bit op zero-extends rdx */
+ leaq (%rdi,%rdx,1),%rdi /* rdi = address of this slot in the first row */
+ movl $9,%eax /* 9 iterations x 4 qwords = 36 digits */
+ jmp .Loop_scatter_1024 /* jump over the alignment padding */
+
+.align 32
+.Loop_scatter_1024:
+ vmovdqu (%rsi),%ymm0 /* load four qword-sized digits */
+ leaq 32(%rsi),%rsi
+ vpermd %ymm0,%ymm5,%ymm0 /* compact their low dwords into the low 128 bits */
+ vmovdqu %xmm0,(%rdi) /* write 16 bytes = four 32-bit digits */
+ leaq 512(%rdi),%rdi /* same slot, next 512-byte row */
+ decl %eax
+ jnz .Loop_scatter_1024
+
+ vzeroupper
+ .byte 0xf3,0xc3 /* rep ret */
+.cfi_endproc
+.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
+
+.globl rsaz_1024_gather5_avx2 /* Read slot %edx of the table at %rsi into %rdi; every slot of every row is loaded and masked, so the addresses touched do not depend on %edx. */
+.type rsaz_1024_gather5_avx2,@function
+.align 32
rsaz_1024_gather5_avx2:
-.byte 0x0f,0x0b
+.cfi_startproc
+ vzeroupper
+ movq %rsp,%r11 /* keep the entry rsp; restored before returning */
+.cfi_def_cfa_register %r11
+ leaq -256(%rsp),%rsp /* 256 bytes of scratch for eight 32-byte masks */
+ andq $-32,%rsp /* 32-byte align it for the vmovdqa stores below */
+ leaq .Linc(%rip),%r10
+ leaq -128(%rsp),%rax /* rax = rsp-128, so N+128(%rax) addresses rsp+N */
+
+ vmovd %edx,%xmm4
+ vmovdqa (%r10),%ymm0 /* slot numbers {0 x4, 1 x4} */
+ vmovdqa 32(%r10),%ymm1 /* slot numbers {2 x4, 3 x4} */
+ vmovdqa 64(%r10),%ymm5 /* increment: 4 in every dword */
+ vpbroadcastd %xmm4,%ymm4 /* ymm4 = requested slot index in every dword */
+
+ vpaddd %ymm5,%ymm0,%ymm2 /* from here on: derive the next pair of slot numbers, then turn the previous pair into an all-ones/all-zeros mask */
+ vpcmpeqd %ymm4,%ymm0,%ymm0
+ vpaddd %ymm5,%ymm1,%ymm3
+ vpcmpeqd %ymm4,%ymm1,%ymm1
+ vmovdqa %ymm0,0+128(%rax) /* masks for slots 0..15 go to the stack scratch */
+ vpaddd %ymm5,%ymm2,%ymm0
+ vpcmpeqd %ymm4,%ymm2,%ymm2
+ vmovdqa %ymm1,32+128(%rax)
+ vpaddd %ymm5,%ymm3,%ymm1
+ vpcmpeqd %ymm4,%ymm3,%ymm3
+ vmovdqa %ymm2,64+128(%rax)
+ vpaddd %ymm5,%ymm0,%ymm2
+ vpcmpeqd %ymm4,%ymm0,%ymm0
+ vmovdqa %ymm3,96+128(%rax)
+ vpaddd %ymm5,%ymm1,%ymm3
+ vpcmpeqd %ymm4,%ymm1,%ymm1
+ vmovdqa %ymm0,128+128(%rax)
+ vpaddd %ymm5,%ymm2,%ymm8
+ vpcmpeqd %ymm4,%ymm2,%ymm2
+ vmovdqa %ymm1,160+128(%rax)
+ vpaddd %ymm5,%ymm3,%ymm9
+ vpcmpeqd %ymm4,%ymm3,%ymm3
+ vmovdqa %ymm2,192+128(%rax)
+ vpaddd %ymm5,%ymm8,%ymm10
+ vpcmpeqd %ymm4,%ymm8,%ymm8 /* masks for slots 16..31 stay in ymm8..ymm15 */
+ vmovdqa %ymm3,224+128(%rax)
+ vpaddd %ymm5,%ymm9,%ymm11
+ vpcmpeqd %ymm4,%ymm9,%ymm9
+ vpaddd %ymm5,%ymm10,%ymm12
+ vpcmpeqd %ymm4,%ymm10,%ymm10
+ vpaddd %ymm5,%ymm11,%ymm13
+ vpcmpeqd %ymm4,%ymm11,%ymm11
+ vpaddd %ymm5,%ymm12,%ymm14
+ vpcmpeqd %ymm4,%ymm12,%ymm12
+ vpaddd %ymm5,%ymm13,%ymm15
+ vpcmpeqd %ymm4,%ymm13,%ymm13
+ vpcmpeqd %ymm4,%ymm14,%ymm14
+ vpcmpeqd %ymm4,%ymm15,%ymm15
+
+ vmovdqa -32(%r10),%ymm7 /* the 32 bytes just before .Linc, i.e. .Lgather_permd */
+ leaq 128(%rsi),%rsi /* bias table pointer by +128 */
+ movl $9,%edx /* edx is reused as the row counter: 9 rows of 512 bytes */
+
+.Loop_gather_1024:
+ vmovdqa 0-128(%rsi),%ymm0 /* aligned loads: the table must be 32-byte aligned */
+ vmovdqa 32-128(%rsi),%ymm1
+ vmovdqa 64-128(%rsi),%ymm2
+ vmovdqa 96-128(%rsi),%ymm3
+ vpand 0+128(%rax),%ymm0,%ymm0 /* keep only the lanes whose slot number matched */
+ vpand 32+128(%rax),%ymm1,%ymm1
+ vpand 64+128(%rax),%ymm2,%ymm2
+ vpor %ymm0,%ymm1,%ymm4 /* ymm4 and ymm5 are two independent OR accumulators */
+ vpand 96+128(%rax),%ymm3,%ymm3
+ vmovdqa 128-128(%rsi),%ymm0
+ vmovdqa 160-128(%rsi),%ymm1
+ vpor %ymm2,%ymm3,%ymm5
+ vmovdqa 192-128(%rsi),%ymm2
+ vmovdqa 224-128(%rsi),%ymm3
+ vpand 128+128(%rax),%ymm0,%ymm0
+ vpand 160+128(%rax),%ymm1,%ymm1
+ vpand 192+128(%rax),%ymm2,%ymm2
+ vpor %ymm0,%ymm4,%ymm4
+ vpand 224+128(%rax),%ymm3,%ymm3
+ vpand 256-128(%rsi),%ymm8,%ymm0 /* second half of the row uses the register-resident masks */
+ vpor %ymm1,%ymm5,%ymm5
+ vpand 288-128(%rsi),%ymm9,%ymm1
+ vpor %ymm2,%ymm4,%ymm4
+ vpand 320-128(%rsi),%ymm10,%ymm2
+ vpor %ymm3,%ymm5,%ymm5
+ vpand 352-128(%rsi),%ymm11,%ymm3
+ vpor %ymm0,%ymm4,%ymm4
+ vpand 384-128(%rsi),%ymm12,%ymm0
+ vpor %ymm1,%ymm5,%ymm5
+ vpand 416-128(%rsi),%ymm13,%ymm1
+ vpor %ymm2,%ymm4,%ymm4
+ vpand 448-128(%rsi),%ymm14,%ymm2
+ vpor %ymm3,%ymm5,%ymm5
+ vpand 480-128(%rsi),%ymm15,%ymm3
+ leaq 512(%rsi),%rsi /* next 512-byte row */
+ vpor %ymm0,%ymm4,%ymm4
+ vpor %ymm1,%ymm5,%ymm5
+ vpor %ymm2,%ymm4,%ymm4
+ vpor %ymm3,%ymm5,%ymm5
+
+ vpor %ymm5,%ymm4,%ymm4 /* merge the two accumulators */
+ vextracti128 $1,%ymm4,%xmm5
+ vpor %xmm4,%xmm5,%xmm5 /* fold high lane into low: xmm5 = the selected 16 bytes; 128-bit VEX op zeroes the upper lane */
+ vpermd %ymm5,%ymm7,%ymm5 /* spread the four dwords to qword positions; index 7 picks a zero dword */
+ vmovdqu %ymm5,(%rdi) /* four digits, one per qword */
+ leaq 32(%rdi),%rdi
+ decl %edx
+ jnz .Loop_gather_1024
+
+ vpxor %ymm0,%ymm0,%ymm0
+ vmovdqu %ymm0,(%rdi) /* append 32 zero bytes after the 36 digits */
+ vzeroupper
+ leaq (%r11),%rsp /* restore the entry stack pointer */
+.cfi_def_cfa_register %rsp
.byte 0xf3,0xc3
-.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
+.cfi_endproc
+.LSEH_end_rsaz_1024_gather5:
+.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
+
+.globl rsaz_avx2_eligible /* Returns 1 in eax when bit 5 of the capability dword is set and bits 8 and 19 are not both set; otherwise 0. */
+.type rsaz_avx2_eligible,@function
+.align 32
+rsaz_avx2_eligible:
+ movl OPENSSL_ia32cap_P+8(%rip),%eax /* third capability dword; per OPENSSL_ia32cap(3) this is CPUID.7:EBX */
+ movl $524544,%ecx /* 524544 = 0x80100 = bits 8 and 19 (BMI2 and ADX in that dword) */
+ movl $0,%edx
+ andl %eax,%ecx
+ cmpl $524544,%ecx /* both bits present? the 512-bit code in rsaz-x86_64.S tests the same mask to pick its mulx paths */
+ cmovel %edx,%eax /* if so, force the result to 0 */
+ andl $32,%eax /* isolate bit 5 (AVX2 in that dword) */
+ shrl $5,%eax /* normalise to 0 or 1 */
+ .byte 0xf3,0xc3 /* rep ret */
+.size rsaz_avx2_eligible,.-rsaz_avx2_eligible
+
+.align 64
+.Land_mask:
+.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff /* 29-bit mask in each of four qwords */
+.Lscatter_permd:
+.long 0,2,4,6,7,7,7,7 /* vpermd indices: gather even dwords into the low 128 bits (loaded by rsaz_1024_scatter5_avx2) */
+.Lgather_permd:
+.long 0,7,1,7,2,7,3,7 /* vpermd indices: dwords 0..3 to even positions, dword 7 elsewhere; read as -32 relative to .Linc, so it must stay directly before .Linc */
+.Linc:
+.long 0,0,0,0, 1,1,1,1 /* initial slot numbers per 128-bit lane (used by rsaz_1024_gather5_avx2) */
+.long 2,2,2,2, 3,3,3,3
+.long 4,4,4,4, 4,4,4,4 /* per-step increment */
+.align 64
diff --git a/secure/lib/libcrypto/amd64/rsaz-x86_64.S b/secure/lib/libcrypto/amd64/rsaz-x86_64.S
index ae64f7a73987..3ba29ea52dd9 100644
--- a/secure/lib/libcrypto/amd64/rsaz-x86_64.S
+++ b/secure/lib/libcrypto/amd64/rsaz-x86_64.S
@@ -35,6 +35,10 @@ rsaz_512_sqr:
movq (%rsi),%rdx
movq 8(%rsi),%rax
movq %rcx,128(%rsp)
+ movl $0x80100,%r11d
+ andl OPENSSL_ia32cap_P+8(%rip),%r11d
+ cmpl $0x80100,%r11d
+ je .Loop_sqrx
jmp .Loop_sqr
.align 32
@@ -405,6 +409,282 @@ rsaz_512_sqr:
decl %r8d
jnz .Loop_sqr
+ jmp .Lsqr_tail
+
+.align 32
+.Loop_sqrx:
+ movl %r8d,128+8(%rsp)
+.byte 102,72,15,110,199
+
+ mulxq %rax,%r8,%r9
+ movq %rax,%rbx
+
+ mulxq 16(%rsi),%rcx,%r10
+ xorq %rbp,%rbp
+
+ mulxq 24(%rsi),%rax,%r11
+ adcxq %rcx,%r9
+
+.byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00
+ adcxq %rax,%r10
+
+.byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00
+ adcxq %rcx,%r11
+
+ mulxq 48(%rsi),%rcx,%r14
+ adcxq %rax,%r12
+ adcxq %rcx,%r13
+
+ mulxq 56(%rsi),%rax,%r15
+ adcxq %rax,%r14
+ adcxq %rbp,%r15
+
+ mulxq %rdx,%rax,%rdi
+ movq %rbx,%rdx
+ xorq %rcx,%rcx
+ adoxq %r8,%r8
+ adcxq %rdi,%r8
+ adoxq %rbp,%rcx
+ adcxq %rbp,%rcx
+
+ movq %rax,(%rsp)
+ movq %r8,8(%rsp)
+
+
+.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00
+ adoxq %rax,%r10
+ adcxq %rbx,%r11
+
+ mulxq 24(%rsi),%rdi,%r8
+ adoxq %rdi,%r11
+.byte 0x66
+ adcxq %r8,%r12
+
+ mulxq 32(%rsi),%rax,%rbx
+ adoxq %rax,%r12
+ adcxq %rbx,%r13
+
+ mulxq 40(%rsi),%rdi,%r8
+ adoxq %rdi,%r13
+ adcxq %r8,%r14
+
+.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
+ adoxq %rax,%r14
+ adcxq %rbx,%r15
+
+.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00
+ adoxq %rdi,%r15
+ adcxq %rbp,%r8
+ mulxq %rdx,%rax,%rdi
+ adoxq %rbp,%r8
+.byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00
+
+ xorq %rbx,%rbx
+ adoxq %r9,%r9
+
+ adcxq %rcx,%rax
+ adoxq %r10,%r10
+ adcxq %rax,%r9
+ adoxq %rbp,%rbx
+ adcxq %rdi,%r10
+ adcxq %rbp,%rbx
+
+ movq %r9,16(%rsp)
+.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00
+
+
+ mulxq 24(%rsi),%rdi,%r9
+ adoxq %rdi,%r12
+ adcxq %r9,%r13
+
+ mulxq 32(%rsi),%rax,%rcx
+ adoxq %rax,%r13
+ adcxq %rcx,%r14
+
+.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00
+ adoxq %rdi,%r14
+ adcxq %r9,%r15
+
+.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00
+ adoxq %rax,%r15
+ adcxq %rcx,%r8
+
+ mulxq 56(%rsi),%rdi,%r9
+ adoxq %rdi,%r8
+ adcxq %rbp,%r9
+ mulxq %rdx,%rax,%rdi
+ adoxq %rbp,%r9
+ movq 24(%rsi),%rdx
+
+ xorq %rcx,%rcx
+ adoxq %r11,%r11
+
+ adcxq %rbx,%rax
+ adoxq %r12,%r12
+ adcxq %rax,%r11
+ adoxq %rbp,%rcx
+ adcxq %rdi,%r12
+ adcxq %rbp,%rcx
+
+ movq %r11,32(%rsp)
+ movq %r12,40(%rsp)
+
+
+ mulxq 32(%rsi),%rax,%rbx
+ adoxq %rax,%r14
+ adcxq %rbx,%r15
+
+ mulxq 40(%rsi),%rdi,%r10
+ adoxq %rdi,%r15
+ adcxq %r10,%r8
+
+ mulxq 48(%rsi),%rax,%rbx
+ adoxq %rax,%r8
+ adcxq %rbx,%r9
+
+ mulxq 56(%rsi),%rdi,%r10
+ adoxq %rdi,%r9
+ adcxq %rbp,%r10
+ mulxq %rdx,%rax,%rdi
+ adoxq %rbp,%r10
+ movq 32(%rsi),%rdx
+
+ xorq %rbx,%rbx
+ adoxq %r13,%r13
+
+ adcxq %rcx,%rax
+ adoxq %r14,%r14
+ adcxq %rax,%r13
+ adoxq %rbp,%rbx
+ adcxq %rdi,%r14
+ adcxq %rbp,%rbx
+
+ movq %r13,48(%rsp)
+ movq %r14,56(%rsp)
+
+
+ mulxq 40(%rsi),%rdi,%r11
+ adoxq %rdi,%r8
+ adcxq %r11,%r9
+
+ mulxq 48(%rsi),%rax,%rcx
+ adoxq %rax,%r9
+ adcxq %rcx,%r10
+
+ mulxq 56(%rsi),%rdi,%r11
+ adoxq %rdi,%r10
+ adcxq %rbp,%r11
+ mulxq %rdx,%rax,%rdi
+ movq 40(%rsi),%rdx
+ adoxq %rbp,%r11
+
+ xorq %rcx,%rcx
+ adoxq %r15,%r15
+
+ adcxq %rbx,%rax
+ adoxq %r8,%r8
+ adcxq %rax,%r15
+ adoxq %rbp,%rcx
+ adcxq %rdi,%r8
+ adcxq %rbp,%rcx
+
+ movq %r15,64(%rsp)
+ movq %r8,72(%rsp)
+
+
+.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
+ adoxq %rax,%r10
+ adcxq %rbx,%r11
+
+.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00
+ adoxq %rdi,%r11
+ adcxq %rbp,%r12
+ mulxq %rdx,%rax,%rdi
+ adoxq %rbp,%r12
+ movq 48(%rsi),%rdx
+
+ xorq %rbx,%rbx
+ adoxq %r9,%r9
+
+ adcxq %rcx,%rax
+ adoxq %r10,%r10
+ adcxq %rax,%r9
+ adcxq %rdi,%r10
+ adoxq %rbp,%rbx
+ adcxq %rbp,%rbx
+
+ movq %r9,80(%rsp)
+ movq %r10,88(%rsp)
+
+
+.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00
+ adoxq %rax,%r12
+ adoxq %rbp,%r13
+
+ mulxq %rdx,%rax,%rdi
+ xorq %rcx,%rcx
+ movq 56(%rsi),%rdx
+ adoxq %r11,%r11
+
+ adcxq %rbx,%rax
+ adoxq %r12,%r12
+ adcxq %rax,%r11
+ adoxq %rbp,%rcx
+ adcxq %rdi,%r12
+ adcxq %rbp,%rcx
+
+.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00
+.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00
+
+
+ mulxq %rdx,%rax,%rdx
+ xorq %rbx,%rbx
+ adoxq %r13,%r13
+
+ adcxq %rcx,%rax
+ adoxq %rbp,%rbx
+ adcxq %r13,%rax
+ adcxq %rdx,%rbx
+
+.byte 102,72,15,126,199
+.byte 102,72,15,126,205
+
+ movq 128(%rsp),%rdx
+ movq (%rsp),%r8
+ movq 8(%rsp),%r9
+ movq 16(%rsp),%r10
+ movq 24(%rsp),%r11
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%r13
+ movq 48(%rsp),%r14
+ movq 56(%rsp),%r15
+
+ movq %rax,112(%rsp)
+ movq %rbx,120(%rsp)
+
+ call __rsaz_512_reducex
+
+ addq 64(%rsp),%r8
+ adcq 72(%rsp),%r9
+ adcq 80(%rsp),%r10
+ adcq 88(%rsp),%r11
+ adcq 96(%rsp),%r12
+ adcq 104(%rsp),%r13
+ adcq 112(%rsp),%r14
+ adcq 120(%rsp),%r15
+ sbbq %rcx,%rcx
+
+ call __rsaz_512_subtract
+
+ movq %r8,%rdx
+ movq %r9,%rax
+ movl 128+8(%rsp),%r8d
+ movq %rdi,%rsi
+
+ decl %r8d
+ jnz .Loop_sqrx
+
+.Lsqr_tail:
leaq 128+24+48(%rsp),%rax
.cfi_def_cfa %rax,8
@@ -456,6 +736,10 @@ rsaz_512_mul:
.byte 102,72,15,110,199
.byte 102,72,15,110,201
movq %r8,128(%rsp)
+ movl $0x80100,%r11d
+ andl OPENSSL_ia32cap_P+8(%rip),%r11d
+ cmpl $0x80100,%r11d
+ je .Lmulx
movq (%rdx),%rbx
movq %rdx,%rbp
call __rsaz_512_mul
@@ -473,6 +757,29 @@ rsaz_512_mul:
movq 56(%rsp),%r15
call __rsaz_512_reduce
+ jmp .Lmul_tail
+
+.align 32
+.Lmulx:
+ movq %rdx,%rbp
+ movq (%rdx),%rdx
+ call __rsaz_512_mulx
+
+.byte 102,72,15,126,199
+.byte 102,72,15,126,205
+
+ movq 128(%rsp),%rdx
+ movq (%rsp),%r8
+ movq 8(%rsp),%r9
+ movq 16(%rsp),%r10
+ movq 24(%rsp),%r11
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%r13
+ movq 48(%rsp),%r14
+ movq 56(%rsp),%r15
+
+ call __rsaz_512_reducex
+.Lmul_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@@ -586,6 +893,10 @@ rsaz_512_mul_gather4:
por %xmm9,%xmm8
pshufd $0x4e,%xmm8,%xmm9
por %xmm9,%xmm8
+ movl $0x80100,%r11d
+ andl OPENSSL_ia32cap_P+8(%rip),%r11d
+ cmpl $0x80100,%r11d
+ je .Lmulx_gather
.byte 102,76,15,126,195
movq %r8,128(%rsp)
@@ -766,6 +1077,142 @@ rsaz_512_mul_gather4:
movq 56(%rsp),%r15
call __rsaz_512_reduce
+ jmp .Lmul_gather_tail
+
+.align 32
+.Lmulx_gather:
+.byte 102,76,15,126,194
+
+ movq %r8,128(%rsp)
+ movq %rdi,128+8(%rsp)
+ movq %rcx,128+16(%rsp)
+
+ mulxq (%rsi),%rbx,%r8
+ movq %rbx,(%rsp)
+ xorl %edi,%edi
+
+ mulxq 8(%rsi),%rax,%r9
+
+ mulxq 16(%rsi),%rbx,%r10
+ adcxq %rax,%r8
+
+ mulxq 24(%rsi),%rax,%r11
+ adcxq %rbx,%r9
+
+ mulxq 32(%rsi),%rbx,%r12
+ adcxq %rax,%r10
+
+ mulxq 40(%rsi),%rax,%r13
+ adcxq %rbx,%r11
+
+ mulxq 48(%rsi),%rbx,%r14
+ adcxq %rax,%r12
+
+ mulxq 56(%rsi),%rax,%r15
+ adcxq %rbx,%r13
+ adcxq %rax,%r14
+.byte 0x67
+ movq %r8,%rbx
+ adcxq %rdi,%r15
+
+ movq $-7,%rcx
+ jmp .Loop_mulx_gather
+
+.align 32
+.Loop_mulx_gather:
+ movdqa 0(%rbp),%xmm8
+ movdqa 16(%rbp),%xmm9
+ movdqa 32(%rbp),%xmm10
+ movdqa 48(%rbp),%xmm11
+ pand %xmm0,%xmm8
+ movdqa 64(%rbp),%xmm12
+ pand %xmm1,%xmm9
+ movdqa 80(%rbp),%xmm13
+ pand %xmm2,%xmm10
+ movdqa 96(%rbp),%xmm14
+ pand %xmm3,%xmm11
+ movdqa 112(%rbp),%xmm15
+ leaq 128(%rbp),%rbp
+ pand %xmm4,%xmm12
+ pand %xmm5,%xmm13
+ pand %xmm6,%xmm14
+ pand %xmm7,%xmm15
+ por %xmm10,%xmm8
+ por %xmm11,%xmm9
+ por %xmm12,%xmm8
+ por %xmm13,%xmm9
+ por %xmm14,%xmm8
+ por %xmm15,%xmm9
+
+ por %xmm9,%xmm8
+ pshufd $0x4e,%xmm8,%xmm9
+ por %xmm9,%xmm8
+.byte 102,76,15,126,194
+
+.byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00
+ adcxq %rax,%rbx
+ adoxq %r9,%r8
+
+ mulxq 8(%rsi),%rax,%r9
+ adcxq %rax,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rsi),%rax,%r10
+ adcxq %rax,%r9
+ adoxq %r11,%r10
+
+.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+
+ mulxq 32(%rsi),%rax,%r12
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+
+ mulxq 40(%rsi),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
+ adcxq %rax,%r13
+.byte 0x67
+ adoxq %r15,%r14
+
+ mulxq 56(%rsi),%rax,%r15
+ movq %rbx,64(%rsp,%rcx,8)
+ adcxq %rax,%r14
+ adoxq %rdi,%r15
+ movq %r8,%rbx
+ adcxq %rdi,%r15
+
+ incq %rcx
+ jnz .Loop_mulx_gather
+
+ movq %r8,64(%rsp)
+ movq %r9,64+8(%rsp)
+ movq %r10,64+16(%rsp)
+ movq %r11,64+24(%rsp)
+ movq %r12,64+32(%rsp)
+ movq %r13,64+40(%rsp)
+ movq %r14,64+48(%rsp)
+ movq %r15,64+56(%rsp)
+
+ movq 128(%rsp),%rdx
+ movq 128+8(%rsp),%rdi
+ movq 128+16(%rsp),%rbp
+
+ movq (%rsp),%r8
+ movq 8(%rsp),%r9
+ movq 16(%rsp),%r10
+ movq 24(%rsp),%r11
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%r13
+ movq 48(%rsp),%r14
+ movq 56(%rsp),%r15
+
+ call __rsaz_512_reducex
+
+.Lmul_gather_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@@ -833,6 +1280,10 @@ rsaz_512_mul_scatter4:
movq %rcx,128(%rsp)
movq %rdi,%rbp
+ movl $0x80100,%r11d
+ andl OPENSSL_ia32cap_P+8(%rip),%r11d
+ cmpl $0x80100,%r11d
+ je .Lmulx_scatter
movq (%rdi),%rbx
call __rsaz_512_mul
@@ -849,6 +1300,29 @@ rsaz_512_mul_scatter4:
movq 56(%rsp),%r15
call __rsaz_512_reduce
+ jmp .Lmul_scatter_tail
+
+.align 32
+.Lmulx_scatter:
+ movq (%rdi),%rdx
+ call __rsaz_512_mulx
+
+.byte 102,72,15,126,199
+.byte 102,72,15,126,205
+
+ movq 128(%rsp),%rdx
+ movq (%rsp),%r8
+ movq 8(%rsp),%r9
+ movq 16(%rsp),%r10
+ movq 24(%rsp),%r11
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%r13
+ movq 48(%rsp),%r14
+ movq 56(%rsp),%r15
+
+ call __rsaz_512_reducex
+
+.Lmul_scatter_tail:
addq 64(%rsp),%r8
adcq 72(%rsp),%r9
adcq 80(%rsp),%r10
@@ -918,6 +1392,7 @@ rsaz_512_mul_by_one:
subq $128+24,%rsp
.cfi_adjust_cfa_offset 128+24
.Lmul_by_one_body:
+ movl OPENSSL_ia32cap_P+8(%rip),%eax
movq %rdx,%rbp
movq %rcx,128(%rsp)
@@ -938,7 +1413,16 @@ rsaz_512_mul_by_one:
movdqa %xmm0,64(%rsp)
movdqa %xmm0,80(%rsp)
movdqa %xmm0,96(%rsp)
+ andl $0x80100,%eax
+ cmpl $0x80100,%eax
+ je .Lby_one_callx
call __rsaz_512_reduce
+ jmp .Lby_one_tail
+.align 32
+.Lby_one_callx:
+ movq 128(%rsp),%rdx
+ call __rsaz_512_reducex
+.Lby_one_tail:
movq %r8,(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
@@ -1053,6 +1537,64 @@ __rsaz_512_reduce:
.byte 0xf3,0xc3
.cfi_endproc
.size __rsaz_512_reduce,.-__rsaz_512_reduce
+.type __rsaz_512_reducex,@function /* mulx/adcx/adox reduction over r8..r15: 8 rounds, each adding (multiplier x the 8 qwords at %rbp) and shifting down one word. */
+.align 32
+__rsaz_512_reducex:
+.cfi_startproc
+
+ imulq %r8,%rdx /* rdx = low word x the constant callers pass in rdx (they load it from their 128(%rsp); presumably n0 = -1/m mod 2^64, confirm against the C caller) */
+ xorq %rsi,%rsi /* rsi = 0 and CF = OF = 0 for the adcx/adox chains */
+ movl $8,%ecx /* one round per 64-bit word */
+ jmp .Lreduction_loopx
+
+.align 32
+.Lreduction_loopx:
+ movq %r8,%rbx
+ mulxq 0(%rbp),%rax,%r8
+ adcxq %rbx,%rax /* low word sum is discarded; only its carry (CF) is kept */
+ adoxq %r9,%r8
+
+ mulxq 8(%rbp),%rax,%r9
+ adcxq %rax,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rbp),%rbx,%r10
+ adcxq %rbx,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rbp),%rbx,%r11
+ adcxq %rbx,%r10
+ adoxq %r12,%r11
+
+.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 /* mulx 32(%rbp),%rbx,%r12 */
+ movq %rdx,%rax /* park the current multiplier */
+ movq %r8,%rdx /* r8 is already the next low word */
+ adcxq %rbx,%r11
+ adoxq %r13,%r12
+
+ mulxq 128+8(%rsp),%rbx,%rdx /* rbx = next multiplier = new low word x the caller's 128(%rsp) slot (+8 for the return address); mulx leaves flags untouched */
+ movq %rax,%rdx /* back to the current multiplier */
+
+ mulxq 40(%rbp),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 /* mulx 48(%rbp),%rax,%r14 */
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+ mulxq 56(%rbp),%rax,%r15
+ movq %rbx,%rdx /* install the multiplier for the next round */
+ adcxq %rax,%r14
+ adoxq %rsi,%r15 /* rsi is zero: these two only absorb the final OF and CF */
+ adcxq %rsi,%r15
+
+ decl %ecx
+ jne .Lreduction_loopx
+
+ .byte 0xf3,0xc3 /* rep ret */
+.cfi_endproc
+.size __rsaz_512_reducex,.-__rsaz_512_reducex
.type __rsaz_512_subtract,@function
.align 32
__rsaz_512_subtract:
@@ -1256,6 +1798,128 @@ __rsaz_512_mul:
.byte 0xf3,0xc3
.cfi_endproc
.size __rsaz_512_mul,.-__rsaz_512_mul
+.type __rsaz_512_mulx,@function /* 8x8-word schoolbook multiply with mulx: a at %rsi, b at %rbp, b[0] preloaded in %rdx; 16-word product stored at 8(%rsp).., i.e. the caller's (%rsp).. */
+.align 32
+__rsaz_512_mulx:
+.cfi_startproc
+ mulxq (%rsi),%rbx,%r8 /* first row: a x b[0] */
+ movq $-6,%rcx /* loop counter, runs -6..-1 */
+
+ mulxq 8(%rsi),%rax,%r9
+ movq %rbx,8(%rsp) /* product word 0 (offset 8 skips the return address) */
+
+ mulxq 16(%rsi),%rbx,%r10
+ adcq %rax,%r8 /* consumes CF from entry; the visible callers arrive via je after cmpl, which leaves CF = 0 */
+
+ mulxq 24(%rsi),%rax,%r11
+ adcq %rbx,%r9
+
+ mulxq 32(%rsi),%rbx,%r12
+ adcq %rax,%r10
+
+ mulxq 40(%rsi),%rax,%r13
+ adcq %rbx,%r11
+
+ mulxq 48(%rsi),%rbx,%r14
+ adcq %rax,%r12
+
+ mulxq 56(%rsi),%rax,%r15
+ movq 8(%rbp),%rdx /* next multiplier: b[1] */
+ adcq %rbx,%r13
+ adcq %rax,%r14
+ adcq $0,%r15
+
+ xorq %rdi,%rdi /* rdi = 0 and CF = OF = 0 for the adcx/adox chains */
+ jmp .Loop_mulx
+
+.align 32
+.Loop_mulx:
+ movq %r8,%rbx /* rbx collects the word that is finished in this row */
+ mulxq (%rsi),%rax,%r8
+ adcxq %rax,%rbx
+ adoxq %r9,%r8
+
+ mulxq 8(%rsi),%rax,%r9
+ adcxq %rax,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rsi),%rax,%r10
+ adcxq %rax,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rsi),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+
+.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 /* mulx 32(%rsi),%rax,%r12 with a leading 0x3e (DS) prefix byte */
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+
+ mulxq 40(%rsi),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+ mulxq 48(%rsi),%rax,%r14
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+ mulxq 56(%rsi),%rax,%r15
+ movq 64(%rbp,%rcx,8),%rdx /* multiplier for the next row: b[2]..b[7] as rcx goes -6..-1 */
+ movq %rbx,8+64-8(%rsp,%rcx,8) /* store the finished word: product words 1..6 */
+ adcxq %rax,%r14
+ adoxq %rdi,%r15 /* rdi is zero: absorb the final OF and CF */
+ adcxq %rdi,%r15
+
+ incq %rcx
+ jnz .Loop_mulx
+
+ movq %r8,%rbx /* last row (b[7]), unrolled outside the loop */
+ mulxq (%rsi),%rax,%r8
+ adcxq %rax,%rbx
+ adoxq %r9,%r8
+
+.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 /* mulx 8(%rsi),%rax,%r9 */
+ adcxq %rax,%r8
+ adoxq %r10,%r9
+
+.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 /* mulx 16(%rsi),%rax,%r10 */
+ adcxq %rax,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rsi),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+
+ mulxq 32(%rsi),%rax,%r12
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+
+ mulxq 40(%rsi),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 /* mulx 48(%rsi),%rax,%r14 */
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 /* mulx 56(%rsi),%rax,%r15 */
+ adcxq %rax,%r14
+ adoxq %rdi,%r15
+ adcxq %rdi,%r15
+
+ movq %rbx,8+64-8(%rsp) /* product word 7 */
+ movq %r8,8+64(%rsp) /* product words 8..15 */
+ movq %r9,8+64+8(%rsp)
+ movq %r10,8+64+16(%rsp)
+ movq %r11,8+64+24(%rsp)
+ movq %r12,8+64+32(%rsp)
+ movq %r13,8+64+40(%rsp)
+ movq %r14,8+64+48(%rsp)
+ movq %r15,8+64+56(%rsp)
+
+ .byte 0xf3,0xc3 /* rep ret */
+.cfi_endproc
+.size __rsaz_512_mulx,.-__rsaz_512_mulx
.globl rsaz_512_scatter4
.type rsaz_512_scatter4,@function
.align 16
diff --git a/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S b/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S
index 488e554c247e..0090e020c573 100644
--- a/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S
+++ b/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S
@@ -12,6 +12,8 @@ sha1_multi_block:
movq OPENSSL_ia32cap_P+4(%rip),%rcx
btq $61,%rcx
jc _shaext_shortcut
+ testl $268435456,%ecx
+ jnz _avx_shortcut
movq %rsp,%rax
.cfi_def_cfa_register %rax
pushq %rbx
@@ -2937,6 +2939,4319 @@ _shaext_shortcut:
.byte 0xf3,0xc3
.cfi_endproc
.size sha1_multi_block_shaext,.-sha1_multi_block_shaext
+.type sha1_multi_block_avx,@function
+.align 32
+sha1_multi_block_avx:
+.cfi_startproc
+_avx_shortcut:
+ shrq $32,%rcx
+ cmpl $2,%edx
+ jb .Lavx
+ testl $32,%ecx
+ jnz _avx2_shortcut
+ jmp .Lavx
+.align 32
+.Lavx:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ subq $288,%rsp
+ andq $-256,%rsp
+ movq %rax,272(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08
+.Lbody_avx:
+ leaq K_XX_XX(%rip),%rbp
+ leaq 256(%rsp),%rbx
+
+ vzeroupper
+.Loop_grande_avx:
+ movl %edx,280(%rsp)
+ xorl %edx,%edx
+ movq 0(%rsi),%r8
+ movl 8(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,0(%rbx)
+ cmovleq %rbp,%r8
+ movq 16(%rsi),%r9
+ movl 24(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,4(%rbx)
+ cmovleq %rbp,%r9
+ movq 32(%rsi),%r10
+ movl 40(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,8(%rbx)
+ cmovleq %rbp,%r10
+ movq 48(%rsi),%r11
+ movl 56(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,12(%rbx)
+ cmovleq %rbp,%r11
+ testl %edx,%edx
+ jz .Ldone_avx
+
+ vmovdqu 0(%rdi),%xmm10
+ leaq 128(%rsp),%rax
+ vmovdqu 32(%rdi),%xmm11
+ vmovdqu 64(%rdi),%xmm12
+ vmovdqu 96(%rdi),%xmm13
+ vmovdqu 128(%rdi),%xmm14
+ vmovdqu 96(%rbp),%xmm5
+ jmp .Loop_avx
+
+.align 32
+.Loop_avx:
+ vmovdqa -32(%rbp),%xmm15
+ vmovd (%r8),%xmm0
+ leaq 64(%r8),%r8
+ vmovd (%r9),%xmm2
+ leaq 64(%r9),%r9
+ vpinsrd $1,(%r10),%xmm0,%xmm0
+ leaq 64(%r10),%r10
+ vpinsrd $1,(%r11),%xmm2,%xmm2
+ leaq 64(%r11),%r11
+ vmovd -60(%r8),%xmm1
+ vpunpckldq %xmm2,%xmm0,%xmm0
+ vmovd -60(%r9),%xmm9
+ vpshufb %xmm5,%xmm0,%xmm0
+ vpinsrd $1,-60(%r10),%xmm1,%xmm1
+ vpinsrd $1,-60(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpandn %xmm13,%xmm11,%xmm7
+ vpand %xmm12,%xmm11,%xmm6
+
+ vmovdqa %xmm0,0-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpunpckldq %xmm9,%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -56(%r8),%xmm2
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -56(%r9),%xmm9
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpshufb %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpinsrd $1,-56(%r10),%xmm2,%xmm2
+ vpinsrd $1,-56(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpandn %xmm12,%xmm10,%xmm7
+ vpand %xmm11,%xmm10,%xmm6
+
+ vmovdqa %xmm1,16-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpunpckldq %xmm9,%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -52(%r8),%xmm3
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -52(%r9),%xmm9
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpshufb %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpinsrd $1,-52(%r10),%xmm3,%xmm3
+ vpinsrd $1,-52(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpandn %xmm11,%xmm14,%xmm7
+ vpand %xmm10,%xmm14,%xmm6
+
+ vmovdqa %xmm2,32-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpunpckldq %xmm9,%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -48(%r8),%xmm4
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -48(%r9),%xmm9
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpshufb %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpinsrd $1,-48(%r10),%xmm4,%xmm4
+ vpinsrd $1,-48(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpandn %xmm10,%xmm13,%xmm7
+ vpand %xmm14,%xmm13,%xmm6
+
+ vmovdqa %xmm3,48-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpunpckldq %xmm9,%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -44(%r8),%xmm0
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -44(%r9),%xmm9
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpshufb %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpinsrd $1,-44(%r10),%xmm0,%xmm0
+ vpinsrd $1,-44(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpandn %xmm14,%xmm12,%xmm7
+ vpand %xmm13,%xmm12,%xmm6
+
+ vmovdqa %xmm4,64-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpunpckldq %xmm9,%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -40(%r8),%xmm1
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -40(%r9),%xmm9
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpshufb %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpinsrd $1,-40(%r10),%xmm1,%xmm1
+ vpinsrd $1,-40(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpandn %xmm13,%xmm11,%xmm7
+ vpand %xmm12,%xmm11,%xmm6
+
+ vmovdqa %xmm0,80-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpunpckldq %xmm9,%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -36(%r8),%xmm2
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -36(%r9),%xmm9
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpshufb %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpinsrd $1,-36(%r10),%xmm2,%xmm2
+ vpinsrd $1,-36(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpandn %xmm12,%xmm10,%xmm7
+ vpand %xmm11,%xmm10,%xmm6
+
+ vmovdqa %xmm1,96-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpunpckldq %xmm9,%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -32(%r8),%xmm3
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -32(%r9),%xmm9
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpshufb %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpinsrd $1,-32(%r10),%xmm3,%xmm3
+ vpinsrd $1,-32(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpandn %xmm11,%xmm14,%xmm7
+ vpand %xmm10,%xmm14,%xmm6
+
+ vmovdqa %xmm2,112-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpunpckldq %xmm9,%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -28(%r8),%xmm4
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -28(%r9),%xmm9
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpshufb %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpinsrd $1,-28(%r10),%xmm4,%xmm4
+ vpinsrd $1,-28(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpandn %xmm10,%xmm13,%xmm7
+ vpand %xmm14,%xmm13,%xmm6
+
+ vmovdqa %xmm3,128-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpunpckldq %xmm9,%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -24(%r8),%xmm0
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -24(%r9),%xmm9
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpshufb %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpinsrd $1,-24(%r10),%xmm0,%xmm0
+ vpinsrd $1,-24(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpandn %xmm14,%xmm12,%xmm7
+ vpand %xmm13,%xmm12,%xmm6
+
+ vmovdqa %xmm4,144-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpunpckldq %xmm9,%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -20(%r8),%xmm1
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -20(%r9),%xmm9
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpshufb %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpinsrd $1,-20(%r10),%xmm1,%xmm1
+ vpinsrd $1,-20(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpandn %xmm13,%xmm11,%xmm7
+ vpand %xmm12,%xmm11,%xmm6
+
+ vmovdqa %xmm0,160-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpunpckldq %xmm9,%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -16(%r8),%xmm2
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -16(%r9),%xmm9
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpshufb %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpinsrd $1,-16(%r10),%xmm2,%xmm2
+ vpinsrd $1,-16(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpandn %xmm12,%xmm10,%xmm7
+ vpand %xmm11,%xmm10,%xmm6
+
+ vmovdqa %xmm1,176-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpunpckldq %xmm9,%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -12(%r8),%xmm3
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -12(%r9),%xmm9
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpshufb %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpinsrd $1,-12(%r10),%xmm3,%xmm3
+ vpinsrd $1,-12(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpandn %xmm11,%xmm14,%xmm7
+ vpand %xmm10,%xmm14,%xmm6
+
+ vmovdqa %xmm2,192-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpunpckldq %xmm9,%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -8(%r8),%xmm4
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -8(%r9),%xmm9
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpshufb %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpinsrd $1,-8(%r10),%xmm4,%xmm4
+ vpinsrd $1,-8(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpandn %xmm10,%xmm13,%xmm7
+ vpand %xmm14,%xmm13,%xmm6
+
+ vmovdqa %xmm3,208-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpunpckldq %xmm9,%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovd -4(%r8),%xmm0
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vmovd -4(%r9),%xmm9
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpshufb %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vmovdqa 0-128(%rax),%xmm1
+ vpinsrd $1,-4(%r10),%xmm0,%xmm0
+ vpinsrd $1,-4(%r11),%xmm9,%xmm9
+ vpaddd %xmm15,%xmm10,%xmm10
+ prefetcht0 63(%r8)
+ vpslld $5,%xmm11,%xmm8
+ vpandn %xmm14,%xmm12,%xmm7
+ vpand %xmm13,%xmm12,%xmm6
+
+ vmovdqa %xmm4,224-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpunpckldq %xmm9,%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ prefetcht0 63(%r9)
+ vpxor %xmm7,%xmm6,%xmm6
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ prefetcht0 63(%r10)
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ prefetcht0 63(%r11)
+ vpshufb %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vmovdqa 16-128(%rax),%xmm2
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 32-128(%rax),%xmm3
+
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpandn %xmm13,%xmm11,%xmm7
+
+ vpand %xmm12,%xmm11,%xmm6
+
+ vmovdqa %xmm0,240-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 128-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 48-128(%rax),%xmm4
+
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpandn %xmm12,%xmm10,%xmm7
+
+ vpand %xmm11,%xmm10,%xmm6
+
+ vmovdqa %xmm1,0-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 144-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 64-128(%rax),%xmm0
+
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpandn %xmm11,%xmm14,%xmm7
+
+ vpand %xmm10,%xmm14,%xmm6
+
+ vmovdqa %xmm2,16-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 160-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 80-128(%rax),%xmm1
+
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpandn %xmm10,%xmm13,%xmm7
+
+ vpand %xmm14,%xmm13,%xmm6
+
+ vmovdqa %xmm3,32-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 176-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 96-128(%rax),%xmm2
+
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpandn %xmm14,%xmm12,%xmm7
+
+ vpand %xmm13,%xmm12,%xmm6
+
+ vmovdqa %xmm4,48-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 192-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm7,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vmovdqa 0(%rbp),%xmm15
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 112-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,64-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 208-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 128-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,80-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 224-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 144-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vmovdqa %xmm2,96-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 240-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 160-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vmovdqa %xmm3,112-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 0-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 176-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vmovdqa %xmm4,128-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 16-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 192-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,144-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 32-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 208-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,160-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 48-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 224-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vmovdqa %xmm2,176-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 64-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 240-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vmovdqa %xmm3,192-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 80-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 0-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vmovdqa %xmm4,208-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 96-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 16-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,224-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 112-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 32-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,240-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 128-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 48-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vmovdqa %xmm2,0-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 144-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 64-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vmovdqa %xmm3,16-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 160-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 80-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vmovdqa %xmm4,32-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 176-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 96-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,48-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 192-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 112-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,64-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 208-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 128-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vmovdqa %xmm2,80-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 224-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 144-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vmovdqa %xmm3,96-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 240-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 160-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vmovdqa %xmm4,112-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 0-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vmovdqa 32(%rbp),%xmm15
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 176-128(%rax),%xmm3
+
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpand %xmm12,%xmm13,%xmm7
+ vpxor 16-128(%rax),%xmm1,%xmm1
+
+ vpaddd %xmm7,%xmm14,%xmm14
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm13,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vmovdqu %xmm0,128-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm1,%xmm5
+ vpand %xmm11,%xmm6,%xmm6
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 192-128(%rax),%xmm4
+
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpand %xmm11,%xmm12,%xmm7
+ vpxor 32-128(%rax),%xmm2,%xmm2
+
+ vpaddd %xmm7,%xmm13,%xmm13
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm12,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vmovdqu %xmm1,144-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm2,%xmm5
+ vpand %xmm10,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 208-128(%rax),%xmm0
+
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpand %xmm10,%xmm11,%xmm7
+ vpxor 48-128(%rax),%xmm3,%xmm3
+
+ vpaddd %xmm7,%xmm12,%xmm12
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm11,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vmovdqu %xmm2,160-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm3,%xmm5
+ vpand %xmm14,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 224-128(%rax),%xmm1
+
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpand %xmm14,%xmm10,%xmm7
+ vpxor 64-128(%rax),%xmm4,%xmm4
+
+ vpaddd %xmm7,%xmm11,%xmm11
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm10,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vmovdqu %xmm3,176-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm4,%xmm5
+ vpand %xmm13,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 240-128(%rax),%xmm2
+
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpand %xmm13,%xmm14,%xmm7
+ vpxor 80-128(%rax),%xmm0,%xmm0
+
+ vpaddd %xmm7,%xmm10,%xmm10
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm14,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vmovdqu %xmm4,192-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm0,%xmm5
+ vpand %xmm12,%xmm6,%xmm6
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 0-128(%rax),%xmm3
+
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpand %xmm12,%xmm13,%xmm7
+ vpxor 96-128(%rax),%xmm1,%xmm1
+
+ vpaddd %xmm7,%xmm14,%xmm14
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm13,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vmovdqu %xmm0,208-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm1,%xmm5
+ vpand %xmm11,%xmm6,%xmm6
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 16-128(%rax),%xmm4
+
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpand %xmm11,%xmm12,%xmm7
+ vpxor 112-128(%rax),%xmm2,%xmm2
+
+ vpaddd %xmm7,%xmm13,%xmm13
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm12,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vmovdqu %xmm1,224-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm2,%xmm5
+ vpand %xmm10,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 32-128(%rax),%xmm0
+
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpand %xmm10,%xmm11,%xmm7
+ vpxor 128-128(%rax),%xmm3,%xmm3
+
+ vpaddd %xmm7,%xmm12,%xmm12
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm11,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vmovdqu %xmm2,240-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm3,%xmm5
+ vpand %xmm14,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 48-128(%rax),%xmm1
+
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpand %xmm14,%xmm10,%xmm7
+ vpxor 144-128(%rax),%xmm4,%xmm4
+
+ vpaddd %xmm7,%xmm11,%xmm11
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm10,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vmovdqu %xmm3,0-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm4,%xmm5
+ vpand %xmm13,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 64-128(%rax),%xmm2
+
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpand %xmm13,%xmm14,%xmm7
+ vpxor 160-128(%rax),%xmm0,%xmm0
+
+ vpaddd %xmm7,%xmm10,%xmm10
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm14,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vmovdqu %xmm4,16-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm0,%xmm5
+ vpand %xmm12,%xmm6,%xmm6
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 80-128(%rax),%xmm3
+
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpand %xmm12,%xmm13,%xmm7
+ vpxor 176-128(%rax),%xmm1,%xmm1
+
+ vpaddd %xmm7,%xmm14,%xmm14
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm13,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vmovdqu %xmm0,32-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm1,%xmm5
+ vpand %xmm11,%xmm6,%xmm6
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 96-128(%rax),%xmm4
+
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpand %xmm11,%xmm12,%xmm7
+ vpxor 192-128(%rax),%xmm2,%xmm2
+
+ vpaddd %xmm7,%xmm13,%xmm13
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm12,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vmovdqu %xmm1,48-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm2,%xmm5
+ vpand %xmm10,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 112-128(%rax),%xmm0
+
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpand %xmm10,%xmm11,%xmm7
+ vpxor 208-128(%rax),%xmm3,%xmm3
+
+ vpaddd %xmm7,%xmm12,%xmm12
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm11,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vmovdqu %xmm2,64-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm3,%xmm5
+ vpand %xmm14,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 128-128(%rax),%xmm1
+
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpand %xmm14,%xmm10,%xmm7
+ vpxor 224-128(%rax),%xmm4,%xmm4
+
+ vpaddd %xmm7,%xmm11,%xmm11
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm10,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vmovdqu %xmm3,80-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm4,%xmm5
+ vpand %xmm13,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 144-128(%rax),%xmm2
+
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpand %xmm13,%xmm14,%xmm7
+ vpxor 240-128(%rax),%xmm0,%xmm0
+
+ vpaddd %xmm7,%xmm10,%xmm10
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm14,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vmovdqu %xmm4,96-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm0,%xmm5
+ vpand %xmm12,%xmm6,%xmm6
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 160-128(%rax),%xmm3
+
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpslld $5,%xmm10,%xmm8
+ vpand %xmm12,%xmm13,%xmm7
+ vpxor 0-128(%rax),%xmm1,%xmm1
+
+ vpaddd %xmm7,%xmm14,%xmm14
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm13,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vmovdqu %xmm0,112-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm1,%xmm5
+ vpand %xmm11,%xmm6,%xmm6
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 176-128(%rax),%xmm4
+
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpslld $5,%xmm14,%xmm8
+ vpand %xmm11,%xmm12,%xmm7
+ vpxor 16-128(%rax),%xmm2,%xmm2
+
+ vpaddd %xmm7,%xmm13,%xmm13
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm12,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vmovdqu %xmm1,128-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm2,%xmm5
+ vpand %xmm10,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpaddd %xmm6,%xmm13,%xmm13
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 192-128(%rax),%xmm0
+
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpslld $5,%xmm13,%xmm8
+ vpand %xmm10,%xmm11,%xmm7
+ vpxor 32-128(%rax),%xmm3,%xmm3
+
+ vpaddd %xmm7,%xmm12,%xmm12
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm11,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vmovdqu %xmm2,144-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm3,%xmm5
+ vpand %xmm14,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 208-128(%rax),%xmm1
+
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpslld $5,%xmm12,%xmm8
+ vpand %xmm14,%xmm10,%xmm7
+ vpxor 48-128(%rax),%xmm4,%xmm4
+
+ vpaddd %xmm7,%xmm11,%xmm11
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm10,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vmovdqu %xmm3,160-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm4,%xmm5
+ vpand %xmm13,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpaddd %xmm6,%xmm11,%xmm11
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 224-128(%rax),%xmm2
+
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpslld $5,%xmm11,%xmm8
+ vpand %xmm13,%xmm14,%xmm7
+ vpxor 64-128(%rax),%xmm0,%xmm0
+
+ vpaddd %xmm7,%xmm10,%xmm10
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm14,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vmovdqu %xmm4,176-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpor %xmm9,%xmm8,%xmm8
+ vpsrld $31,%xmm0,%xmm5
+ vpand %xmm12,%xmm6,%xmm6
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vmovdqa 64(%rbp),%xmm15
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 240-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,192-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 80-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 0-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,208-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 96-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 16-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vmovdqa %xmm2,224-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 112-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 32-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vmovdqa %xmm3,240-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 128-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 48-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vmovdqa %xmm4,0-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 144-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 64-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,16-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 160-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 80-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,32-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 176-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 96-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vmovdqa %xmm2,48-128(%rax)
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 192-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 112-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vmovdqa %xmm3,64-128(%rax)
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 208-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 128-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vmovdqa %xmm4,80-128(%rax)
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 224-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 144-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vmovdqa %xmm0,96-128(%rax)
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 240-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 160-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vmovdqa %xmm1,112-128(%rax)
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 0-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 176-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 16-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 192-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 32-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa 208-128(%rax),%xmm2
+
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor 48-128(%rax),%xmm0,%xmm0
+ vpsrld $27,%xmm11,%xmm9
+ vpxor %xmm13,%xmm6,%xmm6
+ vpxor %xmm2,%xmm0,%xmm0
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpsrld $31,%xmm0,%xmm5
+ vpaddd %xmm0,%xmm0,%xmm0
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm5,%xmm0,%xmm0
+ vpor %xmm7,%xmm12,%xmm12
+ vpxor %xmm3,%xmm1,%xmm1
+ vmovdqa 224-128(%rax),%xmm3
+
+ vpslld $5,%xmm10,%xmm8
+ vpaddd %xmm15,%xmm14,%xmm14
+ vpxor %xmm11,%xmm13,%xmm6
+ vpaddd %xmm0,%xmm14,%xmm14
+ vpxor 64-128(%rax),%xmm1,%xmm1
+ vpsrld $27,%xmm10,%xmm9
+ vpxor %xmm12,%xmm6,%xmm6
+ vpxor %xmm3,%xmm1,%xmm1
+
+ vpslld $30,%xmm11,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpsrld $31,%xmm1,%xmm5
+ vpaddd %xmm1,%xmm1,%xmm1
+
+ vpsrld $2,%xmm11,%xmm11
+ vpaddd %xmm8,%xmm14,%xmm14
+ vpor %xmm5,%xmm1,%xmm1
+ vpor %xmm7,%xmm11,%xmm11
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa 240-128(%rax),%xmm4
+
+ vpslld $5,%xmm14,%xmm8
+ vpaddd %xmm15,%xmm13,%xmm13
+ vpxor %xmm10,%xmm12,%xmm6
+ vpaddd %xmm1,%xmm13,%xmm13
+ vpxor 80-128(%rax),%xmm2,%xmm2
+ vpsrld $27,%xmm14,%xmm9
+ vpxor %xmm11,%xmm6,%xmm6
+ vpxor %xmm4,%xmm2,%xmm2
+
+ vpslld $30,%xmm10,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm13,%xmm13
+ vpsrld $31,%xmm2,%xmm5
+ vpaddd %xmm2,%xmm2,%xmm2
+
+ vpsrld $2,%xmm10,%xmm10
+ vpaddd %xmm8,%xmm13,%xmm13
+ vpor %xmm5,%xmm2,%xmm2
+ vpor %xmm7,%xmm10,%xmm10
+ vpxor %xmm0,%xmm3,%xmm3
+ vmovdqa 0-128(%rax),%xmm0
+
+ vpslld $5,%xmm13,%xmm8
+ vpaddd %xmm15,%xmm12,%xmm12
+ vpxor %xmm14,%xmm11,%xmm6
+ vpaddd %xmm2,%xmm12,%xmm12
+ vpxor 96-128(%rax),%xmm3,%xmm3
+ vpsrld $27,%xmm13,%xmm9
+ vpxor %xmm10,%xmm6,%xmm6
+ vpxor %xmm0,%xmm3,%xmm3
+
+ vpslld $30,%xmm14,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpsrld $31,%xmm3,%xmm5
+ vpaddd %xmm3,%xmm3,%xmm3
+
+ vpsrld $2,%xmm14,%xmm14
+ vpaddd %xmm8,%xmm12,%xmm12
+ vpor %xmm5,%xmm3,%xmm3
+ vpor %xmm7,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqa 16-128(%rax),%xmm1
+
+ vpslld $5,%xmm12,%xmm8
+ vpaddd %xmm15,%xmm11,%xmm11
+ vpxor %xmm13,%xmm10,%xmm6
+ vpaddd %xmm3,%xmm11,%xmm11
+ vpxor 112-128(%rax),%xmm4,%xmm4
+ vpsrld $27,%xmm12,%xmm9
+ vpxor %xmm14,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vpslld $30,%xmm13,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm11,%xmm11
+ vpsrld $31,%xmm4,%xmm5
+ vpaddd %xmm4,%xmm4,%xmm4
+
+ vpsrld $2,%xmm13,%xmm13
+ vpaddd %xmm8,%xmm11,%xmm11
+ vpor %xmm5,%xmm4,%xmm4
+ vpor %xmm7,%xmm13,%xmm13
+ vpslld $5,%xmm11,%xmm8
+ vpaddd %xmm15,%xmm10,%xmm10
+ vpxor %xmm12,%xmm14,%xmm6
+
+ vpsrld $27,%xmm11,%xmm9
+ vpaddd %xmm4,%xmm10,%xmm10
+ vpxor %xmm13,%xmm6,%xmm6
+
+ vpslld $30,%xmm12,%xmm7
+ vpor %xmm9,%xmm8,%xmm8
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpsrld $2,%xmm12,%xmm12
+ vpaddd %xmm8,%xmm10,%xmm10
+ vpor %xmm7,%xmm12,%xmm12
+ movl $1,%ecx /* ecx = 1, compared against each 32-bit counter at (%rbx) below */
+ cmpl 0(%rbx),%ecx
+ cmovgeq %rbp,%r8 /* if counter[0] <= 1 (signed), r8 = rbp; rbp is set outside this excerpt */
+ cmpl 4(%rbx),%ecx
+ cmovgeq %rbp,%r9 /* same test on counter[1], redirecting r9 */
+ cmpl 8(%rbx),%ecx
+ cmovgeq %rbp,%r10 /* same test on counter[2], redirecting r10 */
+ cmpl 12(%rbx),%ecx
+ cmovgeq %rbp,%r11 /* same test on counter[3], redirecting r11 */
+ vmovdqu (%rbx),%xmm6 /* xmm6 = the four 32-bit counters */
+ vpxor %xmm8,%xmm8,%xmm8 /* xmm8 = 0 */
+ vmovdqa %xmm6,%xmm7
+ vpcmpgtd %xmm8,%xmm7,%xmm7 /* xmm7 = all-ones in lanes whose counter > 0, zero elsewhere */
+ vpaddd %xmm7,%xmm6,%xmm6 /* adding the -1 mask decrements only the positive counters */
+
+ vpand %xmm7,%xmm10,%xmm10 /* the vpand lines zero xmm10..xmm14 in lanes whose counter was not positive */
+ vpand %xmm7,%xmm11,%xmm11
+ vpaddd 0(%rdi),%xmm10,%xmm10 /* the vpaddd lines add the 16-byte values at rdi+0/32/64/96/128 */
+ vpand %xmm7,%xmm12,%xmm12
+ vpaddd 32(%rdi),%xmm11,%xmm11
+ vpand %xmm7,%xmm13,%xmm13
+ vpaddd 64(%rdi),%xmm12,%xmm12
+ vpand %xmm7,%xmm14,%xmm14
+ vpaddd 96(%rdi),%xmm13,%xmm13
+ vpaddd 128(%rdi),%xmm14,%xmm14
+ vmovdqu %xmm10,0(%rdi) /* write the five sums back to the same rdi offsets */
+ vmovdqu %xmm11,32(%rdi)
+ vmovdqu %xmm12,64(%rdi)
+ vmovdqu %xmm13,96(%rdi)
+ vmovdqu %xmm14,128(%rdi)
+
+ vmovdqu %xmm6,(%rbx) /* store the updated counters */
+ vmovdqu 96(%rbp),%xmm5 /* reload xmm5 from rbp+96 (xmm5 was used as a temporary in the rounds above) */
+ decl %edx
+ jnz .Loop_avx /* repeat the inner loop until edx reaches 0 */
+
+ movl 280(%rsp),%edx /* reload the value kept at 280(%rsp); presumably the outer pass count, its store is outside this excerpt */
+ leaq 16(%rdi),%rdi /* advance rdi by 16 bytes */
+ leaq 64(%rsi),%rsi /* advance rsi by 64 bytes */
+ decl %edx
+ jnz .Loop_grande_avx /* start another outer pass while the decremented count is nonzero */
+
+.Ldone_avx:
+ movq 272(%rsp),%rax /* rax = pointer saved at 272(%rsp); the CFI note below defines the CFA as rax+8 */
+.cfi_def_cfa %rax,8
+ vzeroupper /* zero the upper halves of the YMM registers before returning */
+ movq -16(%rax),%rbp /* restore rbp from 16 bytes below the saved pointer */
+.cfi_restore %rbp
+ movq -8(%rax),%rbx /* restore rbx from 8 bytes below the saved pointer */
+.cfi_restore %rbx
+ leaq (%rax),%rsp /* rsp = rax, releasing the frame */
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3 /* machine code for "rep ret" */
+.cfi_endproc
+.size sha1_multi_block_avx,.-sha1_multi_block_avx
+.type sha1_multi_block_avx2,@function
+.align 32
+sha1_multi_block_avx2:
+.cfi_startproc
+_avx2_shortcut: /* second label at the same address as sha1_multi_block_avx2 */
+ movq %rsp,%rax /* keep the entry stack pointer in rax */
+.cfi_def_cfa_register %rax
+ pushq %rbx /* save rbx, rbp and r12-r15, the callee-saved registers of the SysV AMD64 ABI */
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ subq $576,%rsp /* reserve 576 bytes of stack */
+ andq $-256,%rsp /* round rsp down to a 256-byte boundary */
+ movq %rax,544(%rsp) /* stash the entry stack pointer at 544(%rsp) */
+.cfi_escape 0x0f,0x06,0x77,0xa0,0x04,0x06,0x23,0x08 /* DW_CFA_def_cfa_expression: CFA = *(rsp+544) + 8 */
+.Lbody_avx2:
+ leaq K_XX_XX(%rip),%rbp /* rbp = address of K_XX_XX (RIP-relative) */
+ shrl $1,%edx /* edx = edx >> 1 (unsigned halving) */
+
+ vzeroupper
+.Loop_grande_avx2:
+ movl %edx,552(%rsp) /* keep the incoming edx at 552(%rsp) */
+ xorl %edx,%edx /* edx = 0; it becomes the running maximum of the eight counts read below */
+ leaq 512(%rsp),%rbx /* rbx -> eight 32-bit slots starting at 512(%rsp) */
+ movq 0(%rsi),%r12 /* r12 = 64-bit pointer at rsi+0 (dereferenced in .Loop_avx2) */
+ movl 8(%rsi),%ecx /* ecx = 32-bit count at rsi+8 */
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx /* edx = max(edx, ecx), signed */
+ testl %ecx,%ecx
+ movl %ecx,0(%rbx) /* slot 0 = count; mov leaves the testl flags intact */
+ cmovleq %rbp,%r12 /* if count <= 0, use rbp (the K_XX_XX address) in place of the loaded pointer */
+ movq 16(%rsi),%r13 /* same steps for the entry at rsi+16/24 -> r13, slot at rbx+4 */
+ movl 24(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,4(%rbx)
+ cmovleq %rbp,%r13
+ movq 32(%rsi),%r14 /* entry at rsi+32/40 -> r14, slot at rbx+8 */
+ movl 40(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,8(%rbx)
+ cmovleq %rbp,%r14
+ movq 48(%rsi),%r15 /* entry at rsi+48/56 -> r15, slot at rbx+12 */
+ movl 56(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,12(%rbx)
+ cmovleq %rbp,%r15
+ movq 64(%rsi),%r8 /* entry at rsi+64/72 -> r8, slot at rbx+16 */
+ movl 72(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,16(%rbx)
+ cmovleq %rbp,%r8
+ movq 80(%rsi),%r9 /* entry at rsi+80/88 -> r9, slot at rbx+20 */
+ movl 88(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,20(%rbx)
+ cmovleq %rbp,%r9
+ movq 96(%rsi),%r10 /* entry at rsi+96/104 -> r10, slot at rbx+24 */
+ movl 104(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,24(%rbx)
+ cmovleq %rbp,%r10
+ movq 112(%rsi),%r11 /* entry at rsi+112/120 -> r11, slot at rbx+28 */
+ movl 120(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,28(%rbx)
+ cmovleq %rbp,%r11
+ vmovdqu 0(%rdi),%ymm0 /* ymm0..ymm4 = the five 32-byte vectors at rdi+0/32/64/96/128 */
+ leaq 128(%rsp),%rax /* rax = rsp+128, so later N-128(%rax) operands address rsp+N */
+ vmovdqu 32(%rdi),%ymm1
+ leaq 256+128(%rsp),%rbx /* rbx = rsp+384, so later N-256-128(%rbx) operands address rsp+N */
+ vmovdqu 64(%rdi),%ymm2
+ vmovdqu 96(%rdi),%ymm3
+ vmovdqu 128(%rdi),%ymm4
+ vmovdqu 96(%rbp),%ymm9 /* ymm9 = 32 bytes at K_XX_XX+96, used as the vpshufb control at the start of .Loop_avx2 */
+ jmp .Loop_avx2
+
+.align 32
+.Loop_avx2:
+ vmovdqa -32(%rbp),%ymm15
+ vmovd (%r12),%xmm10
+ leaq 64(%r12),%r12
+ vmovd (%r8),%xmm12
+ leaq 64(%r8),%r8
+ vmovd (%r13),%xmm7
+ leaq 64(%r13),%r13
+ vmovd (%r9),%xmm6
+ leaq 64(%r9),%r9
+ vpinsrd $1,(%r14),%xmm10,%xmm10
+ leaq 64(%r14),%r14
+ vpinsrd $1,(%r10),%xmm12,%xmm12
+ leaq 64(%r10),%r10
+ vpinsrd $1,(%r15),%xmm7,%xmm7
+ leaq 64(%r15),%r15
+ vpunpckldq %ymm7,%ymm10,%ymm10
+ vpinsrd $1,(%r11),%xmm6,%xmm6
+ leaq 64(%r11),%r11
+ vpunpckldq %ymm6,%ymm12,%ymm12
+ vmovd -60(%r12),%xmm11
+ vinserti128 $1,%xmm12,%ymm10,%ymm10
+ vmovd -60(%r8),%xmm8
+ vpshufb %ymm9,%ymm10,%ymm10
+ vmovd -60(%r13),%xmm7
+ vmovd -60(%r9),%xmm6
+ vpinsrd $1,-60(%r14),%xmm11,%xmm11
+ vpinsrd $1,-60(%r10),%xmm8,%xmm8
+ vpinsrd $1,-60(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm11,%ymm11
+ vpinsrd $1,-60(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpandn %ymm3,%ymm1,%ymm6
+ vpand %ymm2,%ymm1,%ymm5
+
+ vmovdqa %ymm10,0-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vinserti128 $1,%xmm8,%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -56(%r12),%xmm12
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -56(%r8),%xmm8
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpshufb %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vmovd -56(%r13),%xmm7
+ vmovd -56(%r9),%xmm6
+ vpinsrd $1,-56(%r14),%xmm12,%xmm12
+ vpinsrd $1,-56(%r10),%xmm8,%xmm8
+ vpinsrd $1,-56(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm12,%ymm12
+ vpinsrd $1,-56(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpandn %ymm2,%ymm0,%ymm6
+ vpand %ymm1,%ymm0,%ymm5
+
+ vmovdqa %ymm11,32-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vinserti128 $1,%xmm8,%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -52(%r12),%xmm13
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -52(%r8),%xmm8
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vmovd -52(%r13),%xmm7
+ vmovd -52(%r9),%xmm6
+ vpinsrd $1,-52(%r14),%xmm13,%xmm13
+ vpinsrd $1,-52(%r10),%xmm8,%xmm8
+ vpinsrd $1,-52(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm13,%ymm13
+ vpinsrd $1,-52(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpandn %ymm1,%ymm4,%ymm6
+ vpand %ymm0,%ymm4,%ymm5
+
+ vmovdqa %ymm12,64-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vinserti128 $1,%xmm8,%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -48(%r12),%xmm14
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -48(%r8),%xmm8
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpshufb %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vmovd -48(%r13),%xmm7
+ vmovd -48(%r9),%xmm6
+ vpinsrd $1,-48(%r14),%xmm14,%xmm14
+ vpinsrd $1,-48(%r10),%xmm8,%xmm8
+ vpinsrd $1,-48(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm14,%ymm14
+ vpinsrd $1,-48(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpandn %ymm0,%ymm3,%ymm6
+ vpand %ymm4,%ymm3,%ymm5
+
+ vmovdqa %ymm13,96-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vinserti128 $1,%xmm8,%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -44(%r12),%xmm10
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -44(%r8),%xmm8
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpshufb %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vmovd -44(%r13),%xmm7
+ vmovd -44(%r9),%xmm6
+ vpinsrd $1,-44(%r14),%xmm10,%xmm10
+ vpinsrd $1,-44(%r10),%xmm8,%xmm8
+ vpinsrd $1,-44(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm10,%ymm10
+ vpinsrd $1,-44(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpandn %ymm4,%ymm2,%ymm6
+ vpand %ymm3,%ymm2,%ymm5
+
+ vmovdqa %ymm14,128-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vinserti128 $1,%xmm8,%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -40(%r12),%xmm11
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -40(%r8),%xmm8
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpshufb %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vmovd -40(%r13),%xmm7
+ vmovd -40(%r9),%xmm6
+ vpinsrd $1,-40(%r14),%xmm11,%xmm11
+ vpinsrd $1,-40(%r10),%xmm8,%xmm8
+ vpinsrd $1,-40(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm11,%ymm11
+ vpinsrd $1,-40(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpandn %ymm3,%ymm1,%ymm6
+ vpand %ymm2,%ymm1,%ymm5
+
+ vmovdqa %ymm10,160-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vinserti128 $1,%xmm8,%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -36(%r12),%xmm12
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -36(%r8),%xmm8
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpshufb %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vmovd -36(%r13),%xmm7
+ vmovd -36(%r9),%xmm6
+ vpinsrd $1,-36(%r14),%xmm12,%xmm12
+ vpinsrd $1,-36(%r10),%xmm8,%xmm8
+ vpinsrd $1,-36(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm12,%ymm12
+ vpinsrd $1,-36(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpandn %ymm2,%ymm0,%ymm6
+ vpand %ymm1,%ymm0,%ymm5
+
+ vmovdqa %ymm11,192-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vinserti128 $1,%xmm8,%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -32(%r12),%xmm13
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -32(%r8),%xmm8
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vmovd -32(%r13),%xmm7
+ vmovd -32(%r9),%xmm6
+ vpinsrd $1,-32(%r14),%xmm13,%xmm13
+ vpinsrd $1,-32(%r10),%xmm8,%xmm8
+ vpinsrd $1,-32(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm13,%ymm13
+ vpinsrd $1,-32(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpandn %ymm1,%ymm4,%ymm6
+ vpand %ymm0,%ymm4,%ymm5
+
+ vmovdqa %ymm12,224-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vinserti128 $1,%xmm8,%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -28(%r12),%xmm14
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -28(%r8),%xmm8
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpshufb %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vmovd -28(%r13),%xmm7
+ vmovd -28(%r9),%xmm6
+ vpinsrd $1,-28(%r14),%xmm14,%xmm14
+ vpinsrd $1,-28(%r10),%xmm8,%xmm8
+ vpinsrd $1,-28(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm14,%ymm14
+ vpinsrd $1,-28(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpandn %ymm0,%ymm3,%ymm6
+ vpand %ymm4,%ymm3,%ymm5
+
+ vmovdqa %ymm13,256-256-128(%rbx)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vinserti128 $1,%xmm8,%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -24(%r12),%xmm10
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -24(%r8),%xmm8
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpshufb %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vmovd -24(%r13),%xmm7
+ vmovd -24(%r9),%xmm6
+ vpinsrd $1,-24(%r14),%xmm10,%xmm10
+ vpinsrd $1,-24(%r10),%xmm8,%xmm8
+ vpinsrd $1,-24(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm10,%ymm10
+ vpinsrd $1,-24(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpandn %ymm4,%ymm2,%ymm6
+ vpand %ymm3,%ymm2,%ymm5
+
+ vmovdqa %ymm14,288-256-128(%rbx)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vinserti128 $1,%xmm8,%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -20(%r12),%xmm11
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -20(%r8),%xmm8
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpshufb %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vmovd -20(%r13),%xmm7
+ vmovd -20(%r9),%xmm6
+ vpinsrd $1,-20(%r14),%xmm11,%xmm11
+ vpinsrd $1,-20(%r10),%xmm8,%xmm8
+ vpinsrd $1,-20(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm11,%ymm11
+ vpinsrd $1,-20(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpandn %ymm3,%ymm1,%ymm6
+ vpand %ymm2,%ymm1,%ymm5
+
+ vmovdqa %ymm10,320-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vinserti128 $1,%xmm8,%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -16(%r12),%xmm12
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -16(%r8),%xmm8
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpshufb %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vmovd -16(%r13),%xmm7
+ vmovd -16(%r9),%xmm6
+ vpinsrd $1,-16(%r14),%xmm12,%xmm12
+ vpinsrd $1,-16(%r10),%xmm8,%xmm8
+ vpinsrd $1,-16(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm12,%ymm12
+ vpinsrd $1,-16(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpandn %ymm2,%ymm0,%ymm6
+ vpand %ymm1,%ymm0,%ymm5
+
+ vmovdqa %ymm11,352-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vinserti128 $1,%xmm8,%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -12(%r12),%xmm13
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -12(%r8),%xmm8
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vmovd -12(%r13),%xmm7
+ vmovd -12(%r9),%xmm6
+ vpinsrd $1,-12(%r14),%xmm13,%xmm13
+ vpinsrd $1,-12(%r10),%xmm8,%xmm8
+ vpinsrd $1,-12(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm13,%ymm13
+ vpinsrd $1,-12(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpandn %ymm1,%ymm4,%ymm6
+ vpand %ymm0,%ymm4,%ymm5
+
+ vmovdqa %ymm12,384-256-128(%rbx)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vinserti128 $1,%xmm8,%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -8(%r12),%xmm14
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -8(%r8),%xmm8
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpshufb %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vmovd -8(%r13),%xmm7
+ vmovd -8(%r9),%xmm6
+ vpinsrd $1,-8(%r14),%xmm14,%xmm14
+ vpinsrd $1,-8(%r10),%xmm8,%xmm8
+ vpinsrd $1,-8(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm14,%ymm14
+ vpinsrd $1,-8(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpandn %ymm0,%ymm3,%ymm6
+ vpand %ymm4,%ymm3,%ymm5
+
+ vmovdqa %ymm13,416-256-128(%rbx)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vinserti128 $1,%xmm8,%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovd -4(%r12),%xmm10
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vmovd -4(%r8),%xmm8
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpshufb %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vmovdqa 0-128(%rax),%ymm11
+ vmovd -4(%r13),%xmm7
+ vmovd -4(%r9),%xmm6
+ vpinsrd $1,-4(%r14),%xmm10,%xmm10
+ vpinsrd $1,-4(%r10),%xmm8,%xmm8
+ vpinsrd $1,-4(%r15),%xmm7,%xmm7
+ vpunpckldq %ymm7,%ymm10,%ymm10
+ vpinsrd $1,-4(%r11),%xmm6,%xmm6
+ vpunpckldq %ymm6,%ymm8,%ymm8
+ vpaddd %ymm15,%ymm0,%ymm0
+ prefetcht0 63(%r12)
+ vpslld $5,%ymm1,%ymm7
+ vpandn %ymm4,%ymm2,%ymm6
+ vpand %ymm3,%ymm2,%ymm5
+
+ vmovdqa %ymm14,448-256-128(%rbx)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vinserti128 $1,%xmm8,%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ prefetcht0 63(%r13)
+ vpxor %ymm6,%ymm5,%ymm5
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ prefetcht0 63(%r14)
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ prefetcht0 63(%r15)
+ vpshufb %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vmovdqa 32-128(%rax),%ymm12
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 64-128(%rax),%ymm13
+
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpandn %ymm3,%ymm1,%ymm6
+ prefetcht0 63(%r8)
+ vpand %ymm2,%ymm1,%ymm5
+
+ vmovdqa %ymm10,480-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 256-256-128(%rbx),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+ prefetcht0 63(%r9)
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ prefetcht0 63(%r10)
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ prefetcht0 63(%r11)
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 96-128(%rax),%ymm14
+
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpandn %ymm2,%ymm0,%ymm6
+
+ vpand %ymm1,%ymm0,%ymm5
+
+ vmovdqa %ymm11,0-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 288-256-128(%rbx),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 128-128(%rax),%ymm10
+
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpandn %ymm1,%ymm4,%ymm6
+
+ vpand %ymm0,%ymm4,%ymm5
+
+ vmovdqa %ymm12,32-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 320-256-128(%rbx),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 160-128(%rax),%ymm11
+
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpandn %ymm0,%ymm3,%ymm6
+
+ vpand %ymm4,%ymm3,%ymm5
+
+ vmovdqa %ymm13,64-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 352-256-128(%rbx),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 192-128(%rax),%ymm12
+
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpandn %ymm4,%ymm2,%ymm6
+
+ vpand %ymm3,%ymm2,%ymm5
+
+ vmovdqa %ymm14,96-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 384-256-128(%rbx),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm6,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vmovdqa 0(%rbp),%ymm15
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 224-128(%rax),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,128-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 416-256-128(%rbx),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 256-256-128(%rbx),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,160-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 448-256-128(%rbx),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 288-256-128(%rbx),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vmovdqa %ymm12,192-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 480-256-128(%rbx),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 320-256-128(%rbx),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vmovdqa %ymm13,224-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 0-128(%rax),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 352-256-128(%rbx),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vmovdqa %ymm14,256-256-128(%rbx)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 32-128(%rax),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 384-256-128(%rbx),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,288-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 64-128(%rax),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 416-256-128(%rbx),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,320-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 96-128(%rax),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 448-256-128(%rbx),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vmovdqa %ymm12,352-256-128(%rbx)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 128-128(%rax),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 480-256-128(%rbx),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vmovdqa %ymm13,384-256-128(%rbx)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 160-128(%rax),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 0-128(%rax),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vmovdqa %ymm14,416-256-128(%rbx)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 192-128(%rax),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 32-128(%rax),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,448-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 224-128(%rax),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 64-128(%rax),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,480-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 256-256-128(%rbx),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 96-128(%rax),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vmovdqa %ymm12,0-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 288-256-128(%rbx),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 128-128(%rax),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vmovdqa %ymm13,32-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 320-256-128(%rbx),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 160-128(%rax),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vmovdqa %ymm14,64-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 352-256-128(%rbx),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 192-128(%rax),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,96-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 384-256-128(%rbx),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 224-128(%rax),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,128-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 416-256-128(%rbx),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 256-256-128(%rbx),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vmovdqa %ymm12,160-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 448-256-128(%rbx),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 288-256-128(%rbx),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vmovdqa %ymm13,192-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 480-256-128(%rbx),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 320-256-128(%rbx),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vmovdqa %ymm14,224-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 0-128(%rax),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vmovdqa 32(%rbp),%ymm15
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 352-256-128(%rbx),%ymm13
+
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpand %ymm2,%ymm3,%ymm6
+ vpxor 32-128(%rax),%ymm11,%ymm11
+
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm3,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vmovdqu %ymm10,256-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm11,%ymm9
+ vpand %ymm1,%ymm5,%ymm5
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 384-256-128(%rbx),%ymm14
+
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpand %ymm1,%ymm2,%ymm6
+ vpxor 64-128(%rax),%ymm12,%ymm12
+
+ vpaddd %ymm6,%ymm3,%ymm3
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm2,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vmovdqu %ymm11,288-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm12,%ymm9
+ vpand %ymm0,%ymm5,%ymm5
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 416-256-128(%rbx),%ymm10
+
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpand %ymm0,%ymm1,%ymm6
+ vpxor 96-128(%rax),%ymm13,%ymm13
+
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm1,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vmovdqu %ymm12,320-256-128(%rbx)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm13,%ymm9
+ vpand %ymm4,%ymm5,%ymm5
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 448-256-128(%rbx),%ymm11
+
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpand %ymm4,%ymm0,%ymm6
+ vpxor 128-128(%rax),%ymm14,%ymm14
+
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm0,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vmovdqu %ymm13,352-256-128(%rbx)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm14,%ymm9
+ vpand %ymm3,%ymm5,%ymm5
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 480-256-128(%rbx),%ymm12
+
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpand %ymm3,%ymm4,%ymm6
+ vpxor 160-128(%rax),%ymm10,%ymm10
+
+ vpaddd %ymm6,%ymm0,%ymm0
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm4,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vmovdqu %ymm14,384-256-128(%rbx)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm10,%ymm9
+ vpand %ymm2,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 0-128(%rax),%ymm13
+
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpand %ymm2,%ymm3,%ymm6
+ vpxor 192-128(%rax),%ymm11,%ymm11
+
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm3,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vmovdqu %ymm10,416-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm11,%ymm9
+ vpand %ymm1,%ymm5,%ymm5
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 32-128(%rax),%ymm14
+
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpand %ymm1,%ymm2,%ymm6
+ vpxor 224-128(%rax),%ymm12,%ymm12
+
+ vpaddd %ymm6,%ymm3,%ymm3
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm2,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vmovdqu %ymm11,448-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm12,%ymm9
+ vpand %ymm0,%ymm5,%ymm5
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 64-128(%rax),%ymm10
+
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpand %ymm0,%ymm1,%ymm6
+ vpxor 256-256-128(%rbx),%ymm13,%ymm13
+
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm1,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vmovdqu %ymm12,480-256-128(%rbx)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm13,%ymm9
+ vpand %ymm4,%ymm5,%ymm5
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 96-128(%rax),%ymm11
+
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpand %ymm4,%ymm0,%ymm6
+ vpxor 288-256-128(%rbx),%ymm14,%ymm14
+
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm0,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vmovdqu %ymm13,0-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm14,%ymm9
+ vpand %ymm3,%ymm5,%ymm5
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 128-128(%rax),%ymm12
+
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpand %ymm3,%ymm4,%ymm6
+ vpxor 320-256-128(%rbx),%ymm10,%ymm10
+
+ vpaddd %ymm6,%ymm0,%ymm0
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm4,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vmovdqu %ymm14,32-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm10,%ymm9
+ vpand %ymm2,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 160-128(%rax),%ymm13
+
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpand %ymm2,%ymm3,%ymm6
+ vpxor 352-256-128(%rbx),%ymm11,%ymm11
+
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm3,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vmovdqu %ymm10,64-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm11,%ymm9
+ vpand %ymm1,%ymm5,%ymm5
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 192-128(%rax),%ymm14
+
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpand %ymm1,%ymm2,%ymm6
+ vpxor 384-256-128(%rbx),%ymm12,%ymm12
+
+ vpaddd %ymm6,%ymm3,%ymm3
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm2,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vmovdqu %ymm11,96-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm12,%ymm9
+ vpand %ymm0,%ymm5,%ymm5
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 224-128(%rax),%ymm10
+
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpand %ymm0,%ymm1,%ymm6
+ vpxor 416-256-128(%rbx),%ymm13,%ymm13
+
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm1,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vmovdqu %ymm12,128-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm13,%ymm9
+ vpand %ymm4,%ymm5,%ymm5
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 256-256-128(%rbx),%ymm11
+
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpand %ymm4,%ymm0,%ymm6
+ vpxor 448-256-128(%rbx),%ymm14,%ymm14
+
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm0,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vmovdqu %ymm13,160-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm14,%ymm9
+ vpand %ymm3,%ymm5,%ymm5
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 288-256-128(%rbx),%ymm12
+
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpand %ymm3,%ymm4,%ymm6
+ vpxor 480-256-128(%rbx),%ymm10,%ymm10
+
+ vpaddd %ymm6,%ymm0,%ymm0
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm4,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vmovdqu %ymm14,192-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm10,%ymm9
+ vpand %ymm2,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 320-256-128(%rbx),%ymm13
+
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpslld $5,%ymm0,%ymm7
+ vpand %ymm2,%ymm3,%ymm6
+ vpxor 0-128(%rax),%ymm11,%ymm11
+
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm3,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vmovdqu %ymm10,224-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm11,%ymm9
+ vpand %ymm1,%ymm5,%ymm5
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpaddd %ymm5,%ymm4,%ymm4
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 352-256-128(%rbx),%ymm14
+
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpslld $5,%ymm4,%ymm7
+ vpand %ymm1,%ymm2,%ymm6
+ vpxor 32-128(%rax),%ymm12,%ymm12
+
+ vpaddd %ymm6,%ymm3,%ymm3
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm2,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vmovdqu %ymm11,256-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm12,%ymm9
+ vpand %ymm0,%ymm5,%ymm5
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpaddd %ymm5,%ymm3,%ymm3
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 384-256-128(%rbx),%ymm10
+
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpslld $5,%ymm3,%ymm7
+ vpand %ymm0,%ymm1,%ymm6
+ vpxor 64-128(%rax),%ymm13,%ymm13
+
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm1,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vmovdqu %ymm12,288-256-128(%rbx)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm13,%ymm9
+ vpand %ymm4,%ymm5,%ymm5
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpaddd %ymm5,%ymm2,%ymm2
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 416-256-128(%rbx),%ymm11
+
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpslld $5,%ymm2,%ymm7
+ vpand %ymm4,%ymm0,%ymm6
+ vpxor 96-128(%rax),%ymm14,%ymm14
+
+ vpaddd %ymm6,%ymm1,%ymm1
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm0,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vmovdqu %ymm13,320-256-128(%rbx)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm14,%ymm9
+ vpand %ymm3,%ymm5,%ymm5
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpaddd %ymm5,%ymm1,%ymm1
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 448-256-128(%rbx),%ymm12
+
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpslld $5,%ymm1,%ymm7
+ vpand %ymm3,%ymm4,%ymm6
+ vpxor 128-128(%rax),%ymm10,%ymm10
+
+ vpaddd %ymm6,%ymm0,%ymm0
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm4,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vmovdqu %ymm14,352-256-128(%rbx)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm10,%ymm9
+ vpand %ymm2,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vmovdqa 64(%rbp),%ymm15
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 480-256-128(%rbx),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,384-256-128(%rbx)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 160-128(%rax),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 0-128(%rax),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,416-256-128(%rbx)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 192-128(%rax),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 32-128(%rax),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vmovdqa %ymm12,448-256-128(%rbx)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 224-128(%rax),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 64-128(%rax),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vmovdqa %ymm13,480-256-128(%rbx)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 256-256-128(%rbx),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 96-128(%rax),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vmovdqa %ymm14,0-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 288-256-128(%rbx),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 128-128(%rax),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,32-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 320-256-128(%rbx),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 160-128(%rax),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,64-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 352-256-128(%rbx),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 192-128(%rax),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vmovdqa %ymm12,96-128(%rax)
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 384-256-128(%rbx),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 224-128(%rax),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vmovdqa %ymm13,128-128(%rax)
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 416-256-128(%rbx),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 256-256-128(%rbx),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vmovdqa %ymm14,160-128(%rax)
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 448-256-128(%rbx),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 288-256-128(%rbx),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vmovdqa %ymm10,192-128(%rax)
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 480-256-128(%rbx),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 320-256-128(%rbx),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vmovdqa %ymm11,224-128(%rax)
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 0-128(%rax),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 352-256-128(%rbx),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 32-128(%rax),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 384-256-128(%rbx),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 64-128(%rax),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpxor %ymm12,%ymm10,%ymm10
+ vmovdqa 416-256-128(%rbx),%ymm12
+
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor 96-128(%rax),%ymm10,%ymm10
+ vpsrld $27,%ymm1,%ymm8
+ vpxor %ymm3,%ymm5,%ymm5
+ vpxor %ymm12,%ymm10,%ymm10
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+ vpsrld $31,%ymm10,%ymm9
+ vpaddd %ymm10,%ymm10,%ymm10
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm9,%ymm10,%ymm10
+ vpor %ymm6,%ymm2,%ymm2
+ vpxor %ymm13,%ymm11,%ymm11
+ vmovdqa 448-256-128(%rbx),%ymm13
+
+ vpslld $5,%ymm0,%ymm7
+ vpaddd %ymm15,%ymm4,%ymm4
+ vpxor %ymm1,%ymm3,%ymm5
+ vpaddd %ymm10,%ymm4,%ymm4
+ vpxor 128-128(%rax),%ymm11,%ymm11
+ vpsrld $27,%ymm0,%ymm8
+ vpxor %ymm2,%ymm5,%ymm5
+ vpxor %ymm13,%ymm11,%ymm11
+
+ vpslld $30,%ymm1,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpsrld $31,%ymm11,%ymm9
+ vpaddd %ymm11,%ymm11,%ymm11
+
+ vpsrld $2,%ymm1,%ymm1
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpor %ymm9,%ymm11,%ymm11
+ vpor %ymm6,%ymm1,%ymm1
+ vpxor %ymm14,%ymm12,%ymm12
+ vmovdqa 480-256-128(%rbx),%ymm14
+
+ vpslld $5,%ymm4,%ymm7
+ vpaddd %ymm15,%ymm3,%ymm3
+ vpxor %ymm0,%ymm2,%ymm5
+ vpaddd %ymm11,%ymm3,%ymm3
+ vpxor 160-128(%rax),%ymm12,%ymm12
+ vpsrld $27,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm14,%ymm12,%ymm12
+
+ vpslld $30,%ymm0,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm3,%ymm3
+ vpsrld $31,%ymm12,%ymm9
+ vpaddd %ymm12,%ymm12,%ymm12
+
+ vpsrld $2,%ymm0,%ymm0
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpor %ymm9,%ymm12,%ymm12
+ vpor %ymm6,%ymm0,%ymm0
+ vpxor %ymm10,%ymm13,%ymm13
+ vmovdqa 0-128(%rax),%ymm10
+
+ vpslld $5,%ymm3,%ymm7
+ vpaddd %ymm15,%ymm2,%ymm2
+ vpxor %ymm4,%ymm1,%ymm5
+ vpaddd %ymm12,%ymm2,%ymm2
+ vpxor 192-128(%rax),%ymm13,%ymm13
+ vpsrld $27,%ymm3,%ymm8
+ vpxor %ymm0,%ymm5,%ymm5
+ vpxor %ymm10,%ymm13,%ymm13
+
+ vpslld $30,%ymm4,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm2,%ymm2
+ vpsrld $31,%ymm13,%ymm9
+ vpaddd %ymm13,%ymm13,%ymm13
+
+ vpsrld $2,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm2,%ymm2
+ vpor %ymm9,%ymm13,%ymm13
+ vpor %ymm6,%ymm4,%ymm4
+ vpxor %ymm11,%ymm14,%ymm14
+ vmovdqa 32-128(%rax),%ymm11
+
+ vpslld $5,%ymm2,%ymm7
+ vpaddd %ymm15,%ymm1,%ymm1
+ vpxor %ymm3,%ymm0,%ymm5
+ vpaddd %ymm13,%ymm1,%ymm1
+ vpxor 224-128(%rax),%ymm14,%ymm14
+ vpsrld $27,%ymm2,%ymm8
+ vpxor %ymm4,%ymm5,%ymm5
+ vpxor %ymm11,%ymm14,%ymm14
+
+ vpslld $30,%ymm3,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpsrld $31,%ymm14,%ymm9
+ vpaddd %ymm14,%ymm14,%ymm14
+
+ vpsrld $2,%ymm3,%ymm3
+ vpaddd %ymm7,%ymm1,%ymm1
+ vpor %ymm9,%ymm14,%ymm14
+ vpor %ymm6,%ymm3,%ymm3
+ vpslld $5,%ymm1,%ymm7
+ vpaddd %ymm15,%ymm0,%ymm0
+ vpxor %ymm2,%ymm4,%ymm5
+
+ vpsrld $27,%ymm1,%ymm8
+ vpaddd %ymm14,%ymm0,%ymm0
+ vpxor %ymm3,%ymm5,%ymm5
+
+ vpslld $30,%ymm2,%ymm6
+ vpor %ymm8,%ymm7,%ymm7
+ vpaddd %ymm5,%ymm0,%ymm0
+
+ vpsrld $2,%ymm2,%ymm2
+ vpaddd %ymm7,%ymm0,%ymm0
+ vpor %ymm6,%ymm2,%ymm2
+ movl $1,%ecx
+ leaq 512(%rsp),%rbx
+ cmpl 0(%rbx),%ecx
+ cmovgeq %rbp,%r12
+ cmpl 4(%rbx),%ecx
+ cmovgeq %rbp,%r13
+ cmpl 8(%rbx),%ecx
+ cmovgeq %rbp,%r14
+ cmpl 12(%rbx),%ecx
+ cmovgeq %rbp,%r15
+ cmpl 16(%rbx),%ecx
+ cmovgeq %rbp,%r8
+ cmpl 20(%rbx),%ecx
+ cmovgeq %rbp,%r9
+ cmpl 24(%rbx),%ecx
+ cmovgeq %rbp,%r10
+ cmpl 28(%rbx),%ecx
+ cmovgeq %rbp,%r11
+ vmovdqu (%rbx),%ymm5
+ vpxor %ymm7,%ymm7,%ymm7
+ vmovdqa %ymm5,%ymm6
+ vpcmpgtd %ymm7,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm5,%ymm5
+
+ vpand %ymm6,%ymm0,%ymm0
+ vpand %ymm6,%ymm1,%ymm1
+ vpaddd 0(%rdi),%ymm0,%ymm0
+ vpand %ymm6,%ymm2,%ymm2
+ vpaddd 32(%rdi),%ymm1,%ymm1
+ vpand %ymm6,%ymm3,%ymm3
+ vpaddd 64(%rdi),%ymm2,%ymm2
+ vpand %ymm6,%ymm4,%ymm4
+ vpaddd 96(%rdi),%ymm3,%ymm3
+ vpaddd 128(%rdi),%ymm4,%ymm4
+ vmovdqu %ymm0,0(%rdi)
+ vmovdqu %ymm1,32(%rdi)
+ vmovdqu %ymm2,64(%rdi)
+ vmovdqu %ymm3,96(%rdi)
+ vmovdqu %ymm4,128(%rdi)
+
+ vmovdqu %ymm5,(%rbx)
+ leaq 256+128(%rsp),%rbx
+ vmovdqu 96(%rbp),%ymm9
+ decl %edx
+ jnz .Loop_avx2
+
+
+
+
+
+
+
+.Ldone_avx2:
+ movq 544(%rsp),%rax
+.cfi_def_cfa %rax,8
+ vzeroupper
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha1_multi_block_avx2,.-sha1_multi_block_avx2
.align 256
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
diff --git a/secure/lib/libcrypto/amd64/sha1-x86_64.S b/secure/lib/libcrypto/amd64/sha1-x86_64.S
index cf36e17d3121..342db5203d16 100644
--- a/secure/lib/libcrypto/amd64/sha1-x86_64.S
+++ b/secure/lib/libcrypto/amd64/sha1-x86_64.S
@@ -15,6 +15,14 @@ sha1_block_data_order:
jz .Lialu
testl $536870912,%r10d
jnz _shaext_shortcut
+ andl $296,%r10d
+ cmpl $296,%r10d
+ je _avx2_shortcut
+ andl $268435456,%r8d
+ andl $1073741824,%r9d
+ orl %r9d,%r8d
+ cmpl $1342177280,%r8d
+ je _avx_shortcut
jmp _ssse3_shortcut
.align 16
@@ -2606,6 +2614,2827 @@ _ssse3_shortcut:
.byte 0xf3,0xc3
.cfi_endproc
.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
+.type sha1_block_data_order_avx,@function  # SHA-1 block routine using VEX-encoded 128-bit vector ops for the message schedule
+.align 16
+sha1_block_data_order_avx:  # in: rdi = state (five 32-bit words), rsi = input, rdx = number of 64-byte blocks
+_avx_shortcut:  # entry label targeted by the dispatch code in sha1_block_data_order; NOTE(review): a block count of 0 is not handled (equality end test below) - confirm callers never pass 0
+.cfi_startproc
+ movq %rsp,%r11  # r11 = entry rsp; serves as CFA and restores rsp at exit
+.cfi_def_cfa_register %r11
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ leaq -64(%rsp),%rsp  # reserve 64 bytes of scratch: four 16-byte rows read by the rounds
+ vzeroupper  # zero the upper halves of all YMM registers
+ andq $-64,%rsp  # align scratch to 64 bytes so the vmovdqa stores below are aligned
+ movq %rdi,%r8  # r8 = state pointer
+ movq %rsi,%r9  # r9 = input cursor
+ movq %rdx,%r10  # r10 = block count
+
+ shlq $6,%r10  # r10 = count * 64 bytes
+ addq %r9,%r10  # r10 = end-of-input pointer
+ leaq K_XX_XX+64(%rip),%r14  # r14 = K_XX_XX+64; table rows are addressed at -64..+64 from here
+
+ movl 0(%r8),%eax  # eax = state word 0 (a)
+ movl 4(%r8),%ebx  # ebx = state word 1 (b)
+ movl 8(%r8),%ecx  # ecx = state word 2 (c)
+ movl 12(%r8),%edx  # edx = state word 3 (d)
+ movl %ebx,%esi
+ movl 16(%r8),%ebp  # ebp = state word 4 (e)
+ movl %ecx,%edi
+ xorl %edx,%edi  # edi = c ^ d
+ andl %edi,%esi  # esi = b & (c ^ d); the first round xors d into it
+
+ vmovdqa 64(%r14),%xmm6  # xmm6 = vpshufb control from K_XX_XX+128 (presumably a byte-swap mask; table not shown here)
+ vmovdqa -64(%r14),%xmm11  # xmm11 = first 16-byte constant row (K_XX_XX+0)
+ vmovdqu 0(%r9),%xmm0  # load the 64-byte input block, unaligned
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0  # shuffle input bytes per the xmm6 control
+ addq $64,%r9  # advance input cursor past this block
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm11,%xmm0,%xmm4  # pre-add the constant row to the first three input rows
+ vpaddd %xmm11,%xmm1,%xmm5
+ vpaddd %xmm11,%xmm2,%xmm6
+ vmovdqa %xmm4,0(%rsp)  # scratch rows 0..2 = input words + constant
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ jmp .Loop_avx
+.align 16
+.Loop_avx:  # one pass per 64-byte block: scalar rounds interleaved with the vector message schedule
+ shrdl $2,%ebx,%ebx  # rotate b right by 2
+ xorl %edx,%esi  # esi = (b & (c ^ d)) ^ d
+ vpalignr $8,%xmm0,%xmm1,%xmm4  # xmm4 = upper half of xmm0 : lower half of xmm1
+ movl %eax,%edi
+ addl 0(%rsp),%ebp  # e += scratch word (message word + constant)
+ vpaddd %xmm3,%xmm11,%xmm9  # xmm9 = fourth input row + constant; stored to scratch row 3 below
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax  # rotate a left by 5, in place
+ vpsrldq $4,%xmm3,%xmm8
+ addl %esi,%ebp
+ andl %ebx,%edi
+ vpxor %xmm0,%xmm4,%xmm4
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm2,%xmm8,%xmm8
+ shrdl $7,%eax,%eax  # rotate right by 7: undoes the rol 5 and leaves the original value rotated right by 2
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 4(%rsp),%edx
+ vpxor %xmm8,%xmm4,%xmm4
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vmovdqa %xmm9,48(%rsp)  # scratch row 3 filled
+ addl %edi,%edx
+ andl %eax,%esi
+ vpsrld $31,%xmm4,%xmm8  # xmm8 = top bit of each dword, moved to bit 0
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%esi
+ vpslldq $12,%xmm4,%xmm10
+ vpaddd %xmm4,%xmm4,%xmm4  # xmm4 doubled, i.e. each dword shifted left by 1
+ movl %edx,%edi
+ addl 8(%rsp),%ecx
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm4,%xmm4  # combined with the shift above: each dword rotated left by 1
+ addl %esi,%ecx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm4,%xmm4
+ shrdl $7,%edx,%edx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 12(%rsp),%ebx
+ vpxor %xmm10,%xmm4,%xmm4  # xmm4 = next four schedule words
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ andl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%esi
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ movl %ebx,%edi
+ addl 16(%rsp),%eax
+ vpaddd %xmm4,%xmm11,%xmm9  # xmm9 = new schedule row + constant
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrldq $4,%xmm4,%xmm8
+ addl %esi,%eax
+ andl %ecx,%edi
+ vpxor %xmm1,%xmm5,%xmm5
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm3,%xmm8,%xmm8
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 20(%rsp),%ebp
+ vpxor %xmm8,%xmm5,%xmm5
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vmovdqa %xmm9,0(%rsp)  # scratch row 0 (already consumed) refilled for later rounds
+ addl %edi,%ebp
+ andl %ebx,%esi
+ vpsrld $31,%xmm5,%xmm8
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ vpslldq $12,%xmm5,%xmm10
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %ebp,%edi
+ addl 24(%rsp),%edx
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm5,%xmm5
+ addl %esi,%edx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm5,%xmm5
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%edi
+ movl %edx,%esi
+ addl 28(%rsp),%ecx
+ vpxor %xmm10,%xmm5,%xmm5
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vmovdqa -32(%r14),%xmm11  # xmm11 = second constant row (K_XX_XX+32)
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ movl %ecx,%edi
+ addl 32(%rsp),%ebx
+ vpaddd %xmm5,%xmm11,%xmm9
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ vpsrldq $4,%xmm5,%xmm8
+ addl %esi,%ebx
+ andl %edx,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ vpxor %xmm4,%xmm8,%xmm8
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ addl 36(%rsp),%eax
+ vpxor %xmm8,%xmm6,%xmm6
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vmovdqa %xmm9,16(%rsp)
+ addl %edi,%eax
+ andl %ecx,%esi
+ vpsrld $31,%xmm6,%xmm8
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%esi
+ vpslldq $12,%xmm6,%xmm10
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%edi
+ addl 40(%rsp),%ebp
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm6,%xmm6
+ addl %esi,%ebp
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm6,%xmm6
+ shrdl $7,%eax,%eax
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 44(%rsp),%edx
+ vpxor %xmm10,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%esi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ movl %edx,%edi
+ addl 48(%rsp),%ecx
+ vpaddd %xmm6,%xmm11,%xmm9
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpsrldq $4,%xmm6,%xmm8
+ addl %esi,%ecx
+ andl %ebp,%edi
+ vpxor %xmm3,%xmm7,%xmm7
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpxor %xmm5,%xmm8,%xmm8
+ shrdl $7,%edx,%edx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 52(%rsp),%ebx
+ vpxor %xmm8,%xmm7,%xmm7
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%ebx
+ andl %edx,%esi
+ vpsrld $31,%xmm7,%xmm8
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%esi
+ vpslldq $12,%xmm7,%xmm10
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%edi
+ addl 56(%rsp),%eax
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm7,%xmm7
+ addl %esi,%eax
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm7,%xmm7
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 60(%rsp),%ebp
+ vpxor %xmm10,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm6,%xmm7,%xmm8  # from here the schedule uses a shorter xor chain followed by a rotate left by 2
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ addl 0(%rsp),%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm7,%xmm11,%xmm9
+ addl %esi,%edx
+ andl %eax,%edi
+ vpxor %xmm8,%xmm0,%xmm0
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%edi
+ vpsrld $30,%xmm0,%xmm8  # xmm8 = top two bits of each dword
+ vmovdqa %xmm9,48(%rsp)
+ movl %edx,%esi
+ addl 4(%rsp),%ecx
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpslld $2,%xmm0,%xmm0
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ addl 8(%rsp),%ebx
+ vpor %xmm8,%xmm0,%xmm0  # each dword of xmm0 rotated left by 2
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ vpaddd %xmm0,%xmm11,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm8,%xmm1,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm1,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpor %xmm8,%xmm1,%xmm1
+ addl 28(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ addl %esi,%eax
+ xorl %edx,%edi
+ vpaddd %xmm1,%xmm11,%xmm9
+ vmovdqa 0(%r14),%xmm11  # xmm11 = third constant row (K_XX_XX+64)
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm8,%xmm2,%xmm2
+ addl 36(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpor %xmm8,%xmm2,%xmm2
+ addl 44(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ vpaddd %xmm2,%xmm11,%xmm9
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ vpxor %xmm0,%xmm4,%xmm4
+ addl 0(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %esi,%ecx
+ xorl %eax,%edi
+ vpaddd %xmm3,%xmm11,%xmm9
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpxor %xmm8,%xmm4,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpor %xmm8,%xmm4,%xmm4
+ addl 12(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpxor %xmm6,%xmm5,%xmm5
+ addl %esi,%edx
+ xorl %ebx,%edi
+ vpaddd %xmm4,%xmm11,%xmm9
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpxor %xmm8,%xmm5,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpor %xmm8,%xmm5,%xmm5
+ addl 28(%rsp),%eax
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ vpxor %xmm2,%xmm6,%xmm6
+ addl 32(%rsp),%ebp
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %eax,%edi
+ xorl %ecx,%esi
+ vpaddd %xmm5,%xmm11,%xmm9
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ vpxor %xmm8,%xmm6,%xmm6
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 36(%rsp),%edx
+ vpsrld $30,%xmm6,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%esi
+ vpslld $2,%xmm6,%xmm6
+ xorl %ebx,%edi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 40(%rsp),%ecx
+ andl %eax,%esi
+ vpor %xmm8,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%edi
+ xorl %eax,%esi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 44(%rsp),%ebx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ xorl %ebp,%edi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ vpxor %xmm3,%xmm7,%xmm7
+ addl 48(%rsp),%eax
+ andl %edx,%esi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ vpxor %xmm0,%xmm7,%xmm7
+ movl %ebx,%edi
+ xorl %edx,%esi
+ vpaddd %xmm6,%xmm11,%xmm9
+ vmovdqa 32(%r14),%xmm11  # xmm11 = fourth constant row (K_XX_XX+96)
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vpxor %xmm8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%rsp),%ebp
+ vpsrld $30,%xmm7,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ vpslld $2,%xmm7,%xmm7
+ xorl %ecx,%edi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 56(%rsp),%edx
+ andl %ebx,%esi
+ vpor %xmm8,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 60(%rsp),%ecx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%esi
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ addl 0(%rsp),%ebx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ vpaddd %xmm7,%xmm11,%xmm9
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ vpxor %xmm8,%xmm0,%xmm0
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 4(%rsp),%eax
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ andl %edx,%edi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ vpslld $2,%xmm0,%xmm0
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%rsp),%ebp
+ andl %ecx,%esi
+ vpor %xmm8,%xmm0,%xmm0
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%edi
+ xorl %ecx,%esi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 12(%rsp),%edx
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%esi
+ xorl %ebx,%edi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ vpxor %xmm2,%xmm1,%xmm1
+ movl %edx,%edi
+ xorl %eax,%esi
+ vpaddd %xmm0,%xmm11,%xmm9
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vpxor %xmm8,%xmm1,%xmm1
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 20(%rsp),%ebx
+ vpsrld $30,%xmm1,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ vpslld $2,%xmm1,%xmm1
+ xorl %ebp,%edi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 24(%rsp),%eax
+ andl %edx,%esi
+ vpor %xmm8,%xmm1,%xmm1
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%edi
+ xorl %edx,%esi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%rsp),%ebp
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ xorl %ecx,%edi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vpxor %xmm3,%xmm2,%xmm2
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ vpaddd %xmm1,%xmm11,%xmm9
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ vpxor %xmm8,%xmm2,%xmm2
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 36(%rsp),%ecx
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ andl %eax,%edi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%esi
+ vpslld $2,%xmm2,%xmm2
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 40(%rsp),%ebx
+ andl %ebp,%esi
+ vpor %xmm8,%xmm2,%xmm2
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 44(%rsp),%eax
+ andl %edx,%edi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ vpaddd %xmm2,%xmm11,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 0(%rsp),%eax
+ vpaddd %xmm3,%xmm11,%xmm9  # last schedule row of this block + constant
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vmovdqa %xmm9,48(%rsp)
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 4(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 8(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 12(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ cmpq %r10,%r9  # input cursor reached the end pointer?
+ je .Ldone_avx  # yes: finish the remaining rounds without loading more input
+ vmovdqa 64(%r14),%xmm6  # reload the vpshufb control (xmm6 was reused by the schedule)
+ vmovdqa -64(%r14),%xmm11  # back to the first constant row for the next block
+ vmovdqu 0(%r9),%xmm0  # load the next 64-byte block while the current rounds finish
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r9  # advance input cursor
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ vpshufb %xmm6,%xmm1,%xmm1
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm11,%xmm0,%xmm4
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vmovdqa %xmm4,0(%rsp)  # scratch row 0 is already consumed for this block; preload it for the next
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ vpshufb %xmm6,%xmm2,%xmm2
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpaddd %xmm11,%xmm1,%xmm5
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vmovdqa %xmm5,16(%rsp)  # row 1 preloaded after its last read above
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ vpshufb %xmm6,%xmm3,%xmm3
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm11,%xmm2,%xmm6
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vmovdqa %xmm6,32(%rsp)  # row 2 preloaded after its last read above
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi  # esi keeps ebx as it was before the in-place rol 5 below
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 0(%r8),%eax  # add the previous state into the working registers
+ addl 4(%r8),%esi  # second word comes from esi (ebx currently holds the rol-5 copy)
+ addl 8(%r8),%ecx
+ addl 12(%r8),%edx
+ movl %eax,0(%r8)  # write the updated state back
+ addl 16(%r8),%ebp
+ movl %esi,4(%r8)
+ movl %esi,%ebx  # ebx = second state word again for the next block
+ movl %ecx,8(%r8)
+ movl %ecx,%edi
+ movl %edx,12(%r8)
+ xorl %edx,%edi  # edi = c ^ d
+ movl %ebp,16(%r8)
+ andl %edi,%esi  # esi = b & (c ^ d), same setup as before the first pass
+ jmp .Loop_avx
+
+.align 16
+.Ldone_avx:  # last block: remaining rounds only, no vector work and no further input loads
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi  # esi keeps ebx as it was before the in-place rol 5 below
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vzeroupper  # zero upper YMM halves before returning to the caller
+
+ addl 0(%r8),%eax  # fold the previous state into the result and store all five words
+ addl 4(%r8),%esi  # second word comes from esi (ebx holds the rol-5 copy)
+ addl 8(%r8),%ecx
+ movl %eax,0(%r8)
+ addl 12(%r8),%edx
+ movl %esi,4(%r8)
+ addl 16(%r8),%ebp
+ movl %ecx,8(%r8)
+ movl %edx,12(%r8)
+ movl %ebp,16(%r8)
+ movq -40(%r11),%r14  # restore callee-saved registers from the slots below the entry rsp held in r11
+.cfi_restore %r14
+ movq -32(%r11),%r13
+.cfi_restore %r13
+ movq -24(%r11),%r12
+.cfi_restore %r12
+ movq -16(%r11),%rbp
+.cfi_restore %rbp
+ movq -8(%r11),%rbx
+.cfi_restore %rbx
+ leaq (%r11),%rsp  # rsp = entry rsp, discarding the aligned scratch area and the pushes
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3  # encoded "rep ret"
+.cfi_endproc
+.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
+.type sha1_block_data_order_avx2,@function
+.align 16
+sha1_block_data_order_avx2:
+_avx2_shortcut:
+.cfi_startproc
+ movq %rsp,%r11
+.cfi_def_cfa_register %r11
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ vzeroupper
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ leaq -640(%rsp),%rsp
+ shlq $6,%r10
+ leaq 64(%r9),%r13
+ andq $-128,%rsp
+ addq %r9,%r10
+ leaq K_XX_XX+64(%rip),%r14
+
+ movl 0(%r8),%eax
+ cmpq %r10,%r13
+ cmovaeq %r9,%r13
+ movl 4(%r8),%ebp
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl 16(%r8),%esi
+ vmovdqu 64(%r14),%ymm6
+
+ vmovdqu (%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ leaq 64(%r9),%r9
+ vinserti128 $1,(%r13),%ymm0,%ymm0
+ vinserti128 $1,16(%r13),%ymm1,%ymm1
+ vpshufb %ymm6,%ymm0,%ymm0
+ vinserti128 $1,32(%r13),%ymm2,%ymm2
+ vpshufb %ymm6,%ymm1,%ymm1
+ vinserti128 $1,48(%r13),%ymm3,%ymm3
+ vpshufb %ymm6,%ymm2,%ymm2
+ vmovdqu -64(%r14),%ymm11
+ vpshufb %ymm6,%ymm3,%ymm3
+
+ vpaddd %ymm11,%ymm0,%ymm4
+ vpaddd %ymm11,%ymm1,%ymm5
+ vmovdqu %ymm4,0(%rsp)
+ vpaddd %ymm11,%ymm2,%ymm6
+ vmovdqu %ymm5,32(%rsp)
+ vpaddd %ymm11,%ymm3,%ymm7
+ vmovdqu %ymm6,64(%rsp)
+ vmovdqu %ymm7,96(%rsp)
+ vpalignr $8,%ymm0,%ymm1,%ymm4
+ vpsrldq $4,%ymm3,%ymm8
+ vpxor %ymm0,%ymm4,%ymm4
+ vpxor %ymm2,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $31,%ymm4,%ymm8
+ vpslldq $12,%ymm4,%ymm10
+ vpaddd %ymm4,%ymm4,%ymm4
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm4,%ymm4
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm4,%ymm4
+ vpxor %ymm10,%ymm4,%ymm4
+ vpaddd %ymm11,%ymm4,%ymm9
+ vmovdqu %ymm9,128(%rsp)
+ vpalignr $8,%ymm1,%ymm2,%ymm5
+ vpsrldq $4,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm3,%ymm8,%ymm8
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $31,%ymm5,%ymm8
+ vmovdqu -32(%r14),%ymm11
+ vpslldq $12,%ymm5,%ymm10
+ vpaddd %ymm5,%ymm5,%ymm5
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm5,%ymm5
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm10,%ymm5,%ymm5
+ vpaddd %ymm11,%ymm5,%ymm9
+ vmovdqu %ymm9,160(%rsp)
+ vpalignr $8,%ymm2,%ymm3,%ymm6
+ vpsrldq $4,%ymm5,%ymm8
+ vpxor %ymm2,%ymm6,%ymm6
+ vpxor %ymm4,%ymm8,%ymm8
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $31,%ymm6,%ymm8
+ vpslldq $12,%ymm6,%ymm10
+ vpaddd %ymm6,%ymm6,%ymm6
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm6,%ymm6
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm6,%ymm6
+ vpxor %ymm10,%ymm6,%ymm6
+ vpaddd %ymm11,%ymm6,%ymm9
+ vmovdqu %ymm9,192(%rsp)
+ vpalignr $8,%ymm3,%ymm4,%ymm7
+ vpsrldq $4,%ymm6,%ymm8
+ vpxor %ymm3,%ymm7,%ymm7
+ vpxor %ymm5,%ymm8,%ymm8
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm7,%ymm8
+ vpslldq $12,%ymm7,%ymm10
+ vpaddd %ymm7,%ymm7,%ymm7
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm7,%ymm7
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm7,%ymm7
+ vpxor %ymm10,%ymm7,%ymm7
+ vpaddd %ymm11,%ymm7,%ymm9
+ vmovdqu %ymm9,224(%rsp)
+ leaq 128(%rsp),%r13
+ jmp .Loop_avx2
+.align 32
+.Loop_avx2:
+ rorxl $2,%ebp,%ebx
+ andnl %edx,%ebp,%edi
+ andl %ecx,%ebp
+ xorl %edi,%ebp
+ jmp .Lalign32_1
+.align 32
+.Lalign32_1:
+ vpalignr $8,%ymm6,%ymm7,%ymm8
+ vpxor %ymm4,%ymm0,%ymm0
+ addl -128(%r13),%esi
+ andnl %ecx,%eax,%edi
+ vpxor %ymm1,%ymm0,%ymm0
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpxor %ymm8,%ymm0,%ymm0
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ vpsrld $30,%ymm0,%ymm8
+ vpslld $2,%ymm0,%ymm0
+ addl -124(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ vpor %ymm8,%ymm0,%ymm0
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -120(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ vpaddd %ymm11,%ymm0,%ymm9
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ vmovdqu %ymm9,256(%rsp)
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -116(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -96(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ vpalignr $8,%ymm7,%ymm0,%ymm8
+ vpxor %ymm5,%ymm1,%ymm1
+ addl -92(%r13),%eax
+ andnl %edx,%ebp,%edi
+ vpxor %ymm2,%ymm1,%ymm1
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ vpxor %ymm8,%ymm1,%ymm1
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ vpsrld $30,%ymm1,%ymm8
+ vpslld $2,%ymm1,%ymm1
+ addl -88(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ vpor %ymm8,%ymm1,%ymm1
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -84(%r13),%edx
+ andnl %ebx,%esi,%edi
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ vmovdqu %ymm9,288(%rsp)
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -64(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -60(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ vpalignr $8,%ymm0,%ymm1,%ymm8
+ vpxor %ymm6,%ymm2,%ymm2
+ addl -56(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ vpxor %ymm3,%ymm2,%ymm2
+ vmovdqu 0(%r14),%ymm11
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpxor %ymm8,%ymm2,%ymm2
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ vpsrld $30,%ymm2,%ymm8
+ vpslld $2,%ymm2,%ymm2
+ addl -52(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ vpor %ymm8,%ymm2,%ymm2
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -32(%r13),%esi
+ andnl %ecx,%eax,%edi
+ vpaddd %ymm11,%ymm2,%ymm9
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ vmovdqu %ymm9,320(%rsp)
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -28(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -24(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ vpalignr $8,%ymm1,%ymm2,%ymm8
+ vpxor %ymm7,%ymm3,%ymm3
+ addl -20(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ vpxor %ymm4,%ymm3,%ymm3
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpxor %ymm8,%ymm3,%ymm3
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ vpsrld $30,%ymm3,%ymm8
+ vpslld $2,%ymm3,%ymm3
+ addl 0(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ vpor %ymm8,%ymm3,%ymm3
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl 4(%r13),%eax
+ andnl %edx,%ebp,%edi
+ vpaddd %ymm11,%ymm3,%ymm9
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ vmovdqu %ymm9,352(%rsp)
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl 8(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl 12(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vpalignr $8,%ymm2,%ymm3,%ymm8
+ vpxor %ymm0,%ymm4,%ymm4
+ addl 32(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpxor %ymm8,%ymm4,%ymm4
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 36(%r13),%ebx
+ vpsrld $30,%ymm4,%ymm8
+ vpslld $2,%ymm4,%ymm4
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vpor %ymm8,%ymm4,%ymm4
+ addl 40(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpaddd %ymm11,%ymm4,%ymm9
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 44(%r13),%eax
+ vmovdqu %ymm9,384(%rsp)
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpalignr $8,%ymm3,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ addl 68(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm6,%ymm5,%ymm5
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ vpxor %ymm8,%ymm5,%ymm5
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 72(%r13),%ecx
+ vpsrld $30,%ymm5,%ymm8
+ vpslld $2,%ymm5,%ymm5
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ vpor %ymm8,%ymm5,%ymm5
+ addl 76(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpaddd %ymm11,%ymm5,%ymm9
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 96(%r13),%ebp
+ vmovdqu %ymm9,416(%rsp)
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 100(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpalignr $8,%ymm4,%ymm5,%ymm8
+ vpxor %ymm2,%ymm6,%ymm6
+ addl 104(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpxor %ymm7,%ymm6,%ymm6
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ vpxor %ymm8,%ymm6,%ymm6
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 108(%r13),%edx
+ leaq 256(%r13),%r13
+ vpsrld $30,%ymm6,%ymm8
+ vpslld $2,%ymm6,%ymm6
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vpor %ymm8,%ymm6,%ymm6
+ addl -128(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpaddd %ymm11,%ymm6,%ymm9
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -124(%r13),%ebx
+ vmovdqu %ymm9,448(%rsp)
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -120(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpalignr $8,%ymm5,%ymm6,%ymm8
+ vpxor %ymm3,%ymm7,%ymm7
+ addl -116(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ vpxor %ymm0,%ymm7,%ymm7
+ vmovdqu 32(%r14),%ymm11
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ vpxor %ymm8,%ymm7,%ymm7
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -96(%r13),%esi
+ vpsrld $30,%ymm7,%ymm8
+ vpslld $2,%ymm7,%ymm7
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpor %ymm8,%ymm7,%ymm7
+ addl -92(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpaddd %ymm11,%ymm7,%ymm9
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -88(%r13),%ecx
+ vmovdqu %ymm9,480(%rsp)
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -84(%r13),%ebx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ jmp .Lalign32_2
+.align 32
+.Lalign32_2:
+ vpalignr $8,%ymm6,%ymm7,%ymm8
+ vpxor %ymm4,%ymm0,%ymm0
+ addl -64(%r13),%ebp
+ xorl %esi,%ecx
+ vpxor %ymm1,%ymm0,%ymm0
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ vpxor %ymm8,%ymm0,%ymm0
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ vpsrld $30,%ymm0,%ymm8
+ vpslld $2,%ymm0,%ymm0
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -60(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ vpor %ymm8,%ymm0,%ymm0
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ vpaddd %ymm11,%ymm0,%ymm9
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl -56(%r13),%esi
+ xorl %ecx,%ebp
+ vmovdqu %ymm9,512(%rsp)
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl -52(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl -32(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ vpalignr $8,%ymm7,%ymm0,%ymm8
+ vpxor %ymm5,%ymm1,%ymm1
+ addl -28(%r13),%ebx
+ xorl %eax,%edx
+ vpxor %ymm2,%ymm1,%ymm1
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ vpxor %ymm8,%ymm1,%ymm1
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vpsrld $30,%ymm1,%ymm8
+ vpslld $2,%ymm1,%ymm1
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl -24(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ vpor %ymm8,%ymm1,%ymm1
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -20(%r13),%eax
+ xorl %edx,%ebx
+ vmovdqu %ymm9,544(%rsp)
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 0(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl 4(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ vpalignr $8,%ymm0,%ymm1,%ymm8
+ vpxor %ymm6,%ymm2,%ymm2
+ addl 8(%r13),%ecx
+ xorl %ebp,%esi
+ vpxor %ymm3,%ymm2,%ymm2
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ vpxor %ymm8,%ymm2,%ymm2
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpsrld $30,%ymm2,%ymm8
+ vpslld $2,%ymm2,%ymm2
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 12(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ vpor %ymm8,%ymm2,%ymm2
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vpaddd %ymm11,%ymm2,%ymm9
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 32(%r13),%ebp
+ xorl %esi,%ecx
+ vmovdqu %ymm9,576(%rsp)
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 36(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 40(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ vpalignr $8,%ymm1,%ymm2,%ymm8
+ vpxor %ymm7,%ymm3,%ymm3
+ addl 44(%r13),%edx
+ xorl %ebx,%eax
+ vpxor %ymm4,%ymm3,%ymm3
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm8,%ymm3,%ymm3
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ vpsrld $30,%ymm3,%ymm8
+ vpslld $2,%ymm3,%ymm3
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl 64(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ vpor %ymm8,%ymm3,%ymm3
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpaddd %ymm11,%ymm3,%ymm9
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 68(%r13),%ebx
+ xorl %eax,%edx
+ vmovdqu %ymm9,608(%rsp)
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 72(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 76(%r13),%eax
+ xorl %edx,%ebx
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 100(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 104(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 108(%r13),%ebx
+ leaq 256(%r13),%r13
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -128(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -124(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -120(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -116(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -96(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -92(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -88(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -84(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -60(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -56(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -52(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -32(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -28(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -24(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -20(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ addl %r12d,%edx
+ leaq 128(%r9),%r13
+ leaq 128(%r9),%rdi
+ cmpq %r10,%r13
+ cmovaeq %r9,%r13
+
+
+ addl 0(%r8),%edx
+ addl 4(%r8),%esi
+ addl 8(%r8),%ebp
+ movl %edx,0(%r8)
+ addl 12(%r8),%ebx
+ movl %esi,4(%r8)
+ movl %edx,%eax
+ addl 16(%r8),%ecx
+ movl %ebp,%r12d
+ movl %ebp,8(%r8)
+ movl %ebx,%edx
+
+ movl %ebx,12(%r8)
+ movl %esi,%ebp
+ movl %ecx,16(%r8)
+
+ movl %ecx,%esi
+ movl %r12d,%ecx
+
+
+ cmpq %r10,%r9
+ je .Ldone_avx2
+ vmovdqu 64(%r14),%ymm6
+ cmpq %r10,%rdi
+ ja .Last_avx2
+
+ vmovdqu -64(%rdi),%xmm0
+ vmovdqu -48(%rdi),%xmm1
+ vmovdqu -32(%rdi),%xmm2
+ vmovdqu -16(%rdi),%xmm3
+ vinserti128 $1,0(%r13),%ymm0,%ymm0
+ vinserti128 $1,16(%r13),%ymm1,%ymm1
+ vinserti128 $1,32(%r13),%ymm2,%ymm2
+ vinserti128 $1,48(%r13),%ymm3,%ymm3
+ jmp .Last_avx2
+
+.align 32
+.Last_avx2:
+ leaq 128+16(%rsp),%r13
+ rorxl $2,%ebp,%ebx
+ andnl %edx,%ebp,%edi
+ andl %ecx,%ebp
+ xorl %edi,%ebp
+ subq $-128,%r9
+ addl -128(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -124(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -120(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -116(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -96(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl -92(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -88(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -84(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -64(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -60(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -56(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl -52(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -32(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -28(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -24(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -20(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl 0(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl 4(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl 8(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl 12(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 32(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 36(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 40(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 44(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vmovdqu -64(%r14),%ymm11
+ vpshufb %ymm6,%ymm0,%ymm0
+ addl 68(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 72(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 76(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 96(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 100(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpshufb %ymm6,%ymm1,%ymm1
+ vpaddd %ymm11,%ymm0,%ymm8
+ addl 104(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 108(%r13),%edx
+ leaq 256(%r13),%r13
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -128(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -124(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -120(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vmovdqu %ymm8,0(%rsp)
+ vpshufb %ymm6,%ymm2,%ymm2
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl -116(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -92(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -88(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -84(%r13),%ebx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ vmovdqu %ymm9,32(%rsp)
+ vpshufb %ymm6,%ymm3,%ymm3
+ vpaddd %ymm11,%ymm2,%ymm6
+ addl -64(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -60(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl -56(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl -52(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl -32(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ jmp .Lalign32_3
+.align 32
+.Lalign32_3:
+ vmovdqu %ymm6,64(%rsp)
+ vpaddd %ymm11,%ymm3,%ymm7
+ addl -28(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl -24(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -20(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 0(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl 4(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ vmovdqu %ymm7,96(%rsp)
+ addl 8(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 12(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 32(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 36(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 40(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ vpalignr $8,%ymm0,%ymm1,%ymm4
+ addl 44(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ vpsrldq $4,%ymm3,%ymm8
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpxor %ymm0,%ymm4,%ymm4
+ vpxor %ymm2,%ymm8,%ymm8
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpxor %ymm8,%ymm4,%ymm4
+ andl %edi,%esi
+ addl 64(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ vpsrld $31,%ymm4,%ymm8
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ vpslldq $12,%ymm4,%ymm10
+ vpaddd %ymm4,%ymm4,%ymm4
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm4,%ymm4
+ addl %r12d,%ecx
+ andl %edi,%edx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm4,%ymm4
+ addl 68(%r13),%ebx
+ xorl %eax,%edx
+ vpxor %ymm10,%ymm4,%ymm4
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ vpaddd %ymm11,%ymm4,%ymm9
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vmovdqu %ymm9,128(%rsp)
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 72(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 76(%r13),%eax
+ xorl %edx,%ebx
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpalignr $8,%ymm1,%ymm2,%ymm5
+ addl 96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpsrldq $4,%ymm4,%ymm8
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm3,%ymm8,%ymm8
+ addl 100(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm8,%ymm5,%ymm5
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpsrld $31,%ymm5,%ymm8
+ vmovdqu -32(%r14),%ymm11
+ xorl %ebx,%esi
+ addl 104(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ vpslldq $12,%ymm5,%ymm10
+ vpaddd %ymm5,%ymm5,%ymm5
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm5,%ymm5
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm5,%ymm5
+ xorl %ebp,%edx
+ addl 108(%r13),%ebx
+ leaq 256(%r13),%r13
+ vpxor %ymm10,%ymm5,%ymm5
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpaddd %ymm11,%ymm5,%ymm9
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vmovdqu %ymm9,160(%rsp)
+ addl -128(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpalignr $8,%ymm2,%ymm3,%ymm6
+ addl -124(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ vpsrldq $4,%ymm5,%ymm8
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpxor %ymm2,%ymm6,%ymm6
+ vpxor %ymm4,%ymm8,%ymm8
+ addl -120(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpxor %ymm8,%ymm6,%ymm6
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ vpsrld $31,%ymm6,%ymm8
+ xorl %ecx,%eax
+ addl -116(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpslldq $12,%ymm6,%ymm10
+ vpaddd %ymm6,%ymm6,%ymm6
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm6,%ymm6
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm6,%ymm6
+ xorl %ebx,%esi
+ addl -96(%r13),%ecx
+ vpxor %ymm10,%ymm6,%ymm6
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpaddd %ymm11,%ymm6,%ymm9
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ vmovdqu %ymm9,192(%rsp)
+ addl -92(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vpalignr $8,%ymm3,%ymm4,%ymm7
+ addl -88(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpsrldq $4,%ymm6,%ymm8
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpxor %ymm3,%ymm7,%ymm7
+ vpxor %ymm5,%ymm8,%ymm8
+ addl -84(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ vpxor %ymm8,%ymm7,%ymm7
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ vpsrld $31,%ymm7,%ymm8
+ xorl %edx,%ebp
+ addl -64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpslldq $12,%ymm7,%ymm10
+ vpaddd %ymm7,%ymm7,%ymm7
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm7,%ymm7
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm7,%ymm7
+ xorl %ecx,%eax
+ addl -60(%r13),%edx
+ vpxor %ymm10,%ymm7,%ymm7
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpaddd %ymm11,%ymm7,%ymm9
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vmovdqu %ymm9,224(%rsp)
+ addl -56(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -52(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -32(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -28(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -24(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -20(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ addl %r12d,%edx
+ leaq 128(%rsp),%r13
+
+
+ addl 0(%r8),%edx
+ addl 4(%r8),%esi
+ addl 8(%r8),%ebp
+ movl %edx,0(%r8)
+ addl 12(%r8),%ebx
+ movl %esi,4(%r8)
+ movl %edx,%eax
+ addl 16(%r8),%ecx
+ movl %ebp,%r12d
+ movl %ebp,8(%r8)
+ movl %ebx,%edx
+
+ movl %ebx,12(%r8)
+ movl %esi,%ebp
+ movl %ecx,16(%r8)
+
+ movl %ecx,%esi
+ movl %r12d,%ecx
+
+
+ cmpq %r10,%r9
+ jbe .Loop_avx2
+
+.Ldone_avx2: /* Exit path: target of the earlier "je .Ldone_avx2", and fall-through when "jbe .Loop_avx2" is not taken. */
+ vzeroupper /* Zero the upper halves of all ymm registers before control returns to non-AVX code. */
+ movq -40(%r11),%r14 /* Reload the five saved registers from the slots just below the address held in %r11. */
+.cfi_restore %r14
+ movq -32(%r11),%r13
+.cfi_restore %r13
+ movq -24(%r11),%r12
+.cfi_restore %r12
+ movq -16(%r11),%rbp
+.cfi_restore %rbp
+ movq -8(%r11),%rbx
+.cfi_restore %rbx
+ leaq (%r11),%rsp /* %rsp = %r11. NOTE(review): %r11 presumably holds the %rsp value captured by the prologue, which is outside this view - confirm. */
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ .byte 0xf3,0xc3 /* Hand-encoded "rep ret": F3 is the REP prefix, C3 is near RET. */
+.cfi_endproc
+.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
.align 64
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
diff --git a/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S b/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S
index 63dca42029ea..1c77e3d13a8b 100644
--- a/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S
+++ b/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S
@@ -12,6 +12,8 @@ sha256_multi_block:
movq OPENSSL_ia32cap_P+4(%rip),%rcx
btq $61,%rcx
jc _shaext_shortcut
+ testl $268435456,%ecx /* 268435456 = 1<<28: tests bit 28 of the dword at OPENSSL_ia32cap_P+4 (low half of %rcx). NOTE(review): presumably the AVX feature bit - confirm against OPENSSL_ia32cap docs. */
+ jnz _avx_shortcut /* Bit set: continue in sha256_multi_block_avx with %rcx unchanged. */
movq %rsp,%rax
.cfi_def_cfa_register %rax
pushq %rbx
@@ -3125,6 +3127,4676 @@ _shaext_shortcut:
.byte 0xf3,0xc3
.cfi_endproc
.size sha256_multi_block_shaext,.-sha256_multi_block_shaext
+.type sha256_multi_block_avx,@function
+.align 32
+sha256_multi_block_avx:
+.cfi_startproc
+_avx_shortcut:
+ shrq $32,%rcx
+ cmpl $2,%edx
+ jb .Lavx
+ testl $32,%ecx
+ jnz _avx2_shortcut
+ jmp .Lavx
+.align 32
+.Lavx:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ subq $288,%rsp
+ andq $-256,%rsp
+ movq %rax,272(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08
+.Lbody_avx:
+ leaq K256+128(%rip),%rbp
+ leaq 256(%rsp),%rbx
+ leaq 128(%rdi),%rdi
+
+.Loop_grande_avx:
+ movl %edx,280(%rsp)
+ xorl %edx,%edx
+ movq 0(%rsi),%r8
+ movl 8(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,0(%rbx)
+ cmovleq %rbp,%r8
+ movq 16(%rsi),%r9
+ movl 24(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,4(%rbx)
+ cmovleq %rbp,%r9
+ movq 32(%rsi),%r10
+ movl 40(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,8(%rbx)
+ cmovleq %rbp,%r10
+ movq 48(%rsi),%r11
+ movl 56(%rsi),%ecx
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx
+ testl %ecx,%ecx
+ movl %ecx,12(%rbx)
+ cmovleq %rbp,%r11
+ testl %edx,%edx
+ jz .Ldone_avx
+
+ vmovdqu 0-128(%rdi),%xmm8
+ leaq 128(%rsp),%rax
+ vmovdqu 32-128(%rdi),%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ vmovdqu 96-128(%rdi),%xmm11
+ vmovdqu 128-128(%rdi),%xmm12
+ vmovdqu 160-128(%rdi),%xmm13
+ vmovdqu 192-128(%rdi),%xmm14
+ vmovdqu 224-128(%rdi),%xmm15
+ vmovdqu .Lpbswap(%rip),%xmm6
+ jmp .Loop_avx
+
+.align 32
+.Loop_avx:
+ vpxor %xmm9,%xmm10,%xmm4
+ vmovd 0(%r8),%xmm5
+ vmovd 0(%r9),%xmm0
+ vpinsrd $1,0(%r10),%xmm5,%xmm5
+ vpinsrd $1,0(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm12,%xmm7
+ vpslld $26,%xmm12,%xmm2
+ vmovdqu %xmm5,0-128(%rax)
+ vpaddd %xmm15,%xmm5,%xmm5
+
+ vpsrld $11,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm12,%xmm2
+ vpaddd -128(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm12,%xmm2
+ vpandn %xmm14,%xmm12,%xmm0
+ vpand %xmm13,%xmm12,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm8,%xmm15
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm8,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm8,%xmm9,%xmm3
+
+ vpxor %xmm1,%xmm15,%xmm15
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm8,%xmm1
+
+ vpslld $19,%xmm8,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm15,%xmm7
+
+ vpsrld $22,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm8,%xmm2
+ vpxor %xmm4,%xmm9,%xmm15
+ vpaddd %xmm5,%xmm11,%xmm11
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm15,%xmm15
+ vpaddd %xmm7,%xmm15,%xmm15
+ vmovd 4(%r8),%xmm5
+ vmovd 4(%r9),%xmm0
+ vpinsrd $1,4(%r10),%xmm5,%xmm5
+ vpinsrd $1,4(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm11,%xmm7
+ vpslld $26,%xmm11,%xmm2
+ vmovdqu %xmm5,16-128(%rax)
+ vpaddd %xmm14,%xmm5,%xmm5
+
+ vpsrld $11,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm11,%xmm2
+ vpaddd -96(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm11,%xmm2
+ vpandn %xmm13,%xmm11,%xmm0
+ vpand %xmm12,%xmm11,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm15,%xmm14
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm15,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm15,%xmm8,%xmm4
+
+ vpxor %xmm1,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm15,%xmm1
+
+ vpslld $19,%xmm15,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm14,%xmm7
+
+ vpsrld $22,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm15,%xmm2
+ vpxor %xmm3,%xmm8,%xmm14
+ vpaddd %xmm5,%xmm10,%xmm10
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm14,%xmm14
+ vmovd 8(%r8),%xmm5
+ vmovd 8(%r9),%xmm0
+ vpinsrd $1,8(%r10),%xmm5,%xmm5
+ vpinsrd $1,8(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm10,%xmm7
+ vpslld $26,%xmm10,%xmm2
+ vmovdqu %xmm5,32-128(%rax)
+ vpaddd %xmm13,%xmm5,%xmm5
+
+ vpsrld $11,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm10,%xmm2
+ vpaddd -64(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm10,%xmm2
+ vpandn %xmm12,%xmm10,%xmm0
+ vpand %xmm11,%xmm10,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm14,%xmm13
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm14,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm14,%xmm15,%xmm3
+
+ vpxor %xmm1,%xmm13,%xmm13
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm14,%xmm1
+
+ vpslld $19,%xmm14,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm13,%xmm7
+
+ vpsrld $22,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm14,%xmm2
+ vpxor %xmm4,%xmm15,%xmm13
+ vpaddd %xmm5,%xmm9,%xmm9
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm13,%xmm13
+ vpaddd %xmm7,%xmm13,%xmm13
+ vmovd 12(%r8),%xmm5
+ vmovd 12(%r9),%xmm0
+ vpinsrd $1,12(%r10),%xmm5,%xmm5
+ vpinsrd $1,12(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm9,%xmm7
+ vpslld $26,%xmm9,%xmm2
+ vmovdqu %xmm5,48-128(%rax)
+ vpaddd %xmm12,%xmm5,%xmm5
+
+ vpsrld $11,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm9,%xmm2
+ vpaddd -32(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm9,%xmm2
+ vpandn %xmm11,%xmm9,%xmm0
+ vpand %xmm10,%xmm9,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm13,%xmm12
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm13,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm13,%xmm14,%xmm4
+
+ vpxor %xmm1,%xmm12,%xmm12
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm13,%xmm1
+
+ vpslld $19,%xmm13,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm12,%xmm7
+
+ vpsrld $22,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm13,%xmm2
+ vpxor %xmm3,%xmm14,%xmm12
+ vpaddd %xmm5,%xmm8,%xmm8
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm12,%xmm12
+ vpaddd %xmm7,%xmm12,%xmm12
+ vmovd 16(%r8),%xmm5
+ vmovd 16(%r9),%xmm0
+ vpinsrd $1,16(%r10),%xmm5,%xmm5
+ vpinsrd $1,16(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm8,%xmm7
+ vpslld $26,%xmm8,%xmm2
+ vmovdqu %xmm5,64-128(%rax)
+ vpaddd %xmm11,%xmm5,%xmm5
+
+ vpsrld $11,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm8,%xmm2
+ vpaddd 0(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm8,%xmm2
+ vpandn %xmm10,%xmm8,%xmm0
+ vpand %xmm9,%xmm8,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm12,%xmm11
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm12,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm12,%xmm13,%xmm3
+
+ vpxor %xmm1,%xmm11,%xmm11
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm12,%xmm1
+
+ vpslld $19,%xmm12,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm11,%xmm7
+
+ vpsrld $22,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm12,%xmm2
+ vpxor %xmm4,%xmm13,%xmm11
+ vpaddd %xmm5,%xmm15,%xmm15
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm11,%xmm11
+ vpaddd %xmm7,%xmm11,%xmm11
+ vmovd 20(%r8),%xmm5
+ vmovd 20(%r9),%xmm0
+ vpinsrd $1,20(%r10),%xmm5,%xmm5
+ vpinsrd $1,20(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm15,%xmm7
+ vpslld $26,%xmm15,%xmm2
+ vmovdqu %xmm5,80-128(%rax)
+ vpaddd %xmm10,%xmm5,%xmm5
+
+ vpsrld $11,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm15,%xmm2
+ vpaddd 32(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm15,%xmm2
+ vpandn %xmm9,%xmm15,%xmm0
+ vpand %xmm8,%xmm15,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm11,%xmm10
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm11,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm11,%xmm12,%xmm4
+
+ vpxor %xmm1,%xmm10,%xmm10
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm11,%xmm1
+
+ vpslld $19,%xmm11,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm10,%xmm7
+
+ vpsrld $22,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm11,%xmm2
+ vpxor %xmm3,%xmm12,%xmm10
+ vpaddd %xmm5,%xmm14,%xmm14
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm10,%xmm10
+ vpaddd %xmm7,%xmm10,%xmm10
+ vmovd 24(%r8),%xmm5
+ vmovd 24(%r9),%xmm0
+ vpinsrd $1,24(%r10),%xmm5,%xmm5
+ vpinsrd $1,24(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm14,%xmm7
+ vpslld $26,%xmm14,%xmm2
+ vmovdqu %xmm5,96-128(%rax)
+ vpaddd %xmm9,%xmm5,%xmm5
+
+ vpsrld $11,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm14,%xmm2
+ vpaddd 64(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm14,%xmm2
+ vpandn %xmm8,%xmm14,%xmm0
+ vpand %xmm15,%xmm14,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm10,%xmm9
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm10,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm10,%xmm11,%xmm3
+
+ vpxor %xmm1,%xmm9,%xmm9
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm10,%xmm1
+
+ vpslld $19,%xmm10,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm9,%xmm7
+
+ vpsrld $22,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm10,%xmm2
+ vpxor %xmm4,%xmm11,%xmm9
+ vpaddd %xmm5,%xmm13,%xmm13
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm9,%xmm9
+ vpaddd %xmm7,%xmm9,%xmm9
+ vmovd 28(%r8),%xmm5
+ vmovd 28(%r9),%xmm0
+ vpinsrd $1,28(%r10),%xmm5,%xmm5
+ vpinsrd $1,28(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm13,%xmm7
+ vpslld $26,%xmm13,%xmm2
+ vmovdqu %xmm5,112-128(%rax)
+ vpaddd %xmm8,%xmm5,%xmm5
+
+ vpsrld $11,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm13,%xmm2
+ vpaddd 96(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm13,%xmm2
+ vpandn %xmm15,%xmm13,%xmm0
+ vpand %xmm14,%xmm13,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm9,%xmm8
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm9,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm9,%xmm10,%xmm4
+
+ vpxor %xmm1,%xmm8,%xmm8
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm9,%xmm1
+
+ vpslld $19,%xmm9,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm8,%xmm7
+
+ vpsrld $22,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm9,%xmm2
+ vpxor %xmm3,%xmm10,%xmm8
+ vpaddd %xmm5,%xmm12,%xmm12
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm8,%xmm8
+ vpaddd %xmm7,%xmm8,%xmm8
+ addq $256,%rbp
+ vmovd 32(%r8),%xmm5
+ vmovd 32(%r9),%xmm0
+ vpinsrd $1,32(%r10),%xmm5,%xmm5
+ vpinsrd $1,32(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm12,%xmm7
+ vpslld $26,%xmm12,%xmm2
+ vmovdqu %xmm5,128-128(%rax)
+ vpaddd %xmm15,%xmm5,%xmm5
+
+ vpsrld $11,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm12,%xmm2
+ vpaddd -128(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm12,%xmm2
+ vpandn %xmm14,%xmm12,%xmm0
+ vpand %xmm13,%xmm12,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm8,%xmm15
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm8,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm8,%xmm9,%xmm3
+
+ vpxor %xmm1,%xmm15,%xmm15
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm8,%xmm1
+
+ vpslld $19,%xmm8,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm15,%xmm7
+
+ vpsrld $22,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm8,%xmm2
+ vpxor %xmm4,%xmm9,%xmm15
+ vpaddd %xmm5,%xmm11,%xmm11
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm15,%xmm15
+ vpaddd %xmm7,%xmm15,%xmm15
+ vmovd 36(%r8),%xmm5
+ vmovd 36(%r9),%xmm0
+ vpinsrd $1,36(%r10),%xmm5,%xmm5
+ vpinsrd $1,36(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm11,%xmm7
+ vpslld $26,%xmm11,%xmm2
+ vmovdqu %xmm5,144-128(%rax)
+ vpaddd %xmm14,%xmm5,%xmm5
+
+ vpsrld $11,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm11,%xmm2
+ vpaddd -96(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm11,%xmm2
+ vpandn %xmm13,%xmm11,%xmm0
+ vpand %xmm12,%xmm11,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm15,%xmm14
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm15,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm15,%xmm8,%xmm4
+
+ vpxor %xmm1,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm15,%xmm1
+
+ vpslld $19,%xmm15,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm14,%xmm7
+
+ vpsrld $22,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm15,%xmm2
+ vpxor %xmm3,%xmm8,%xmm14
+ vpaddd %xmm5,%xmm10,%xmm10
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm14,%xmm14
+ vmovd 40(%r8),%xmm5
+ vmovd 40(%r9),%xmm0
+ vpinsrd $1,40(%r10),%xmm5,%xmm5
+ vpinsrd $1,40(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm10,%xmm7
+ vpslld $26,%xmm10,%xmm2
+ vmovdqu %xmm5,160-128(%rax)
+ vpaddd %xmm13,%xmm5,%xmm5
+
+ vpsrld $11,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm10,%xmm2
+ vpaddd -64(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm10,%xmm2
+ vpandn %xmm12,%xmm10,%xmm0
+ vpand %xmm11,%xmm10,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm14,%xmm13
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm14,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm14,%xmm15,%xmm3
+
+ vpxor %xmm1,%xmm13,%xmm13
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm14,%xmm1
+
+ vpslld $19,%xmm14,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm13,%xmm7
+
+ vpsrld $22,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm14,%xmm2
+ vpxor %xmm4,%xmm15,%xmm13
+ vpaddd %xmm5,%xmm9,%xmm9
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm13,%xmm13
+ vpaddd %xmm7,%xmm13,%xmm13
+ vmovd 44(%r8),%xmm5
+ vmovd 44(%r9),%xmm0
+ vpinsrd $1,44(%r10),%xmm5,%xmm5
+ vpinsrd $1,44(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm9,%xmm7
+ vpslld $26,%xmm9,%xmm2
+ vmovdqu %xmm5,176-128(%rax)
+ vpaddd %xmm12,%xmm5,%xmm5
+
+ vpsrld $11,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm9,%xmm2
+ vpaddd -32(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm9,%xmm2
+ vpandn %xmm11,%xmm9,%xmm0
+ vpand %xmm10,%xmm9,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm13,%xmm12
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm13,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm13,%xmm14,%xmm4
+
+ vpxor %xmm1,%xmm12,%xmm12
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm13,%xmm1
+
+ vpslld $19,%xmm13,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm12,%xmm7
+
+ vpsrld $22,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm13,%xmm2
+ vpxor %xmm3,%xmm14,%xmm12
+ vpaddd %xmm5,%xmm8,%xmm8
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm12,%xmm12
+ vpaddd %xmm7,%xmm12,%xmm12
+ vmovd 48(%r8),%xmm5
+ vmovd 48(%r9),%xmm0
+ vpinsrd $1,48(%r10),%xmm5,%xmm5
+ vpinsrd $1,48(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm8,%xmm7
+ vpslld $26,%xmm8,%xmm2
+ vmovdqu %xmm5,192-128(%rax)
+ vpaddd %xmm11,%xmm5,%xmm5
+
+ vpsrld $11,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm8,%xmm2
+ vpaddd 0(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm8,%xmm2
+ vpandn %xmm10,%xmm8,%xmm0
+ vpand %xmm9,%xmm8,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm12,%xmm11
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm12,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm12,%xmm13,%xmm3
+
+ vpxor %xmm1,%xmm11,%xmm11
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm12,%xmm1
+
+ vpslld $19,%xmm12,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm11,%xmm7
+
+ vpsrld $22,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm12,%xmm2
+ vpxor %xmm4,%xmm13,%xmm11
+ vpaddd %xmm5,%xmm15,%xmm15
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm11,%xmm11
+ vpaddd %xmm7,%xmm11,%xmm11
+ vmovd 52(%r8),%xmm5
+ vmovd 52(%r9),%xmm0
+ vpinsrd $1,52(%r10),%xmm5,%xmm5
+ vpinsrd $1,52(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm15,%xmm7
+ vpslld $26,%xmm15,%xmm2
+ vmovdqu %xmm5,208-128(%rax)
+ vpaddd %xmm10,%xmm5,%xmm5
+
+ vpsrld $11,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm15,%xmm2
+ vpaddd 32(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm15,%xmm2
+ vpandn %xmm9,%xmm15,%xmm0
+ vpand %xmm8,%xmm15,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm11,%xmm10
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm11,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm11,%xmm12,%xmm4
+
+ vpxor %xmm1,%xmm10,%xmm10
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm11,%xmm1
+
+ vpslld $19,%xmm11,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm10,%xmm7
+
+ vpsrld $22,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm11,%xmm2
+ vpxor %xmm3,%xmm12,%xmm10
+ vpaddd %xmm5,%xmm14,%xmm14
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm10,%xmm10
+ vpaddd %xmm7,%xmm10,%xmm10
+ vmovd 56(%r8),%xmm5
+ vmovd 56(%r9),%xmm0
+ vpinsrd $1,56(%r10),%xmm5,%xmm5
+ vpinsrd $1,56(%r11),%xmm0,%xmm0
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm14,%xmm7
+ vpslld $26,%xmm14,%xmm2
+ vmovdqu %xmm5,224-128(%rax)
+ vpaddd %xmm9,%xmm5,%xmm5
+
+ vpsrld $11,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm14,%xmm2
+ vpaddd 64(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm14,%xmm2
+ vpandn %xmm8,%xmm14,%xmm0
+ vpand %xmm15,%xmm14,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm10,%xmm9
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm10,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm10,%xmm11,%xmm3
+
+ vpxor %xmm1,%xmm9,%xmm9
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm10,%xmm1
+
+ vpslld $19,%xmm10,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm9,%xmm7
+
+ vpsrld $22,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm10,%xmm2
+ vpxor %xmm4,%xmm11,%xmm9
+ vpaddd %xmm5,%xmm13,%xmm13
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm9,%xmm9
+ vpaddd %xmm7,%xmm9,%xmm9
+ vmovd 60(%r8),%xmm5
+ leaq 64(%r8),%r8
+ vmovd 60(%r9),%xmm0
+ leaq 64(%r9),%r9
+ vpinsrd $1,60(%r10),%xmm5,%xmm5
+ leaq 64(%r10),%r10
+ vpinsrd $1,60(%r11),%xmm0,%xmm0
+ leaq 64(%r11),%r11
+ vpunpckldq %xmm0,%xmm5,%xmm5
+ vpshufb %xmm6,%xmm5,%xmm5
+ vpsrld $6,%xmm13,%xmm7
+ vpslld $26,%xmm13,%xmm2
+ vmovdqu %xmm5,240-128(%rax)
+ vpaddd %xmm8,%xmm5,%xmm5
+
+ vpsrld $11,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm13,%xmm2
+ vpaddd 96(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ prefetcht0 63(%r8)
+ vpslld $7,%xmm13,%xmm2
+ vpandn %xmm15,%xmm13,%xmm0
+ vpand %xmm14,%xmm13,%xmm4
+ prefetcht0 63(%r9)
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm9,%xmm8
+ vpxor %xmm2,%xmm7,%xmm7
+ prefetcht0 63(%r10)
+ vpslld $30,%xmm9,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm9,%xmm10,%xmm4
+ prefetcht0 63(%r11)
+ vpxor %xmm1,%xmm8,%xmm8
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm9,%xmm1
+
+ vpslld $19,%xmm9,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm8,%xmm7
+
+ vpsrld $22,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm9,%xmm2
+ vpxor %xmm3,%xmm10,%xmm8
+ vpaddd %xmm5,%xmm12,%xmm12
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm8,%xmm8
+ vpaddd %xmm7,%xmm8,%xmm8
+ addq $256,%rbp
+ vmovdqu 0-128(%rax),%xmm5
+ movl $3,%ecx
+ jmp .Loop_16_xx_avx
+.align 32
+.Loop_16_xx_avx:
+ vmovdqu 16-128(%rax),%xmm6
+ vpaddd 144-128(%rax),%xmm5,%xmm5
+
+ vpsrld $3,%xmm6,%xmm7
+ vpsrld $7,%xmm6,%xmm1
+ vpslld $25,%xmm6,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm6,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm6,%xmm2
+ vmovdqu 224-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm1,%xmm3,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpsrld $6,%xmm12,%xmm7
+ vpslld $26,%xmm12,%xmm2
+ vmovdqu %xmm5,0-128(%rax)
+ vpaddd %xmm15,%xmm5,%xmm5
+
+ vpsrld $11,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm12,%xmm2
+ vpaddd -128(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm12,%xmm2
+ vpandn %xmm14,%xmm12,%xmm0
+ vpand %xmm13,%xmm12,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm8,%xmm15
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm8,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm8,%xmm9,%xmm3
+
+ vpxor %xmm1,%xmm15,%xmm15
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm8,%xmm1
+
+ vpslld $19,%xmm8,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm15,%xmm7
+
+ vpsrld $22,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm8,%xmm2
+ vpxor %xmm4,%xmm9,%xmm15
+ vpaddd %xmm5,%xmm11,%xmm11
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm15,%xmm15
+ vpaddd %xmm7,%xmm15,%xmm15
+ vmovdqu 32-128(%rax),%xmm5
+ vpaddd 160-128(%rax),%xmm6,%xmm6
+
+ vpsrld $3,%xmm5,%xmm7
+ vpsrld $7,%xmm5,%xmm1
+ vpslld $25,%xmm5,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm5,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm5,%xmm2
+ vmovdqu 240-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpsrld $6,%xmm11,%xmm7
+ vpslld $26,%xmm11,%xmm2
+ vmovdqu %xmm6,16-128(%rax)
+ vpaddd %xmm14,%xmm6,%xmm6
+
+ vpsrld $11,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm11,%xmm2
+ vpaddd -96(%rbp),%xmm6,%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm11,%xmm2
+ vpandn %xmm13,%xmm11,%xmm0
+ vpand %xmm12,%xmm11,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm15,%xmm14
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm15,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm15,%xmm8,%xmm4
+
+ vpxor %xmm1,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm6,%xmm6
+
+ vpsrld $13,%xmm15,%xmm1
+
+ vpslld $19,%xmm15,%xmm2
+ vpaddd %xmm0,%xmm6,%xmm6
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm14,%xmm7
+
+ vpsrld $22,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm15,%xmm2
+ vpxor %xmm3,%xmm8,%xmm14
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm14,%xmm14
+ vmovdqu 48-128(%rax),%xmm6
+ vpaddd 176-128(%rax),%xmm5,%xmm5
+
+ vpsrld $3,%xmm6,%xmm7
+ vpsrld $7,%xmm6,%xmm1
+ vpslld $25,%xmm6,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm6,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm6,%xmm2
+ vmovdqu 0-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm1,%xmm3,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpsrld $6,%xmm10,%xmm7
+ vpslld $26,%xmm10,%xmm2
+ vmovdqu %xmm5,32-128(%rax)
+ vpaddd %xmm13,%xmm5,%xmm5
+
+ vpsrld $11,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm10,%xmm2
+ vpaddd -64(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm10,%xmm2
+ vpandn %xmm12,%xmm10,%xmm0
+ vpand %xmm11,%xmm10,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm14,%xmm13
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm14,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm14,%xmm15,%xmm3
+
+ vpxor %xmm1,%xmm13,%xmm13
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm14,%xmm1
+
+ vpslld $19,%xmm14,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm13,%xmm7
+
+ vpsrld $22,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm14,%xmm2
+ vpxor %xmm4,%xmm15,%xmm13
+ vpaddd %xmm5,%xmm9,%xmm9
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm13,%xmm13
+ vpaddd %xmm7,%xmm13,%xmm13
+ vmovdqu 64-128(%rax),%xmm5
+ vpaddd 192-128(%rax),%xmm6,%xmm6
+
+ vpsrld $3,%xmm5,%xmm7
+ vpsrld $7,%xmm5,%xmm1
+ vpslld $25,%xmm5,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm5,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm5,%xmm2
+ vmovdqu 16-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpsrld $6,%xmm9,%xmm7
+ vpslld $26,%xmm9,%xmm2
+ vmovdqu %xmm6,48-128(%rax)
+ vpaddd %xmm12,%xmm6,%xmm6
+
+ vpsrld $11,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm9,%xmm2
+ vpaddd -32(%rbp),%xmm6,%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm9,%xmm2
+ vpandn %xmm11,%xmm9,%xmm0
+ vpand %xmm10,%xmm9,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm13,%xmm12
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm13,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm13,%xmm14,%xmm4
+
+ vpxor %xmm1,%xmm12,%xmm12
+ vpaddd %xmm7,%xmm6,%xmm6
+
+ vpsrld $13,%xmm13,%xmm1
+
+ vpslld $19,%xmm13,%xmm2
+ vpaddd %xmm0,%xmm6,%xmm6
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm12,%xmm7
+
+ vpsrld $22,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm13,%xmm2
+ vpxor %xmm3,%xmm14,%xmm12
+ vpaddd %xmm6,%xmm8,%xmm8
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpaddd %xmm7,%xmm12,%xmm12
+ vmovdqu 80-128(%rax),%xmm6
+ vpaddd 208-128(%rax),%xmm5,%xmm5
+
+ vpsrld $3,%xmm6,%xmm7
+ vpsrld $7,%xmm6,%xmm1
+ vpslld $25,%xmm6,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm6,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm6,%xmm2
+ vmovdqu 32-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm1,%xmm3,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpsrld $6,%xmm8,%xmm7
+ vpslld $26,%xmm8,%xmm2
+ vmovdqu %xmm5,64-128(%rax)
+ vpaddd %xmm11,%xmm5,%xmm5
+
+ vpsrld $11,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm8,%xmm2
+ vpaddd 0(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm8,%xmm2
+ vpandn %xmm10,%xmm8,%xmm0
+ vpand %xmm9,%xmm8,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm12,%xmm11
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm12,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm12,%xmm13,%xmm3
+
+ vpxor %xmm1,%xmm11,%xmm11
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm12,%xmm1
+
+ vpslld $19,%xmm12,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm11,%xmm7
+
+ vpsrld $22,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm12,%xmm2
+ vpxor %xmm4,%xmm13,%xmm11
+ vpaddd %xmm5,%xmm15,%xmm15
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm11,%xmm11
+ vpaddd %xmm7,%xmm11,%xmm11
+ vmovdqu 96-128(%rax),%xmm5
+ vpaddd 224-128(%rax),%xmm6,%xmm6
+
+ vpsrld $3,%xmm5,%xmm7
+ vpsrld $7,%xmm5,%xmm1
+ vpslld $25,%xmm5,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm5,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm5,%xmm2
+ vmovdqu 48-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpsrld $6,%xmm15,%xmm7
+ vpslld $26,%xmm15,%xmm2
+ vmovdqu %xmm6,80-128(%rax)
+ vpaddd %xmm10,%xmm6,%xmm6
+
+ vpsrld $11,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm15,%xmm2
+ vpaddd 32(%rbp),%xmm6,%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm15,%xmm2
+ vpandn %xmm9,%xmm15,%xmm0
+ vpand %xmm8,%xmm15,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm11,%xmm10
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm11,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm11,%xmm12,%xmm4
+
+ vpxor %xmm1,%xmm10,%xmm10
+ vpaddd %xmm7,%xmm6,%xmm6
+
+ vpsrld $13,%xmm11,%xmm1
+
+ vpslld $19,%xmm11,%xmm2
+ vpaddd %xmm0,%xmm6,%xmm6
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm10,%xmm7
+
+ vpsrld $22,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm11,%xmm2
+ vpxor %xmm3,%xmm12,%xmm10
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpaddd %xmm7,%xmm10,%xmm10
+ vmovdqu 112-128(%rax),%xmm6
+ vpaddd 240-128(%rax),%xmm5,%xmm5
+
+ vpsrld $3,%xmm6,%xmm7
+ vpsrld $7,%xmm6,%xmm1
+ vpslld $25,%xmm6,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm6,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm6,%xmm2
+ vmovdqu 64-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm1,%xmm3,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpsrld $6,%xmm14,%xmm7
+ vpslld $26,%xmm14,%xmm2
+ vmovdqu %xmm5,96-128(%rax)
+ vpaddd %xmm9,%xmm5,%xmm5
+
+ vpsrld $11,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm14,%xmm2
+ vpaddd 64(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm14,%xmm2
+ vpandn %xmm8,%xmm14,%xmm0
+ vpand %xmm15,%xmm14,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm10,%xmm9
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm10,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm10,%xmm11,%xmm3
+
+ vpxor %xmm1,%xmm9,%xmm9
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm10,%xmm1
+
+ vpslld $19,%xmm10,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm9,%xmm7
+
+ vpsrld $22,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm10,%xmm2
+ vpxor %xmm4,%xmm11,%xmm9
+ vpaddd %xmm5,%xmm13,%xmm13
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm9,%xmm9
+ vpaddd %xmm7,%xmm9,%xmm9
+ vmovdqu 128-128(%rax),%xmm5
+ vpaddd 0-128(%rax),%xmm6,%xmm6
+
+ vpsrld $3,%xmm5,%xmm7
+ vpsrld $7,%xmm5,%xmm1
+ vpslld $25,%xmm5,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm5,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm5,%xmm2
+ vmovdqu 80-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpsrld $6,%xmm13,%xmm7
+ vpslld $26,%xmm13,%xmm2
+ vmovdqu %xmm6,112-128(%rax)
+ vpaddd %xmm8,%xmm6,%xmm6
+
+ vpsrld $11,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm13,%xmm2
+ vpaddd 96(%rbp),%xmm6,%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm13,%xmm2
+ vpandn %xmm15,%xmm13,%xmm0
+ vpand %xmm14,%xmm13,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm9,%xmm8
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm9,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm9,%xmm10,%xmm4
+
+ vpxor %xmm1,%xmm8,%xmm8
+ vpaddd %xmm7,%xmm6,%xmm6
+
+ vpsrld $13,%xmm9,%xmm1
+
+ vpslld $19,%xmm9,%xmm2
+ vpaddd %xmm0,%xmm6,%xmm6
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm8,%xmm7
+
+ vpsrld $22,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm9,%xmm2
+ vpxor %xmm3,%xmm10,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm6,%xmm8,%xmm8
+ vpaddd %xmm7,%xmm8,%xmm8
+ addq $256,%rbp
+ vmovdqu 144-128(%rax),%xmm6
+ vpaddd 16-128(%rax),%xmm5,%xmm5
+
+ vpsrld $3,%xmm6,%xmm7
+ vpsrld $7,%xmm6,%xmm1
+ vpslld $25,%xmm6,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm6,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm6,%xmm2
+ vmovdqu 96-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm1,%xmm3,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpsrld $6,%xmm12,%xmm7
+ vpslld $26,%xmm12,%xmm2
+ vmovdqu %xmm5,128-128(%rax)
+ vpaddd %xmm15,%xmm5,%xmm5
+
+ vpsrld $11,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm12,%xmm2
+ vpaddd -128(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm12,%xmm2
+ vpandn %xmm14,%xmm12,%xmm0
+ vpand %xmm13,%xmm12,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm8,%xmm15
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm8,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm8,%xmm9,%xmm3
+
+ vpxor %xmm1,%xmm15,%xmm15
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm8,%xmm1
+
+ vpslld $19,%xmm8,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm15,%xmm7
+
+ vpsrld $22,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm8,%xmm2
+ vpxor %xmm4,%xmm9,%xmm15
+ vpaddd %xmm5,%xmm11,%xmm11
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm15,%xmm15
+ vpaddd %xmm7,%xmm15,%xmm15
+ vmovdqu 160-128(%rax),%xmm5
+ vpaddd 32-128(%rax),%xmm6,%xmm6
+
+ vpsrld $3,%xmm5,%xmm7
+ vpsrld $7,%xmm5,%xmm1
+ vpslld $25,%xmm5,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm5,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm5,%xmm2
+ vmovdqu 112-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpsrld $6,%xmm11,%xmm7
+ vpslld $26,%xmm11,%xmm2
+ vmovdqu %xmm6,144-128(%rax)
+ vpaddd %xmm14,%xmm6,%xmm6
+
+ vpsrld $11,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm11,%xmm2
+ vpaddd -96(%rbp),%xmm6,%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm11,%xmm2
+ vpandn %xmm13,%xmm11,%xmm0
+ vpand %xmm12,%xmm11,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm15,%xmm14
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm15,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm15,%xmm8,%xmm4
+
+ vpxor %xmm1,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm6,%xmm6
+
+ vpsrld $13,%xmm15,%xmm1
+
+ vpslld $19,%xmm15,%xmm2
+ vpaddd %xmm0,%xmm6,%xmm6
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm14,%xmm7
+
+ vpsrld $22,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm15,%xmm2
+ vpxor %xmm3,%xmm8,%xmm14
+ vpaddd %xmm6,%xmm10,%xmm10
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm6,%xmm14,%xmm14
+ vpaddd %xmm7,%xmm14,%xmm14
+ vmovdqu 176-128(%rax),%xmm6
+ vpaddd 48-128(%rax),%xmm5,%xmm5
+
+ vpsrld $3,%xmm6,%xmm7
+ vpsrld $7,%xmm6,%xmm1
+ vpslld $25,%xmm6,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm6,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm6,%xmm2
+ vmovdqu 128-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm1,%xmm3,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpsrld $6,%xmm10,%xmm7
+ vpslld $26,%xmm10,%xmm2
+ vmovdqu %xmm5,160-128(%rax)
+ vpaddd %xmm13,%xmm5,%xmm5
+
+ vpsrld $11,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm10,%xmm2
+ vpaddd -64(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm10,%xmm2
+ vpandn %xmm12,%xmm10,%xmm0
+ vpand %xmm11,%xmm10,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm14,%xmm13
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm14,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm14,%xmm15,%xmm3
+
+ vpxor %xmm1,%xmm13,%xmm13
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm14,%xmm1
+
+ vpslld $19,%xmm14,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm13,%xmm7
+
+ vpsrld $22,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm14,%xmm2
+ vpxor %xmm4,%xmm15,%xmm13
+ vpaddd %xmm5,%xmm9,%xmm9
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm13,%xmm13
+ vpaddd %xmm7,%xmm13,%xmm13
+ vmovdqu 192-128(%rax),%xmm5
+ vpaddd 64-128(%rax),%xmm6,%xmm6
+
+ vpsrld $3,%xmm5,%xmm7
+ vpsrld $7,%xmm5,%xmm1
+ vpslld $25,%xmm5,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm5,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm5,%xmm2
+ vmovdqu 144-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpsrld $6,%xmm9,%xmm7
+ vpslld $26,%xmm9,%xmm2
+ vmovdqu %xmm6,176-128(%rax)
+ vpaddd %xmm12,%xmm6,%xmm6
+
+ vpsrld $11,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm9,%xmm2
+ vpaddd -32(%rbp),%xmm6,%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm9,%xmm2
+ vpandn %xmm11,%xmm9,%xmm0
+ vpand %xmm10,%xmm9,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm13,%xmm12
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm13,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm13,%xmm14,%xmm4
+
+ vpxor %xmm1,%xmm12,%xmm12
+ vpaddd %xmm7,%xmm6,%xmm6
+
+ vpsrld $13,%xmm13,%xmm1
+
+ vpslld $19,%xmm13,%xmm2
+ vpaddd %xmm0,%xmm6,%xmm6
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm12,%xmm7
+
+ vpsrld $22,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm13,%xmm2
+ vpxor %xmm3,%xmm14,%xmm12
+ vpaddd %xmm6,%xmm8,%xmm8
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm6,%xmm12,%xmm12
+ vpaddd %xmm7,%xmm12,%xmm12
+ vmovdqu 208-128(%rax),%xmm6
+ vpaddd 80-128(%rax),%xmm5,%xmm5
+
+ vpsrld $3,%xmm6,%xmm7
+ vpsrld $7,%xmm6,%xmm1
+ vpslld $25,%xmm6,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm6,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm6,%xmm2
+ vmovdqu 160-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm1,%xmm3,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpsrld $6,%xmm8,%xmm7
+ vpslld $26,%xmm8,%xmm2
+ vmovdqu %xmm5,192-128(%rax)
+ vpaddd %xmm11,%xmm5,%xmm5
+
+ vpsrld $11,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm8,%xmm2
+ vpaddd 0(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm8,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm8,%xmm2
+ vpandn %xmm10,%xmm8,%xmm0
+ vpand %xmm9,%xmm8,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm12,%xmm11
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm12,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm12,%xmm13,%xmm3
+
+ vpxor %xmm1,%xmm11,%xmm11
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm12,%xmm1
+
+ vpslld $19,%xmm12,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm11,%xmm7
+
+ vpsrld $22,%xmm12,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm12,%xmm2
+ vpxor %xmm4,%xmm13,%xmm11
+ vpaddd %xmm5,%xmm15,%xmm15
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm11,%xmm11
+ vpaddd %xmm7,%xmm11,%xmm11
+ vmovdqu 224-128(%rax),%xmm5
+ vpaddd 96-128(%rax),%xmm6,%xmm6
+
+ vpsrld $3,%xmm5,%xmm7
+ vpsrld $7,%xmm5,%xmm1
+ vpslld $25,%xmm5,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm5,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm5,%xmm2
+ vmovdqu 176-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpsrld $6,%xmm15,%xmm7
+ vpslld $26,%xmm15,%xmm2
+ vmovdqu %xmm6,208-128(%rax)
+ vpaddd %xmm10,%xmm6,%xmm6
+
+ vpsrld $11,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm15,%xmm2
+ vpaddd 32(%rbp),%xmm6,%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm15,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm15,%xmm2
+ vpandn %xmm9,%xmm15,%xmm0
+ vpand %xmm8,%xmm15,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm11,%xmm10
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm11,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm11,%xmm12,%xmm4
+
+ vpxor %xmm1,%xmm10,%xmm10
+ vpaddd %xmm7,%xmm6,%xmm6
+
+ vpsrld $13,%xmm11,%xmm1
+
+ vpslld $19,%xmm11,%xmm2
+ vpaddd %xmm0,%xmm6,%xmm6
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm10,%xmm7
+
+ vpsrld $22,%xmm11,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm11,%xmm2
+ vpxor %xmm3,%xmm12,%xmm10
+ vpaddd %xmm6,%xmm14,%xmm14
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm6,%xmm10,%xmm10
+ vpaddd %xmm7,%xmm10,%xmm10
+ vmovdqu 240-128(%rax),%xmm6
+ vpaddd 112-128(%rax),%xmm5,%xmm5
+
+ vpsrld $3,%xmm6,%xmm7
+ vpsrld $7,%xmm6,%xmm1
+ vpslld $25,%xmm6,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm6,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm6,%xmm2
+ vmovdqu 192-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm1,%xmm3,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpsrld $6,%xmm14,%xmm7
+ vpslld $26,%xmm14,%xmm2
+ vmovdqu %xmm5,224-128(%rax)
+ vpaddd %xmm9,%xmm5,%xmm5
+
+ vpsrld $11,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm14,%xmm2
+ vpaddd 64(%rbp),%xmm5,%xmm5
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm14,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm14,%xmm2
+ vpandn %xmm8,%xmm14,%xmm0
+ vpand %xmm15,%xmm14,%xmm3
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm10,%xmm9
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm10,%xmm1
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm10,%xmm11,%xmm3
+
+ vpxor %xmm1,%xmm9,%xmm9
+ vpaddd %xmm7,%xmm5,%xmm5
+
+ vpsrld $13,%xmm10,%xmm1
+
+ vpslld $19,%xmm10,%xmm2
+ vpaddd %xmm0,%xmm5,%xmm5
+ vpand %xmm3,%xmm4,%xmm4
+
+ vpxor %xmm1,%xmm9,%xmm7
+
+ vpsrld $22,%xmm10,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm10,%xmm2
+ vpxor %xmm4,%xmm11,%xmm9
+ vpaddd %xmm5,%xmm13,%xmm13
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm5,%xmm9,%xmm9
+ vpaddd %xmm7,%xmm9,%xmm9
+ vmovdqu 0-128(%rax),%xmm5
+ vpaddd 128-128(%rax),%xmm6,%xmm6
+
+ vpsrld $3,%xmm5,%xmm7
+ vpsrld $7,%xmm5,%xmm1
+ vpslld $25,%xmm5,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $18,%xmm5,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $14,%xmm5,%xmm2
+ vmovdqu 208-128(%rax),%xmm0
+ vpsrld $10,%xmm0,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpsrld $17,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $15,%xmm0,%xmm2
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpxor %xmm1,%xmm4,%xmm7
+ vpsrld $19,%xmm0,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $13,%xmm0,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+ vpaddd %xmm7,%xmm6,%xmm6
+ vpsrld $6,%xmm13,%xmm7
+ vpslld $26,%xmm13,%xmm2
+ vmovdqu %xmm6,240-128(%rax)
+ vpaddd %xmm8,%xmm6,%xmm6
+
+ vpsrld $11,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpslld $21,%xmm13,%xmm2
+ vpaddd 96(%rbp),%xmm6,%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $25,%xmm13,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $7,%xmm13,%xmm2
+ vpandn %xmm15,%xmm13,%xmm0
+ vpand %xmm14,%xmm13,%xmm4
+
+ vpxor %xmm1,%xmm7,%xmm7
+
+ vpsrld $2,%xmm9,%xmm8
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $30,%xmm9,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm9,%xmm10,%xmm4
+
+ vpxor %xmm1,%xmm8,%xmm8
+ vpaddd %xmm7,%xmm6,%xmm6
+
+ vpsrld $13,%xmm9,%xmm1
+
+ vpslld $19,%xmm9,%xmm2
+ vpaddd %xmm0,%xmm6,%xmm6
+ vpand %xmm4,%xmm3,%xmm3
+
+ vpxor %xmm1,%xmm8,%xmm7
+
+ vpsrld $22,%xmm9,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpslld $10,%xmm9,%xmm2
+ vpxor %xmm3,%xmm10,%xmm8
+ vpaddd %xmm6,%xmm12,%xmm12
+
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm2,%xmm7,%xmm7
+
+ vpaddd %xmm6,%xmm8,%xmm8
+ vpaddd %xmm7,%xmm8,%xmm8
+ addq $256,%rbp
+ decl %ecx
+ jnz .Loop_16_xx_avx
+
+ movl $1,%ecx
+ leaq K256+128(%rip),%rbp
+ cmpl 0(%rbx),%ecx
+ cmovgeq %rbp,%r8
+ cmpl 4(%rbx),%ecx
+ cmovgeq %rbp,%r9
+ cmpl 8(%rbx),%ecx
+ cmovgeq %rbp,%r10
+ cmpl 12(%rbx),%ecx
+ cmovgeq %rbp,%r11
+ vmovdqa (%rbx),%xmm7
+ vpxor %xmm0,%xmm0,%xmm0
+ vmovdqa %xmm7,%xmm6
+ vpcmpgtd %xmm0,%xmm6,%xmm6
+ vpaddd %xmm6,%xmm7,%xmm7
+
+ vmovdqu 0-128(%rdi),%xmm0
+ vpand %xmm6,%xmm8,%xmm8
+ vmovdqu 32-128(%rdi),%xmm1
+ vpand %xmm6,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm2
+ vpand %xmm6,%xmm10,%xmm10
+ vmovdqu 96-128(%rdi),%xmm5
+ vpand %xmm6,%xmm11,%xmm11
+ vpaddd %xmm0,%xmm8,%xmm8
+ vmovdqu 128-128(%rdi),%xmm0
+ vpand %xmm6,%xmm12,%xmm12
+ vpaddd %xmm1,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm1
+ vpand %xmm6,%xmm13,%xmm13
+ vpaddd %xmm2,%xmm10,%xmm10
+ vmovdqu 192-128(%rdi),%xmm2
+ vpand %xmm6,%xmm14,%xmm14
+ vpaddd %xmm5,%xmm11,%xmm11
+ vmovdqu 224-128(%rdi),%xmm5
+ vpand %xmm6,%xmm15,%xmm15
+ vpaddd %xmm0,%xmm12,%xmm12
+ vpaddd %xmm1,%xmm13,%xmm13
+ vmovdqu %xmm8,0-128(%rdi)
+ vpaddd %xmm2,%xmm14,%xmm14
+ vmovdqu %xmm9,32-128(%rdi)
+ vpaddd %xmm5,%xmm15,%xmm15
+ vmovdqu %xmm10,64-128(%rdi)
+ vmovdqu %xmm11,96-128(%rdi)
+ vmovdqu %xmm12,128-128(%rdi)
+ vmovdqu %xmm13,160-128(%rdi)
+ vmovdqu %xmm14,192-128(%rdi)
+ vmovdqu %xmm15,224-128(%rdi)
+
+ vmovdqu %xmm7,(%rbx)
+ vmovdqu .Lpbswap(%rip),%xmm6
+ decl %edx
+ jnz .Loop_avx
+
+ movl 280(%rsp),%edx
+ leaq 16(%rdi),%rdi
+ leaq 64(%rsi),%rsi
+ decl %edx
+ jnz .Loop_grande_avx
+
+.Ldone_avx:  /* exit path of sha256_multi_block_avx: unwind the frame and return */
+ movq 272(%rsp),%rax  /* reload the pointer kept in frame slot 272; the CFI on the next line declares CFA = %rax+8, i.e. %rax is the stack pointer as it was at function entry */
+.cfi_def_cfa %rax,8
+ vzeroupper  /* zero the upper halves of all ymm registers before handing control back to the caller */
+ movq -16(%rax),%rbp  /* restore %rbp from the slot 16 bytes below the entry %rsp */
+.cfi_restore %rbp
+ movq -8(%rax),%rbx  /* restore %rbx from the slot 8 bytes below the entry %rsp */
+.cfi_restore %rbx
+ leaq (%rax),%rsp  /* %rsp = %rax: discard the whole frame in one step (lea leaves flags untouched) */
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3  /* hand-encoded "rep ret" (0xf3 = rep prefix, 0xc3 = ret) */
+.cfi_endproc
+.size sha256_multi_block_avx,.-sha256_multi_block_avx  /* symbol size = bytes from the function label up to this point */
+.type sha256_multi_block_avx2,@function
+.align 32
+sha256_multi_block_avx2:  /* AVX2 (ymm) variant; this region is only the prologue, the body continues below */
+.cfi_startproc
+_avx2_shortcut:  /* second label at the same address; presumably a branch target used from elsewhere in the file - TODO confirm */
+ movq %rsp,%rax  /* remember the entry %rsp; from here the CFA is tracked through %rax */
+.cfi_def_cfa_register %rax
+ pushq %rbx  /* save the callee-saved registers this routine uses: %rbx, %rbp, %r12-%r15 */
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ subq $576,%rsp  /* reserve 576 bytes of frame space */
+ andq $-256,%rsp  /* then round %rsp down to a 256-byte boundary */
+ movq %rax,544(%rsp)  /* keep the entry %rsp in frame slot 544 so the realigned frame can be undone later */
+.cfi_escape 0x0f,0x06,0x77,0xa0,0x04,0x06,0x23,0x08  /* DW_CFA_def_cfa_expression, 6 bytes: DW_OP_breg7(%rsp)+544, DW_OP_deref, DW_OP_plus_uconst 8 => CFA = *(%rsp+544) + 8 */
+.Lbody_avx2:
+ leaq K256+128(%rip),%rbp  /* %rbp = address of K256 biased by +128 (RIP-relative); the rounds below address it as -128(%rbp) .. 96(%rbp) */
+ leaq 128(%rdi),%rdi  /* bias %rdi by +128 as well; later accesses are written as N-128(%rdi) */
+
+.Loop_grande_avx2:  /* per-pass setup: read eight 16-byte descriptors at %rsi (pointer at +0, 32-bit count at +8), one per lane */
+ movl %edx,552(%rsp)  /* park the incoming %edx in frame slot 552 before %edx is reused */
+ xorl %edx,%edx  /* %edx = 0; it accumulates the signed maximum of the eight lane counts */
+ leaq 512(%rsp),%rbx  /* %rbx -> frame offset 512, where the eight 32-bit lane counts are written (0..28(%rbx)) */
+ movq 0(%rsi),%r12  /* lane 0: 64-bit pointer; the rounds below load through it (e.g. vmovd 0(%r12)) */
+ movl 8(%rsi),%ecx  /* lane 0: 32-bit count */
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx  /* %edx = max(%edx, count), signed compare */
+ testl %ecx,%ecx  /* set flags for the cmovle two lines down (the movl in between leaves flags alone) */
+ movl %ecx,0(%rbx)  /* record the lane 0 count */
+ cmovleq %rbp,%r12  /* count <= 0: redirect the lane pointer to %rbp (K256+128); presumably so an idle lane still reads valid memory - confirm */
+ movq 16(%rsi),%r13  /* lane 1 pointer */
+ movl 24(%rsi),%ecx  /* lane 1 count */
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx  /* update running maximum */
+ testl %ecx,%ecx
+ movl %ecx,4(%rbx)  /* record the lane 1 count */
+ cmovleq %rbp,%r13  /* idle lane -> K256+128 */
+ movq 32(%rsi),%r14  /* lane 2 pointer */
+ movl 40(%rsi),%ecx  /* lane 2 count */
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx  /* update running maximum */
+ testl %ecx,%ecx
+ movl %ecx,8(%rbx)  /* record the lane 2 count */
+ cmovleq %rbp,%r14  /* idle lane -> K256+128 */
+ movq 48(%rsi),%r15  /* lane 3 pointer */
+ movl 56(%rsi),%ecx  /* lane 3 count */
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx  /* update running maximum */
+ testl %ecx,%ecx
+ movl %ecx,12(%rbx)  /* record the lane 3 count */
+ cmovleq %rbp,%r15  /* idle lane -> K256+128 */
+ movq 64(%rsi),%r8  /* lane 4 pointer */
+ movl 72(%rsi),%ecx  /* lane 4 count */
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx  /* update running maximum */
+ testl %ecx,%ecx
+ movl %ecx,16(%rbx)  /* record the lane 4 count */
+ cmovleq %rbp,%r8  /* idle lane -> K256+128 */
+ movq 80(%rsi),%r9  /* lane 5 pointer */
+ movl 88(%rsi),%ecx  /* lane 5 count */
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx  /* update running maximum */
+ testl %ecx,%ecx
+ movl %ecx,20(%rbx)  /* record the lane 5 count */
+ cmovleq %rbp,%r9  /* idle lane -> K256+128 */
+ movq 96(%rsi),%r10  /* lane 6 pointer */
+ movl 104(%rsi),%ecx  /* lane 6 count */
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx  /* update running maximum */
+ testl %ecx,%ecx
+ movl %ecx,24(%rbx)  /* record the lane 6 count */
+ cmovleq %rbp,%r10  /* idle lane -> K256+128 */
+ movq 112(%rsi),%r11  /* lane 7 pointer */
+ movl 120(%rsi),%ecx  /* lane 7 count */
+ cmpl %edx,%ecx
+ cmovgl %ecx,%edx  /* %edx now holds the largest of the eight counts (or 0 if none is positive) */
+ testl %ecx,%ecx
+ movl %ecx,28(%rbx)  /* record the lane 7 count */
+ cmovleq %rbp,%r11  /* idle lane -> K256+128 */
+ vmovdqu 0-128(%rdi),%ymm8  /* load eight 32-byte rows from the area at %rdi (biased by 128) into %ymm8..%ymm15; these are the working registers of the rounds */
+ leaq 128(%rsp),%rax  /* %rax = frame base + 128; the rounds store at N-128(%rax) */
+ vmovdqu 32-128(%rdi),%ymm9
+ leaq 256+128(%rsp),%rbx  /* %rbx is repointed to frame base + 384 (rounds use N-256-128(%rbx)); the counts written above stay at frame offset 512 */
+ vmovdqu 64-128(%rdi),%ymm10
+ vmovdqu 96-128(%rdi),%ymm11
+ vmovdqu 128-128(%rdi),%ymm12
+ vmovdqu 160-128(%rdi),%ymm13
+ vmovdqu 192-128(%rdi),%ymm14
+ vmovdqu 224-128(%rdi),%ymm15
+ vmovdqu .Lpbswap(%rip),%ymm6  /* %ymm6 = shuffle control from .Lpbswap, applied with vpshufb to each gathered input row below */
+ jmp .Loop_avx2  /* NOTE(review): nothing here tests %edx == 0 (every lane count <= 0) before entering the loop - confirm callers never pass that */
+
+.align 32
+.Loop_avx2:
+ vpxor %ymm9,%ymm10,%ymm4
+ vmovd 0(%r12),%xmm5
+ vmovd 0(%r8),%xmm0
+ vmovd 0(%r13),%xmm1
+ vmovd 0(%r9),%xmm2
+ vpinsrd $1,0(%r14),%xmm5,%xmm5
+ vpinsrd $1,0(%r10),%xmm0,%xmm0
+ vpinsrd $1,0(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,0(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm12,%ymm7
+ vpslld $26,%ymm12,%ymm2
+ vmovdqu %ymm5,0-128(%rax)
+ vpaddd %ymm15,%ymm5,%ymm5
+
+ vpsrld $11,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm12,%ymm2
+ vpaddd -128(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm12,%ymm2
+ vpandn %ymm14,%ymm12,%ymm0
+ vpand %ymm13,%ymm12,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm8,%ymm15
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm8,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm8,%ymm9,%ymm3
+
+ vpxor %ymm1,%ymm15,%ymm15
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm8,%ymm1
+
+ vpslld $19,%ymm8,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm15,%ymm7
+
+ vpsrld $22,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm8,%ymm2
+ vpxor %ymm4,%ymm9,%ymm15
+ vpaddd %ymm5,%ymm11,%ymm11
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm15,%ymm15
+ vpaddd %ymm7,%ymm15,%ymm15
+ vmovd 4(%r12),%xmm5
+ vmovd 4(%r8),%xmm0
+ vmovd 4(%r13),%xmm1
+ vmovd 4(%r9),%xmm2
+ vpinsrd $1,4(%r14),%xmm5,%xmm5
+ vpinsrd $1,4(%r10),%xmm0,%xmm0
+ vpinsrd $1,4(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,4(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm11,%ymm7
+ vpslld $26,%ymm11,%ymm2
+ vmovdqu %ymm5,32-128(%rax)
+ vpaddd %ymm14,%ymm5,%ymm5
+
+ vpsrld $11,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm11,%ymm2
+ vpaddd -96(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm11,%ymm2
+ vpandn %ymm13,%ymm11,%ymm0
+ vpand %ymm12,%ymm11,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm15,%ymm14
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm15,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm15,%ymm8,%ymm4
+
+ vpxor %ymm1,%ymm14,%ymm14
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm15,%ymm1
+
+ vpslld $19,%ymm15,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm14,%ymm7
+
+ vpsrld $22,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm15,%ymm2
+ vpxor %ymm3,%ymm8,%ymm14
+ vpaddd %ymm5,%ymm10,%ymm10
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm14,%ymm14
+ vpaddd %ymm7,%ymm14,%ymm14
+ vmovd 8(%r12),%xmm5
+ vmovd 8(%r8),%xmm0
+ vmovd 8(%r13),%xmm1
+ vmovd 8(%r9),%xmm2
+ vpinsrd $1,8(%r14),%xmm5,%xmm5
+ vpinsrd $1,8(%r10),%xmm0,%xmm0
+ vpinsrd $1,8(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,8(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm10,%ymm7
+ vpslld $26,%ymm10,%ymm2
+ vmovdqu %ymm5,64-128(%rax)
+ vpaddd %ymm13,%ymm5,%ymm5
+
+ vpsrld $11,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm10,%ymm2
+ vpaddd -64(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm10,%ymm2
+ vpandn %ymm12,%ymm10,%ymm0
+ vpand %ymm11,%ymm10,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm14,%ymm13
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm14,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm14,%ymm15,%ymm3
+
+ vpxor %ymm1,%ymm13,%ymm13
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm14,%ymm1
+
+ vpslld $19,%ymm14,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm13,%ymm7
+
+ vpsrld $22,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm14,%ymm2
+ vpxor %ymm4,%ymm15,%ymm13
+ vpaddd %ymm5,%ymm9,%ymm9
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpaddd %ymm7,%ymm13,%ymm13
+ vmovd 12(%r12),%xmm5
+ vmovd 12(%r8),%xmm0
+ vmovd 12(%r13),%xmm1
+ vmovd 12(%r9),%xmm2
+ vpinsrd $1,12(%r14),%xmm5,%xmm5
+ vpinsrd $1,12(%r10),%xmm0,%xmm0
+ vpinsrd $1,12(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,12(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm9,%ymm7
+ vpslld $26,%ymm9,%ymm2
+ vmovdqu %ymm5,96-128(%rax)
+ vpaddd %ymm12,%ymm5,%ymm5
+
+ vpsrld $11,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm9,%ymm2
+ vpaddd -32(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm9,%ymm2
+ vpandn %ymm11,%ymm9,%ymm0
+ vpand %ymm10,%ymm9,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm13,%ymm12
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm13,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm13,%ymm14,%ymm4
+
+ vpxor %ymm1,%ymm12,%ymm12
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm13,%ymm1
+
+ vpslld $19,%ymm13,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm12,%ymm7
+
+ vpsrld $22,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm13,%ymm2
+ vpxor %ymm3,%ymm14,%ymm12
+ vpaddd %ymm5,%ymm8,%ymm8
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpaddd %ymm7,%ymm12,%ymm12
+ vmovd 16(%r12),%xmm5
+ vmovd 16(%r8),%xmm0
+ vmovd 16(%r13),%xmm1
+ vmovd 16(%r9),%xmm2
+ vpinsrd $1,16(%r14),%xmm5,%xmm5
+ vpinsrd $1,16(%r10),%xmm0,%xmm0
+ vpinsrd $1,16(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,16(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm8,%ymm7
+ vpslld $26,%ymm8,%ymm2
+ vmovdqu %ymm5,128-128(%rax)
+ vpaddd %ymm11,%ymm5,%ymm5
+
+ vpsrld $11,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm8,%ymm2
+ vpaddd 0(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm8,%ymm2
+ vpandn %ymm10,%ymm8,%ymm0
+ vpand %ymm9,%ymm8,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm12,%ymm11
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm12,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm12,%ymm13,%ymm3
+
+ vpxor %ymm1,%ymm11,%ymm11
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm12,%ymm1
+
+ vpslld $19,%ymm12,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm11,%ymm7
+
+ vpsrld $22,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm12,%ymm2
+ vpxor %ymm4,%ymm13,%ymm11
+ vpaddd %ymm5,%ymm15,%ymm15
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm11,%ymm11
+ vpaddd %ymm7,%ymm11,%ymm11
+ vmovd 20(%r12),%xmm5
+ vmovd 20(%r8),%xmm0
+ vmovd 20(%r13),%xmm1
+ vmovd 20(%r9),%xmm2
+ vpinsrd $1,20(%r14),%xmm5,%xmm5
+ vpinsrd $1,20(%r10),%xmm0,%xmm0
+ vpinsrd $1,20(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,20(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm15,%ymm7
+ vpslld $26,%ymm15,%ymm2
+ vmovdqu %ymm5,160-128(%rax)
+ vpaddd %ymm10,%ymm5,%ymm5
+
+ vpsrld $11,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm15,%ymm2
+ vpaddd 32(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm15,%ymm2
+ vpandn %ymm9,%ymm15,%ymm0
+ vpand %ymm8,%ymm15,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm11,%ymm10
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm11,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm11,%ymm12,%ymm4
+
+ vpxor %ymm1,%ymm10,%ymm10
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm11,%ymm1
+
+ vpslld $19,%ymm11,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm10,%ymm7
+
+ vpsrld $22,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm11,%ymm2
+ vpxor %ymm3,%ymm12,%ymm10
+ vpaddd %ymm5,%ymm14,%ymm14
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm10,%ymm10
+ vpaddd %ymm7,%ymm10,%ymm10
+ vmovd 24(%r12),%xmm5
+ vmovd 24(%r8),%xmm0
+ vmovd 24(%r13),%xmm1
+ vmovd 24(%r9),%xmm2
+ vpinsrd $1,24(%r14),%xmm5,%xmm5
+ vpinsrd $1,24(%r10),%xmm0,%xmm0
+ vpinsrd $1,24(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,24(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm14,%ymm7
+ vpslld $26,%ymm14,%ymm2
+ vmovdqu %ymm5,192-128(%rax)
+ vpaddd %ymm9,%ymm5,%ymm5
+
+ vpsrld $11,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm14,%ymm2
+ vpaddd 64(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm14,%ymm2
+ vpandn %ymm8,%ymm14,%ymm0
+ vpand %ymm15,%ymm14,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm10,%ymm9
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm10,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm10,%ymm11,%ymm3
+
+ vpxor %ymm1,%ymm9,%ymm9
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm10,%ymm1
+
+ vpslld $19,%ymm10,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm9,%ymm7
+
+ vpsrld $22,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm10,%ymm2
+ vpxor %ymm4,%ymm11,%ymm9
+ vpaddd %ymm5,%ymm13,%ymm13
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm9,%ymm9
+ vpaddd %ymm7,%ymm9,%ymm9
+ vmovd 28(%r12),%xmm5
+ vmovd 28(%r8),%xmm0
+ vmovd 28(%r13),%xmm1
+ vmovd 28(%r9),%xmm2
+ vpinsrd $1,28(%r14),%xmm5,%xmm5
+ vpinsrd $1,28(%r10),%xmm0,%xmm0
+ vpinsrd $1,28(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,28(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm13,%ymm7
+ vpslld $26,%ymm13,%ymm2
+ vmovdqu %ymm5,224-128(%rax)
+ vpaddd %ymm8,%ymm5,%ymm5
+
+ vpsrld $11,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm13,%ymm2
+ vpaddd 96(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm13,%ymm2
+ vpandn %ymm15,%ymm13,%ymm0
+ vpand %ymm14,%ymm13,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm9,%ymm8
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm9,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm9,%ymm10,%ymm4
+
+ vpxor %ymm1,%ymm8,%ymm8
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm9,%ymm1
+
+ vpslld $19,%ymm9,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm8,%ymm7
+
+ vpsrld $22,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm9,%ymm2
+ vpxor %ymm3,%ymm10,%ymm8
+ vpaddd %ymm5,%ymm12,%ymm12
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm8,%ymm8
+ vpaddd %ymm7,%ymm8,%ymm8
+ addq $256,%rbp
+ vmovd 32(%r12),%xmm5
+ vmovd 32(%r8),%xmm0
+ vmovd 32(%r13),%xmm1
+ vmovd 32(%r9),%xmm2
+ vpinsrd $1,32(%r14),%xmm5,%xmm5
+ vpinsrd $1,32(%r10),%xmm0,%xmm0
+ vpinsrd $1,32(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,32(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm12,%ymm7
+ vpslld $26,%ymm12,%ymm2
+ vmovdqu %ymm5,256-256-128(%rbx)
+ vpaddd %ymm15,%ymm5,%ymm5
+
+ vpsrld $11,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm12,%ymm2
+ vpaddd -128(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm12,%ymm2
+ vpandn %ymm14,%ymm12,%ymm0
+ vpand %ymm13,%ymm12,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm8,%ymm15
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm8,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm8,%ymm9,%ymm3
+
+ vpxor %ymm1,%ymm15,%ymm15
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm8,%ymm1
+
+ vpslld $19,%ymm8,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm15,%ymm7
+
+ vpsrld $22,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm8,%ymm2
+ vpxor %ymm4,%ymm9,%ymm15
+ vpaddd %ymm5,%ymm11,%ymm11
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm15,%ymm15
+ vpaddd %ymm7,%ymm15,%ymm15
+ vmovd 36(%r12),%xmm5
+ vmovd 36(%r8),%xmm0
+ vmovd 36(%r13),%xmm1
+ vmovd 36(%r9),%xmm2
+ vpinsrd $1,36(%r14),%xmm5,%xmm5
+ vpinsrd $1,36(%r10),%xmm0,%xmm0
+ vpinsrd $1,36(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,36(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm11,%ymm7
+ vpslld $26,%ymm11,%ymm2
+ vmovdqu %ymm5,288-256-128(%rbx)
+ vpaddd %ymm14,%ymm5,%ymm5
+
+ vpsrld $11,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm11,%ymm2
+ vpaddd -96(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm11,%ymm2
+ vpandn %ymm13,%ymm11,%ymm0
+ vpand %ymm12,%ymm11,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm15,%ymm14
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm15,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm15,%ymm8,%ymm4
+
+ vpxor %ymm1,%ymm14,%ymm14
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm15,%ymm1
+
+ vpslld $19,%ymm15,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm14,%ymm7
+
+ vpsrld $22,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm15,%ymm2
+ vpxor %ymm3,%ymm8,%ymm14
+ vpaddd %ymm5,%ymm10,%ymm10
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm14,%ymm14
+ vpaddd %ymm7,%ymm14,%ymm14
+ vmovd 40(%r12),%xmm5
+ vmovd 40(%r8),%xmm0
+ vmovd 40(%r13),%xmm1
+ vmovd 40(%r9),%xmm2
+ vpinsrd $1,40(%r14),%xmm5,%xmm5
+ vpinsrd $1,40(%r10),%xmm0,%xmm0
+ vpinsrd $1,40(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,40(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm10,%ymm7
+ vpslld $26,%ymm10,%ymm2
+ vmovdqu %ymm5,320-256-128(%rbx)
+ vpaddd %ymm13,%ymm5,%ymm5
+
+ vpsrld $11,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm10,%ymm2
+ vpaddd -64(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm10,%ymm2
+ vpandn %ymm12,%ymm10,%ymm0
+ vpand %ymm11,%ymm10,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm14,%ymm13
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm14,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm14,%ymm15,%ymm3
+
+ vpxor %ymm1,%ymm13,%ymm13
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm14,%ymm1
+
+ vpslld $19,%ymm14,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm13,%ymm7
+
+ vpsrld $22,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm14,%ymm2
+ vpxor %ymm4,%ymm15,%ymm13
+ vpaddd %ymm5,%ymm9,%ymm9
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpaddd %ymm7,%ymm13,%ymm13
+ vmovd 44(%r12),%xmm5
+ vmovd 44(%r8),%xmm0
+ vmovd 44(%r13),%xmm1
+ vmovd 44(%r9),%xmm2
+ vpinsrd $1,44(%r14),%xmm5,%xmm5
+ vpinsrd $1,44(%r10),%xmm0,%xmm0
+ vpinsrd $1,44(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,44(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm9,%ymm7
+ vpslld $26,%ymm9,%ymm2
+ vmovdqu %ymm5,352-256-128(%rbx)
+ vpaddd %ymm12,%ymm5,%ymm5
+
+ vpsrld $11,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm9,%ymm2
+ vpaddd -32(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm9,%ymm2
+ vpandn %ymm11,%ymm9,%ymm0
+ vpand %ymm10,%ymm9,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm13,%ymm12
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm13,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm13,%ymm14,%ymm4
+
+ vpxor %ymm1,%ymm12,%ymm12
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm13,%ymm1
+
+ vpslld $19,%ymm13,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm12,%ymm7
+
+ vpsrld $22,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm13,%ymm2
+ vpxor %ymm3,%ymm14,%ymm12
+ vpaddd %ymm5,%ymm8,%ymm8
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpaddd %ymm7,%ymm12,%ymm12
+ vmovd 48(%r12),%xmm5
+ vmovd 48(%r8),%xmm0
+ vmovd 48(%r13),%xmm1
+ vmovd 48(%r9),%xmm2
+ vpinsrd $1,48(%r14),%xmm5,%xmm5
+ vpinsrd $1,48(%r10),%xmm0,%xmm0
+ vpinsrd $1,48(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,48(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm8,%ymm7
+ vpslld $26,%ymm8,%ymm2
+ vmovdqu %ymm5,384-256-128(%rbx)
+ vpaddd %ymm11,%ymm5,%ymm5
+
+ vpsrld $11,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm8,%ymm2
+ vpaddd 0(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm8,%ymm2
+ vpandn %ymm10,%ymm8,%ymm0
+ vpand %ymm9,%ymm8,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm12,%ymm11
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm12,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm12,%ymm13,%ymm3
+
+ vpxor %ymm1,%ymm11,%ymm11
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm12,%ymm1
+
+ vpslld $19,%ymm12,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm11,%ymm7
+
+ vpsrld $22,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm12,%ymm2
+ vpxor %ymm4,%ymm13,%ymm11
+ vpaddd %ymm5,%ymm15,%ymm15
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm11,%ymm11
+ vpaddd %ymm7,%ymm11,%ymm11
+ vmovd 52(%r12),%xmm5
+ vmovd 52(%r8),%xmm0
+ vmovd 52(%r13),%xmm1
+ vmovd 52(%r9),%xmm2
+ vpinsrd $1,52(%r14),%xmm5,%xmm5
+ vpinsrd $1,52(%r10),%xmm0,%xmm0
+ vpinsrd $1,52(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,52(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm15,%ymm7
+ vpslld $26,%ymm15,%ymm2
+ vmovdqu %ymm5,416-256-128(%rbx)
+ vpaddd %ymm10,%ymm5,%ymm5
+
+ vpsrld $11,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm15,%ymm2
+ vpaddd 32(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm15,%ymm2
+ vpandn %ymm9,%ymm15,%ymm0
+ vpand %ymm8,%ymm15,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm11,%ymm10
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm11,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm11,%ymm12,%ymm4
+
+ vpxor %ymm1,%ymm10,%ymm10
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm11,%ymm1
+
+ vpslld $19,%ymm11,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm10,%ymm7
+
+ vpsrld $22,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm11,%ymm2
+ vpxor %ymm3,%ymm12,%ymm10
+ vpaddd %ymm5,%ymm14,%ymm14
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm10,%ymm10
+ vpaddd %ymm7,%ymm10,%ymm10
+ vmovd 56(%r12),%xmm5
+ vmovd 56(%r8),%xmm0
+ vmovd 56(%r13),%xmm1
+ vmovd 56(%r9),%xmm2
+ vpinsrd $1,56(%r14),%xmm5,%xmm5
+ vpinsrd $1,56(%r10),%xmm0,%xmm0
+ vpinsrd $1,56(%r15),%xmm1,%xmm1
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,56(%r11),%xmm2,%xmm2
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm14,%ymm7
+ vpslld $26,%ymm14,%ymm2
+ vmovdqu %ymm5,448-256-128(%rbx)
+ vpaddd %ymm9,%ymm5,%ymm5
+
+ vpsrld $11,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm14,%ymm2
+ vpaddd 64(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm14,%ymm2
+ vpandn %ymm8,%ymm14,%ymm0
+ vpand %ymm15,%ymm14,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm10,%ymm9
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm10,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm10,%ymm11,%ymm3
+
+ vpxor %ymm1,%ymm9,%ymm9
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm10,%ymm1
+
+ vpslld $19,%ymm10,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm9,%ymm7
+
+ vpsrld $22,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm10,%ymm2
+ vpxor %ymm4,%ymm11,%ymm9
+ vpaddd %ymm5,%ymm13,%ymm13
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm9,%ymm9
+ vpaddd %ymm7,%ymm9,%ymm9
+ vmovd 60(%r12),%xmm5
+ leaq 64(%r12),%r12
+ vmovd 60(%r8),%xmm0
+ leaq 64(%r8),%r8
+ vmovd 60(%r13),%xmm1
+ leaq 64(%r13),%r13
+ vmovd 60(%r9),%xmm2
+ leaq 64(%r9),%r9
+ vpinsrd $1,60(%r14),%xmm5,%xmm5
+ leaq 64(%r14),%r14
+ vpinsrd $1,60(%r10),%xmm0,%xmm0
+ leaq 64(%r10),%r10
+ vpinsrd $1,60(%r15),%xmm1,%xmm1
+ leaq 64(%r15),%r15
+ vpunpckldq %ymm1,%ymm5,%ymm5
+ vpinsrd $1,60(%r11),%xmm2,%xmm2
+ leaq 64(%r11),%r11
+ vpunpckldq %ymm2,%ymm0,%ymm0
+ vinserti128 $1,%xmm0,%ymm5,%ymm5
+ vpshufb %ymm6,%ymm5,%ymm5
+ vpsrld $6,%ymm13,%ymm7
+ vpslld $26,%ymm13,%ymm2
+ vmovdqu %ymm5,480-256-128(%rbx)
+ vpaddd %ymm8,%ymm5,%ymm5
+
+ vpsrld $11,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm13,%ymm2
+ vpaddd 96(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ prefetcht0 63(%r12)
+ vpslld $7,%ymm13,%ymm2
+ vpandn %ymm15,%ymm13,%ymm0
+ vpand %ymm14,%ymm13,%ymm4
+ prefetcht0 63(%r13)
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm9,%ymm8
+ vpxor %ymm2,%ymm7,%ymm7
+ prefetcht0 63(%r14)
+ vpslld $30,%ymm9,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm9,%ymm10,%ymm4
+ prefetcht0 63(%r15)
+ vpxor %ymm1,%ymm8,%ymm8
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm9,%ymm1
+ prefetcht0 63(%r8)
+ vpslld $19,%ymm9,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm4,%ymm3,%ymm3
+ prefetcht0 63(%r9)
+ vpxor %ymm1,%ymm8,%ymm7
+
+ vpsrld $22,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ prefetcht0 63(%r10)
+ vpslld $10,%ymm9,%ymm2
+ vpxor %ymm3,%ymm10,%ymm8
+ vpaddd %ymm5,%ymm12,%ymm12
+ prefetcht0 63(%r11)
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm8,%ymm8
+ vpaddd %ymm7,%ymm8,%ymm8
+ addq $256,%rbp
+ vmovdqu 0-128(%rax),%ymm5
+ movl $3,%ecx
+ jmp .Loop_16_xx_avx2
+.align 32
+.Loop_16_xx_avx2:
+ vmovdqu 32-128(%rax),%ymm6
+ vpaddd 288-256-128(%rbx),%ymm5,%ymm5
+
+ vpsrld $3,%ymm6,%ymm7
+ vpsrld $7,%ymm6,%ymm1
+ vpslld $25,%ymm6,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm6,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm6,%ymm2
+ vmovdqu 448-256-128(%rbx),%ymm0
+ vpsrld $10,%ymm0,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpxor %ymm1,%ymm3,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpsrld $6,%ymm12,%ymm7
+ vpslld $26,%ymm12,%ymm2
+ vmovdqu %ymm5,0-128(%rax)
+ vpaddd %ymm15,%ymm5,%ymm5
+
+ vpsrld $11,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm12,%ymm2
+ vpaddd -128(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm12,%ymm2
+ vpandn %ymm14,%ymm12,%ymm0
+ vpand %ymm13,%ymm12,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm8,%ymm15
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm8,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm8,%ymm9,%ymm3
+
+ vpxor %ymm1,%ymm15,%ymm15
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm8,%ymm1
+
+ vpslld $19,%ymm8,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm15,%ymm7
+
+ vpsrld $22,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm8,%ymm2
+ vpxor %ymm4,%ymm9,%ymm15
+ vpaddd %ymm5,%ymm11,%ymm11
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm15,%ymm15
+ vpaddd %ymm7,%ymm15,%ymm15
+ vmovdqu 64-128(%rax),%ymm5
+ vpaddd 320-256-128(%rbx),%ymm6,%ymm6
+
+ vpsrld $3,%ymm5,%ymm7
+ vpsrld $7,%ymm5,%ymm1
+ vpslld $25,%ymm5,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm5,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm5,%ymm2
+ vmovdqu 480-256-128(%rbx),%ymm0
+ vpsrld $10,%ymm0,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm1,%ymm4,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpsrld $6,%ymm11,%ymm7
+ vpslld $26,%ymm11,%ymm2
+ vmovdqu %ymm6,32-128(%rax)
+ vpaddd %ymm14,%ymm6,%ymm6
+
+ vpsrld $11,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm11,%ymm2
+ vpaddd -96(%rbp),%ymm6,%ymm6
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm11,%ymm2
+ vpandn %ymm13,%ymm11,%ymm0
+ vpand %ymm12,%ymm11,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm15,%ymm14
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm15,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm15,%ymm8,%ymm4
+
+ vpxor %ymm1,%ymm14,%ymm14
+ vpaddd %ymm7,%ymm6,%ymm6
+
+ vpsrld $13,%ymm15,%ymm1
+
+ vpslld $19,%ymm15,%ymm2
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm14,%ymm7
+
+ vpsrld $22,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm15,%ymm2
+ vpxor %ymm3,%ymm8,%ymm14
+ vpaddd %ymm6,%ymm10,%ymm10
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm6,%ymm14,%ymm14
+ vpaddd %ymm7,%ymm14,%ymm14
+ vmovdqu 96-128(%rax),%ymm6
+ vpaddd 352-256-128(%rbx),%ymm5,%ymm5
+
+ vpsrld $3,%ymm6,%ymm7
+ vpsrld $7,%ymm6,%ymm1
+ vpslld $25,%ymm6,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm6,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm6,%ymm2
+ vmovdqu 0-128(%rax),%ymm0
+ vpsrld $10,%ymm0,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpxor %ymm1,%ymm3,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpsrld $6,%ymm10,%ymm7
+ vpslld $26,%ymm10,%ymm2
+ vmovdqu %ymm5,64-128(%rax)
+ vpaddd %ymm13,%ymm5,%ymm5
+
+ vpsrld $11,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm10,%ymm2
+ vpaddd -64(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm10,%ymm2
+ vpandn %ymm12,%ymm10,%ymm0
+ vpand %ymm11,%ymm10,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm14,%ymm13
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm14,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm14,%ymm15,%ymm3
+
+ vpxor %ymm1,%ymm13,%ymm13
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm14,%ymm1
+
+ vpslld $19,%ymm14,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm13,%ymm7
+
+ vpsrld $22,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm14,%ymm2
+ vpxor %ymm4,%ymm15,%ymm13
+ vpaddd %ymm5,%ymm9,%ymm9
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpaddd %ymm7,%ymm13,%ymm13
+ vmovdqu 128-128(%rax),%ymm5
+ vpaddd 384-256-128(%rbx),%ymm6,%ymm6
+
+ vpsrld $3,%ymm5,%ymm7
+ vpsrld $7,%ymm5,%ymm1
+ vpslld $25,%ymm5,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm5,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm5,%ymm2
+ vmovdqu 32-128(%rax),%ymm0
+ vpsrld $10,%ymm0,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm1,%ymm4,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpsrld $6,%ymm9,%ymm7
+ vpslld $26,%ymm9,%ymm2
+ vmovdqu %ymm6,96-128(%rax)
+ vpaddd %ymm12,%ymm6,%ymm6
+
+ vpsrld $11,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm9,%ymm2
+ vpaddd -32(%rbp),%ymm6,%ymm6
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm9,%ymm2
+ vpandn %ymm11,%ymm9,%ymm0
+ vpand %ymm10,%ymm9,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm13,%ymm12
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm13,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm13,%ymm14,%ymm4
+
+ vpxor %ymm1,%ymm12,%ymm12
+ vpaddd %ymm7,%ymm6,%ymm6
+
+ vpsrld $13,%ymm13,%ymm1
+
+ vpslld $19,%ymm13,%ymm2
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm12,%ymm7
+
+ vpsrld $22,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm13,%ymm2
+ vpxor %ymm3,%ymm14,%ymm12
+ vpaddd %ymm6,%ymm8,%ymm8
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpaddd %ymm7,%ymm12,%ymm12
+ vmovdqu 160-128(%rax),%ymm6
+ vpaddd 416-256-128(%rbx),%ymm5,%ymm5
+
+ vpsrld $3,%ymm6,%ymm7
+ vpsrld $7,%ymm6,%ymm1
+ vpslld $25,%ymm6,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm6,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm6,%ymm2
+ vmovdqu 64-128(%rax),%ymm0
+ vpsrld $10,%ymm0,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpxor %ymm1,%ymm3,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpsrld $6,%ymm8,%ymm7
+ vpslld $26,%ymm8,%ymm2
+ vmovdqu %ymm5,128-128(%rax)
+ vpaddd %ymm11,%ymm5,%ymm5
+
+ vpsrld $11,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm8,%ymm2
+ vpaddd 0(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm8,%ymm2
+ vpandn %ymm10,%ymm8,%ymm0
+ vpand %ymm9,%ymm8,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm12,%ymm11
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm12,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm12,%ymm13,%ymm3
+
+ vpxor %ymm1,%ymm11,%ymm11
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm12,%ymm1
+
+ vpslld $19,%ymm12,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm11,%ymm7
+
+ vpsrld $22,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm12,%ymm2
+ vpxor %ymm4,%ymm13,%ymm11
+ vpaddd %ymm5,%ymm15,%ymm15
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm11,%ymm11
+ vpaddd %ymm7,%ymm11,%ymm11
+ vmovdqu 192-128(%rax),%ymm5
+ vpaddd 448-256-128(%rbx),%ymm6,%ymm6
+
+ vpsrld $3,%ymm5,%ymm7
+ vpsrld $7,%ymm5,%ymm1
+ vpslld $25,%ymm5,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm5,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm5,%ymm2
+ vmovdqu 96-128(%rax),%ymm0
+ vpsrld $10,%ymm0,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm1,%ymm4,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpsrld $6,%ymm15,%ymm7
+ vpslld $26,%ymm15,%ymm2
+ vmovdqu %ymm6,160-128(%rax)
+ vpaddd %ymm10,%ymm6,%ymm6
+
+ vpsrld $11,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm15,%ymm2
+ vpaddd 32(%rbp),%ymm6,%ymm6
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm15,%ymm2
+ vpandn %ymm9,%ymm15,%ymm0
+ vpand %ymm8,%ymm15,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm11,%ymm10
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm11,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm11,%ymm12,%ymm4
+
+ vpxor %ymm1,%ymm10,%ymm10
+ vpaddd %ymm7,%ymm6,%ymm6
+
+ vpsrld $13,%ymm11,%ymm1
+
+ vpslld $19,%ymm11,%ymm2
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm10,%ymm7
+
+ vpsrld $22,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm11,%ymm2
+ vpxor %ymm3,%ymm12,%ymm10
+ vpaddd %ymm6,%ymm14,%ymm14
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm6,%ymm10,%ymm10
+ vpaddd %ymm7,%ymm10,%ymm10
+ vmovdqu 224-128(%rax),%ymm6
+ vpaddd 480-256-128(%rbx),%ymm5,%ymm5
+
+ vpsrld $3,%ymm6,%ymm7
+ vpsrld $7,%ymm6,%ymm1
+ vpslld $25,%ymm6,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm6,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm6,%ymm2
+ vmovdqu 128-128(%rax),%ymm0
+ vpsrld $10,%ymm0,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpxor %ymm1,%ymm3,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpsrld $6,%ymm14,%ymm7
+ vpslld $26,%ymm14,%ymm2
+ vmovdqu %ymm5,192-128(%rax)
+ vpaddd %ymm9,%ymm5,%ymm5
+
+ vpsrld $11,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm14,%ymm2
+ vpaddd 64(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm14,%ymm2
+ vpandn %ymm8,%ymm14,%ymm0
+ vpand %ymm15,%ymm14,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm10,%ymm9
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm10,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm10,%ymm11,%ymm3
+
+ vpxor %ymm1,%ymm9,%ymm9
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm10,%ymm1
+
+ vpslld $19,%ymm10,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm9,%ymm7
+
+ vpsrld $22,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm10,%ymm2
+ vpxor %ymm4,%ymm11,%ymm9
+ vpaddd %ymm5,%ymm13,%ymm13
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm9,%ymm9
+ vpaddd %ymm7,%ymm9,%ymm9
+ vmovdqu 256-256-128(%rbx),%ymm5
+ vpaddd 0-128(%rax),%ymm6,%ymm6
+
+ vpsrld $3,%ymm5,%ymm7
+ vpsrld $7,%ymm5,%ymm1
+ vpslld $25,%ymm5,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm5,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm5,%ymm2
+ vmovdqu 160-128(%rax),%ymm0
+ vpsrld $10,%ymm0,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm1,%ymm4,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpsrld $6,%ymm13,%ymm7
+ vpslld $26,%ymm13,%ymm2
+ vmovdqu %ymm6,224-128(%rax)
+ vpaddd %ymm8,%ymm6,%ymm6
+
+ vpsrld $11,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm13,%ymm2
+ vpaddd 96(%rbp),%ymm6,%ymm6
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm13,%ymm2
+ vpandn %ymm15,%ymm13,%ymm0
+ vpand %ymm14,%ymm13,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm9,%ymm8
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm9,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm9,%ymm10,%ymm4
+
+ vpxor %ymm1,%ymm8,%ymm8
+ vpaddd %ymm7,%ymm6,%ymm6
+
+ vpsrld $13,%ymm9,%ymm1
+
+ vpslld $19,%ymm9,%ymm2
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm8,%ymm7
+
+ vpsrld $22,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm9,%ymm2
+ vpxor %ymm3,%ymm10,%ymm8
+ vpaddd %ymm6,%ymm12,%ymm12
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm6,%ymm8,%ymm8
+ vpaddd %ymm7,%ymm8,%ymm8
+ addq $256,%rbp
+ vmovdqu 288-256-128(%rbx),%ymm6
+ vpaddd 32-128(%rax),%ymm5,%ymm5
+
+ vpsrld $3,%ymm6,%ymm7
+ vpsrld $7,%ymm6,%ymm1
+ vpslld $25,%ymm6,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm6,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm6,%ymm2
+ vmovdqu 192-128(%rax),%ymm0
+ vpsrld $10,%ymm0,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpxor %ymm1,%ymm3,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpsrld $6,%ymm12,%ymm7
+ vpslld $26,%ymm12,%ymm2
+ vmovdqu %ymm5,256-256-128(%rbx)
+ vpaddd %ymm15,%ymm5,%ymm5
+
+ vpsrld $11,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm12,%ymm2
+ vpaddd -128(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm12,%ymm2
+ vpandn %ymm14,%ymm12,%ymm0
+ vpand %ymm13,%ymm12,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm8,%ymm15
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm8,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm8,%ymm9,%ymm3
+
+ vpxor %ymm1,%ymm15,%ymm15
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm8,%ymm1
+
+ vpslld $19,%ymm8,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm15,%ymm7
+
+ vpsrld $22,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm8,%ymm2
+ vpxor %ymm4,%ymm9,%ymm15
+ vpaddd %ymm5,%ymm11,%ymm11
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm15,%ymm15
+ vpaddd %ymm7,%ymm15,%ymm15
+ vmovdqu 320-256-128(%rbx),%ymm5
+ vpaddd 64-128(%rax),%ymm6,%ymm6
+
+ vpsrld $3,%ymm5,%ymm7
+ vpsrld $7,%ymm5,%ymm1
+ vpslld $25,%ymm5,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm5,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm5,%ymm2
+ vmovdqu 224-128(%rax),%ymm0
+ vpsrld $10,%ymm0,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm1,%ymm4,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpsrld $6,%ymm11,%ymm7
+ vpslld $26,%ymm11,%ymm2
+ vmovdqu %ymm6,288-256-128(%rbx)
+ vpaddd %ymm14,%ymm6,%ymm6
+
+ vpsrld $11,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm11,%ymm2
+ vpaddd -96(%rbp),%ymm6,%ymm6
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm11,%ymm2
+ vpandn %ymm13,%ymm11,%ymm0
+ vpand %ymm12,%ymm11,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm15,%ymm14
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm15,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm15,%ymm8,%ymm4
+
+ vpxor %ymm1,%ymm14,%ymm14
+ vpaddd %ymm7,%ymm6,%ymm6
+
+ vpsrld $13,%ymm15,%ymm1
+
+ vpslld $19,%ymm15,%ymm2
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm14,%ymm7
+
+ vpsrld $22,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm15,%ymm2
+ vpxor %ymm3,%ymm8,%ymm14
+ vpaddd %ymm6,%ymm10,%ymm10
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm6,%ymm14,%ymm14
+ vpaddd %ymm7,%ymm14,%ymm14
+ vmovdqu 352-256-128(%rbx),%ymm6
+ vpaddd 96-128(%rax),%ymm5,%ymm5
+
+ vpsrld $3,%ymm6,%ymm7
+ vpsrld $7,%ymm6,%ymm1
+ vpslld $25,%ymm6,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm6,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm6,%ymm2
+ vmovdqu 256-256-128(%rbx),%ymm0
+ vpsrld $10,%ymm0,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpxor %ymm1,%ymm3,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpsrld $6,%ymm10,%ymm7
+ vpslld $26,%ymm10,%ymm2
+ vmovdqu %ymm5,320-256-128(%rbx)
+ vpaddd %ymm13,%ymm5,%ymm5
+
+ vpsrld $11,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm10,%ymm2
+ vpaddd -64(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm10,%ymm2
+ vpandn %ymm12,%ymm10,%ymm0
+ vpand %ymm11,%ymm10,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm14,%ymm13
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm14,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm14,%ymm15,%ymm3
+
+ vpxor %ymm1,%ymm13,%ymm13
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm14,%ymm1
+
+ vpslld $19,%ymm14,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm13,%ymm7
+
+ vpsrld $22,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm14,%ymm2
+ vpxor %ymm4,%ymm15,%ymm13
+ vpaddd %ymm5,%ymm9,%ymm9
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpaddd %ymm7,%ymm13,%ymm13
+ vmovdqu 384-256-128(%rbx),%ymm5
+ vpaddd 128-128(%rax),%ymm6,%ymm6
+
+ vpsrld $3,%ymm5,%ymm7
+ vpsrld $7,%ymm5,%ymm1
+ vpslld $25,%ymm5,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm5,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm5,%ymm2
+ vmovdqu 288-256-128(%rbx),%ymm0
+ vpsrld $10,%ymm0,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm1,%ymm4,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpsrld $6,%ymm9,%ymm7
+ vpslld $26,%ymm9,%ymm2
+ vmovdqu %ymm6,352-256-128(%rbx)
+ vpaddd %ymm12,%ymm6,%ymm6
+
+ vpsrld $11,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm9,%ymm2
+ vpaddd -32(%rbp),%ymm6,%ymm6
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm9,%ymm2
+ vpandn %ymm11,%ymm9,%ymm0
+ vpand %ymm10,%ymm9,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm13,%ymm12
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm13,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm13,%ymm14,%ymm4
+
+ vpxor %ymm1,%ymm12,%ymm12
+ vpaddd %ymm7,%ymm6,%ymm6
+
+ vpsrld $13,%ymm13,%ymm1
+
+ vpslld $19,%ymm13,%ymm2
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm12,%ymm7
+
+ vpsrld $22,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm13,%ymm2
+ vpxor %ymm3,%ymm14,%ymm12
+ vpaddd %ymm6,%ymm8,%ymm8
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpaddd %ymm7,%ymm12,%ymm12
+ vmovdqu 416-256-128(%rbx),%ymm6
+ vpaddd 160-128(%rax),%ymm5,%ymm5
+
+ vpsrld $3,%ymm6,%ymm7
+ vpsrld $7,%ymm6,%ymm1
+ vpslld $25,%ymm6,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm6,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm6,%ymm2
+ vmovdqu 320-256-128(%rbx),%ymm0
+ vpsrld $10,%ymm0,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpxor %ymm1,%ymm3,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpsrld $6,%ymm8,%ymm7
+ vpslld $26,%ymm8,%ymm2
+ vmovdqu %ymm5,384-256-128(%rbx)
+ vpaddd %ymm11,%ymm5,%ymm5
+
+ vpsrld $11,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm8,%ymm2
+ vpaddd 0(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm8,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm8,%ymm2
+ vpandn %ymm10,%ymm8,%ymm0
+ vpand %ymm9,%ymm8,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm12,%ymm11
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm12,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm12,%ymm13,%ymm3
+
+ vpxor %ymm1,%ymm11,%ymm11
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm12,%ymm1
+
+ vpslld $19,%ymm12,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm11,%ymm7
+
+ vpsrld $22,%ymm12,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm12,%ymm2
+ vpxor %ymm4,%ymm13,%ymm11
+ vpaddd %ymm5,%ymm15,%ymm15
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm11,%ymm11
+ vpaddd %ymm7,%ymm11,%ymm11
+ vmovdqu 448-256-128(%rbx),%ymm5
+ vpaddd 192-128(%rax),%ymm6,%ymm6
+
+ vpsrld $3,%ymm5,%ymm7
+ vpsrld $7,%ymm5,%ymm1
+ vpslld $25,%ymm5,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm5,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm5,%ymm2
+ vmovdqu 352-256-128(%rbx),%ymm0
+ vpsrld $10,%ymm0,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm1,%ymm4,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpsrld $6,%ymm15,%ymm7
+ vpslld $26,%ymm15,%ymm2
+ vmovdqu %ymm6,416-256-128(%rbx)
+ vpaddd %ymm10,%ymm6,%ymm6
+
+ vpsrld $11,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm15,%ymm2
+ vpaddd 32(%rbp),%ymm6,%ymm6
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm15,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm15,%ymm2
+ vpandn %ymm9,%ymm15,%ymm0
+ vpand %ymm8,%ymm15,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm11,%ymm10
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm11,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm11,%ymm12,%ymm4
+
+ vpxor %ymm1,%ymm10,%ymm10
+ vpaddd %ymm7,%ymm6,%ymm6
+
+ vpsrld $13,%ymm11,%ymm1
+
+ vpslld $19,%ymm11,%ymm2
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm10,%ymm7
+
+ vpsrld $22,%ymm11,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm11,%ymm2
+ vpxor %ymm3,%ymm12,%ymm10
+ vpaddd %ymm6,%ymm14,%ymm14
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm6,%ymm10,%ymm10
+ vpaddd %ymm7,%ymm10,%ymm10
+ vmovdqu 480-256-128(%rbx),%ymm6
+ vpaddd 224-128(%rax),%ymm5,%ymm5
+
+ vpsrld $3,%ymm6,%ymm7
+ vpsrld $7,%ymm6,%ymm1
+ vpslld $25,%ymm6,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm6,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm6,%ymm2
+ vmovdqu 384-256-128(%rbx),%ymm0
+ vpsrld $10,%ymm0,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpxor %ymm1,%ymm3,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpsrld $6,%ymm14,%ymm7
+ vpslld $26,%ymm14,%ymm2
+ vmovdqu %ymm5,448-256-128(%rbx)
+ vpaddd %ymm9,%ymm5,%ymm5
+
+ vpsrld $11,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm14,%ymm2
+ vpaddd 64(%rbp),%ymm5,%ymm5
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm14,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm14,%ymm2
+ vpandn %ymm8,%ymm14,%ymm0
+ vpand %ymm15,%ymm14,%ymm3
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm10,%ymm9
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm10,%ymm1
+ vpxor %ymm3,%ymm0,%ymm0
+ vpxor %ymm10,%ymm11,%ymm3
+
+ vpxor %ymm1,%ymm9,%ymm9
+ vpaddd %ymm7,%ymm5,%ymm5
+
+ vpsrld $13,%ymm10,%ymm1
+
+ vpslld $19,%ymm10,%ymm2
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpand %ymm3,%ymm4,%ymm4
+
+ vpxor %ymm1,%ymm9,%ymm7
+
+ vpsrld $22,%ymm10,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm10,%ymm2
+ vpxor %ymm4,%ymm11,%ymm9
+ vpaddd %ymm5,%ymm13,%ymm13
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm5,%ymm9,%ymm9
+ vpaddd %ymm7,%ymm9,%ymm9
+ vmovdqu 0-128(%rax),%ymm5
+ vpaddd 256-256-128(%rbx),%ymm6,%ymm6
+
+ vpsrld $3,%ymm5,%ymm7
+ vpsrld $7,%ymm5,%ymm1
+ vpslld $25,%ymm5,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $18,%ymm5,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $14,%ymm5,%ymm2
+ vmovdqu 416-256-128(%rbx),%ymm0
+ vpsrld $10,%ymm0,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpsrld $17,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $15,%ymm0,%ymm2
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm1,%ymm4,%ymm7
+ vpsrld $19,%ymm0,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $13,%ymm0,%ymm2
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpsrld $6,%ymm13,%ymm7
+ vpslld $26,%ymm13,%ymm2
+ vmovdqu %ymm6,480-256-128(%rbx)
+ vpaddd %ymm8,%ymm6,%ymm6
+
+ vpsrld $11,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+ vpslld $21,%ymm13,%ymm2
+ vpaddd 96(%rbp),%ymm6,%ymm6
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $25,%ymm13,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $7,%ymm13,%ymm2
+ vpandn %ymm15,%ymm13,%ymm0
+ vpand %ymm14,%ymm13,%ymm4
+
+ vpxor %ymm1,%ymm7,%ymm7
+
+ vpsrld $2,%ymm9,%ymm8
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $30,%ymm9,%ymm1
+ vpxor %ymm4,%ymm0,%ymm0
+ vpxor %ymm9,%ymm10,%ymm4
+
+ vpxor %ymm1,%ymm8,%ymm8
+ vpaddd %ymm7,%ymm6,%ymm6
+
+ vpsrld $13,%ymm9,%ymm1
+
+ vpslld $19,%ymm9,%ymm2
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpand %ymm4,%ymm3,%ymm3
+
+ vpxor %ymm1,%ymm8,%ymm7
+
+ vpsrld $22,%ymm9,%ymm1
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpslld $10,%ymm9,%ymm2
+ vpxor %ymm3,%ymm10,%ymm8
+ vpaddd %ymm6,%ymm12,%ymm12
+
+ vpxor %ymm1,%ymm7,%ymm7
+ vpxor %ymm2,%ymm7,%ymm7
+
+ vpaddd %ymm6,%ymm8,%ymm8
+ vpaddd %ymm7,%ymm8,%ymm8
+ addq $256,%rbp
+ decl %ecx
+ jnz .Loop_16_xx_avx2
+
+ movl $1,%ecx
+ leaq 512(%rsp),%rbx
+ leaq K256+128(%rip),%rbp
+ cmpl 0(%rbx),%ecx
+ cmovgeq %rbp,%r12
+ cmpl 4(%rbx),%ecx
+ cmovgeq %rbp,%r13
+ cmpl 8(%rbx),%ecx
+ cmovgeq %rbp,%r14
+ cmpl 12(%rbx),%ecx
+ cmovgeq %rbp,%r15
+ cmpl 16(%rbx),%ecx
+ cmovgeq %rbp,%r8
+ cmpl 20(%rbx),%ecx
+ cmovgeq %rbp,%r9
+ cmpl 24(%rbx),%ecx
+ cmovgeq %rbp,%r10
+ cmpl 28(%rbx),%ecx
+ cmovgeq %rbp,%r11
+ vmovdqa (%rbx),%ymm7
+ vpxor %ymm0,%ymm0,%ymm0
+ vmovdqa %ymm7,%ymm6
+ vpcmpgtd %ymm0,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm7,%ymm7
+
+ vmovdqu 0-128(%rdi),%ymm0
+ vpand %ymm6,%ymm8,%ymm8
+ vmovdqu 32-128(%rdi),%ymm1
+ vpand %ymm6,%ymm9,%ymm9
+ vmovdqu 64-128(%rdi),%ymm2
+ vpand %ymm6,%ymm10,%ymm10
+ vmovdqu 96-128(%rdi),%ymm5
+ vpand %ymm6,%ymm11,%ymm11
+ vpaddd %ymm0,%ymm8,%ymm8
+ vmovdqu 128-128(%rdi),%ymm0
+ vpand %ymm6,%ymm12,%ymm12
+ vpaddd %ymm1,%ymm9,%ymm9
+ vmovdqu 160-128(%rdi),%ymm1
+ vpand %ymm6,%ymm13,%ymm13
+ vpaddd %ymm2,%ymm10,%ymm10
+ vmovdqu 192-128(%rdi),%ymm2
+ vpand %ymm6,%ymm14,%ymm14
+ vpaddd %ymm5,%ymm11,%ymm11
+ vmovdqu 224-128(%rdi),%ymm5
+ vpand %ymm6,%ymm15,%ymm15
+ vpaddd %ymm0,%ymm12,%ymm12
+ vpaddd %ymm1,%ymm13,%ymm13
+ vmovdqu %ymm8,0-128(%rdi)
+ vpaddd %ymm2,%ymm14,%ymm14
+ vmovdqu %ymm9,32-128(%rdi)
+ vpaddd %ymm5,%ymm15,%ymm15
+ vmovdqu %ymm10,64-128(%rdi)
+ vmovdqu %ymm11,96-128(%rdi)
+ vmovdqu %ymm12,128-128(%rdi)
+ vmovdqu %ymm13,160-128(%rdi)
+ vmovdqu %ymm14,192-128(%rdi)
+ vmovdqu %ymm15,224-128(%rdi)
+
+ vmovdqu %ymm7,(%rbx)
+ leaq 256+128(%rsp),%rbx
+ vmovdqu .Lpbswap(%rip),%ymm6
+ decl %edx
+ jnz .Loop_avx2
+
+
+
+
+
+
+
+.Ldone_avx2:
+ movq 544(%rsp),%rax
+.cfi_def_cfa %rax,8
+ vzeroupper
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha256_multi_block_avx2,.-sha256_multi_block_avx2
.align 256
K256:
.long 1116352408,1116352408,1116352408,1116352408
diff --git a/secure/lib/libcrypto/amd64/sha256-x86_64.S b/secure/lib/libcrypto/amd64/sha256-x86_64.S
index 91b3ead8976a..13d497c4f8e0 100644
--- a/secure/lib/libcrypto/amd64/sha256-x86_64.S
+++ b/secure/lib/libcrypto/amd64/sha256-x86_64.S
@@ -14,6 +14,14 @@ sha256_block_data_order:
movl 8(%r11),%r11d
testl $536870912,%r11d
jnz _shaext_shortcut
+ andl $296,%r11d
+ cmpl $296,%r11d
+ je .Lavx2_shortcut
+ andl $1073741824,%r9d
+ andl $268435968,%r10d
+ orl %r9d,%r10d
+ cmpl $1342177792,%r10d
+ je .Lavx_shortcut
testl $512,%r10d
jnz .Lssse3_shortcut
movq %rsp,%rax
@@ -3087,3 +3095,2364 @@ sha256_block_data_order_ssse3:
.byte 0xf3,0xc3
.cfi_endproc
.size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3
+.type sha256_block_data_order_avx,@function
+.align 64
+sha256_block_data_order_avx:  # AVX path; in: %rdi = eight 32-bit state words, %rsi = input, %rdx = number of 64-byte blocks
+.cfi_startproc
+.Lavx_shortcut:  # entered via je from the dispatch code in sha256_block_data_order
+ movq %rsp,%rax  # keep the caller stack pointer; it is stored at 88(%rsp) below
+.cfi_def_cfa_register %rax
+ pushq %rbx  # save the six callee-saved registers that are used as scratch
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx  # %rdx = blocks * 16
+ subq $96,%rsp  # reserve the local frame
+ leaq (%rsi,%rdx,4),%rdx  # %rdx = %rsi + blocks * 64 = end-of-input pointer
+ andq $-64,%rsp  # align the frame to 64 bytes
+ movq %rdi,64+0(%rsp)  # spill the state pointer
+ movq %rsi,64+8(%rsp)  # spill the input pointer
+ movq %rdx,64+16(%rsp)  # spill the end-of-input pointer
+ movq %rax,88(%rsp)  # spill the caller %rsp for the epilogue and the unwinder
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08  # DW_CFA_def_cfa_expression: CFA = *(%rsp + 88) + 8
+.Lprologue_avx:
+
+ vzeroupper
+ movl 0(%rdi),%eax  # load the eight state words into %eax,%ebx,%ecx,%edx,%r8d-%r11d
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+ vmovdqa K256+512+32(%rip),%xmm8  # vpshufb mask used by the schedule code below
+ vmovdqa K256+512+64(%rip),%xmm9  # second vpshufb mask used by the schedule code below
+ jmp .Lloop_avx
+.align 16
+.Lloop_avx:  # outer loop: one iteration per 64-byte input block
+ vmovdqa K256+512(%rip),%xmm7  # byte-shuffle mask for the input (presumably a big-endian byte swap - confirm against the table)
+ vmovdqu 0(%rsi),%xmm0  # load 64 input bytes (unaligned) into %xmm0-%xmm3
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm7,%xmm0,%xmm0
+ leaq K256(%rip),%rbp  # %rbp = base of the K256 constant table
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd 0(%rbp),%xmm0,%xmm4  # add table constants to the first 16 words; rows are read at a 32-byte stride
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 32(%rbp),%xmm1,%xmm5
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ vpaddd 96(%rbp),%xmm3,%xmm7
+ vmovdqa %xmm4,0(%rsp)  # stash the word+constant sums at 0..63(%rsp) for the scalar rounds
+ movl %eax,%r14d
+ vmovdqa %xmm5,16(%rsp)
+ movl %ebx,%edi
+ vmovdqa %xmm6,32(%rsp)
+ xorl %ecx,%edi  # %edi = %ebx ^ %ecx
+ vmovdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp .Lavx_00_47
+
+.align 16
+.Lavx_00_47:  # 16 scalar rounds per pass, interleaved with vector code producing the next 16 word+constant sums
+ subq $-128,%rbp  # %rbp += 128: step to the next four table rows
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ shrdl $14,%r13d,%r13d  # shrd with identical source and destination is a rotate right
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d  # consume a word+constant sum stored on the stack
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ vpshufd $250,%xmm3,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm0,%xmm0
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpaddd %xmm6,%xmm0,%xmm0
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ vpshufd $80,%xmm0,%xmm7
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ vpaddd %xmm6,%xmm0,%xmm0
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpaddd 0(%rbp),%xmm0,%xmm6  # new schedule words in %xmm0 plus their table constants
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,0(%rsp)  # overwrite the four sums at 0..15(%rsp) that were consumed above
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ vpshufd $250,%xmm0,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm1,%xmm1
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpaddd %xmm6,%xmm1,%xmm1
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ vpshufd $80,%xmm1,%xmm7
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ vpaddd %xmm6,%xmm1,%xmm1
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpaddd 32(%rbp),%xmm1,%xmm6
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,16(%rsp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ vpshufd $250,%xmm1,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm2,%xmm2
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpaddd %xmm6,%xmm2,%xmm2
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ vpshufd $80,%xmm2,%xmm7
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ vpaddd %xmm6,%xmm2,%xmm2
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,32(%rsp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ vpshufd $250,%xmm2,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm3,%xmm3
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpaddd %xmm6,%xmm3,%xmm3
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ vpshufd $80,%xmm3,%xmm7
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ vpaddd %xmm6,%xmm3,%xmm3
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpaddd 96(%rbp),%xmm3,%xmm6
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,48(%rsp)
+ cmpb $0,131(%rbp)  # a zero byte at 131(%rbp) ends the schedule loop (presumably a table sentinel - confirm)
+ jne .Lavx_00_47
+ shrdl $14,%r13d,%r13d  # final 16 rounds: scalar only, they consume the sums already stored at 0..63(%rsp)
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%rdi  # reload the state pointer (%edi served as scratch during the rounds)
+ movl %r14d,%eax
+
+ addl 0(%rdi),%eax  # add the stored state words into the working registers
+ leaq 64(%rsi),%rsi  # advance the input pointer by one 64-byte block
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi  # compare with the end pointer; the movl stores below leave the flags intact for jb
+
+ movl %eax,0(%rdi)  # write the updated state back
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb .Lloop_avx  # input pointer still below the end pointer (unsigned): process the next block
+
+ movq 88(%rsp),%rsi  # %rsi = caller stack pointer saved in the prologue
+.cfi_def_cfa %rsi,8
+ vzeroupper
+ movq -48(%rsi),%r15  # restore the callee-saved registers from the slots written by the pushes
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp  # restore the caller stack pointer
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3  # hand-encoded rep ret
+.cfi_endproc
+.size sha256_block_data_order_avx,.-sha256_block_data_order_avx
+.type sha256_block_data_order_avx2,@function
+.align 64
+sha256_block_data_order_avx2:
+.cfi_startproc
+.Lavx2_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ subq $544,%rsp
+ shlq $4,%rdx
+ andq $-1024,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ addq $448,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %rax,88(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue_avx2:
+
+ vzeroupper
+ subq $-64,%rsi
+ movl 0(%rdi),%eax
+ movq %rsi,%r12
+ movl 4(%rdi),%ebx
+ cmpq %rdx,%rsi
+ movl 8(%rdi),%ecx
+ cmoveq %rsp,%r12
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+ vmovdqa K256+512+32(%rip),%ymm8
+ vmovdqa K256+512+64(%rip),%ymm9
+ jmp .Loop_avx2
+.align 16
+.Loop_avx2:
+ vmovdqa K256+512(%rip),%ymm7
+ vmovdqu -64+0(%rsi),%xmm0
+ vmovdqu -64+16(%rsi),%xmm1
+ vmovdqu -64+32(%rsi),%xmm2
+ vmovdqu -64+48(%rsi),%xmm3
+
+ vinserti128 $1,(%r12),%ymm0,%ymm0
+ vinserti128 $1,16(%r12),%ymm1,%ymm1
+ vpshufb %ymm7,%ymm0,%ymm0
+ vinserti128 $1,32(%r12),%ymm2,%ymm2
+ vpshufb %ymm7,%ymm1,%ymm1
+ vinserti128 $1,48(%r12),%ymm3,%ymm3
+
+ leaq K256(%rip),%rbp
+ vpshufb %ymm7,%ymm2,%ymm2
+ vpaddd 0(%rbp),%ymm0,%ymm4
+ vpshufb %ymm7,%ymm3,%ymm3
+ vpaddd 32(%rbp),%ymm1,%ymm5
+ vpaddd 64(%rbp),%ymm2,%ymm6
+ vpaddd 96(%rbp),%ymm3,%ymm7
+ vmovdqa %ymm4,0(%rsp)
+ xorl %r14d,%r14d
+ vmovdqa %ymm5,32(%rsp)
+
+ movq 88(%rsp),%rdi
+.cfi_def_cfa %rdi,8
+ leaq -64(%rsp),%rsp
+
+
+
+ movq %rdi,-8(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ movl %ebx,%edi
+ vmovdqa %ymm6,0(%rsp)
+ xorl %ecx,%edi
+ vmovdqa %ymm7,32(%rsp)
+ movl %r9d,%r12d
+ subq $-32*4,%rbp
+ jmp .Lavx2_00_47
+
+.align 16
+.Lavx2_00_47:
+ leaq -64(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08
+
+ pushq 64-8(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08
+ leaq 8(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ vpalignr $4,%ymm0,%ymm1,%ymm4
+ addl 0+128(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ vpalignr $4,%ymm2,%ymm3,%ymm7
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ vpsrld $7,%ymm4,%ymm6
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ vpaddd %ymm7,%ymm0,%ymm0
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %ebx,%edi
+ vpshufd $250,%ymm3,%ymm7
+ xorl %r13d,%r14d
+ leal (%r11,%rdi,1),%r11d
+ movl %r8d,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 4+128(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%edx,%edi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ vpslld $11,%ymm5,%ymm5
+ andnl %r9d,%edx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%edx,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%edi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%edi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ vpsrlq $17,%ymm7,%ymm7
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ vpaddd %ymm4,%ymm0,%ymm0
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 8+128(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ vpshufb %ymm8,%ymm6,%ymm6
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ vpaddd %ymm6,%ymm0,%ymm0
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ vpshufd $80,%ymm0,%ymm7
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ vpsrld $10,%ymm7,%ymm6
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r11d,%edi
+ vpsrlq $17,%ymm7,%ymm7
+ xorl %r13d,%r14d
+ leal (%r9,%rdi,1),%r9d
+ movl %ecx,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 12+128(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%ebx,%edi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %edx,%ebx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%ebx,%r14d
+ vpshufb %ymm9,%ymm6,%ymm6
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%edi
+ vpaddd %ymm6,%ymm0,%ymm0
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%edi
+ vpaddd 0(%rbp),%ymm0,%ymm6
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ vmovdqa %ymm6,0(%rsp)
+ vpalignr $4,%ymm1,%ymm2,%ymm4
+ addl 32+128(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ vpalignr $4,%ymm3,%ymm0,%ymm7
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ vpsrld $7,%ymm4,%ymm6
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ vpaddd %ymm7,%ymm1,%ymm1
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r9d,%edi
+ vpshufd $250,%ymm0,%ymm7
+ xorl %r13d,%r14d
+ leal (%rdx,%rdi,1),%edx
+ movl %eax,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 36+128(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%r11d,%edi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ vpslld $11,%ymm5,%ymm5
+ andnl %ebx,%r11d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r11d,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%edi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%edi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ vpsrlq $17,%ymm7,%ymm7
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ vpaddd %ymm4,%ymm1,%ymm1
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 40+128(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ vpshufb %ymm8,%ymm6,%ymm6
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ vpaddd %ymm6,%ymm1,%ymm1
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ vpshufd $80,%ymm1,%ymm7
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ vpsrld $10,%ymm7,%ymm6
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %edx,%edi
+ vpsrlq $17,%ymm7,%ymm7
+ xorl %r13d,%r14d
+ leal (%rbx,%rdi,1),%ebx
+ movl %r10d,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 44+128(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%r9d,%edi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %r11d,%r9d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r9d,%r14d
+ vpshufb %ymm9,%ymm6,%ymm6
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%edi
+ vpaddd %ymm6,%ymm1,%ymm1
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%edi
+ vpaddd 32(%rbp),%ymm1,%ymm6
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ vmovdqa %ymm6,32(%rsp)
+ leaq -64(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08
+
+ pushq 64-8(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08
+ leaq 8(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ vpalignr $4,%ymm2,%ymm3,%ymm4
+ addl 0+128(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ vpalignr $4,%ymm0,%ymm1,%ymm7
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ vpsrld $7,%ymm4,%ymm6
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ vpaddd %ymm7,%ymm2,%ymm2
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %ebx,%edi
+ vpshufd $250,%ymm1,%ymm7
+ xorl %r13d,%r14d
+ leal (%r11,%rdi,1),%r11d
+ movl %r8d,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 4+128(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%edx,%edi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ vpslld $11,%ymm5,%ymm5
+ andnl %r9d,%edx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%edx,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%edi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%edi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ vpsrlq $17,%ymm7,%ymm7
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ vpaddd %ymm4,%ymm2,%ymm2
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 8+128(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ vpshufb %ymm8,%ymm6,%ymm6
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ vpaddd %ymm6,%ymm2,%ymm2
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ vpshufd $80,%ymm2,%ymm7
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ vpsrld $10,%ymm7,%ymm6
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r11d,%edi
+ vpsrlq $17,%ymm7,%ymm7
+ xorl %r13d,%r14d
+ leal (%r9,%rdi,1),%r9d
+ movl %ecx,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 12+128(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%ebx,%edi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %edx,%ebx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%ebx,%r14d
+ vpshufb %ymm9,%ymm6,%ymm6
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%edi
+ vpaddd %ymm6,%ymm2,%ymm2
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%edi
+ vpaddd 64(%rbp),%ymm2,%ymm6
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ vmovdqa %ymm6,0(%rsp)
+ vpalignr $4,%ymm3,%ymm0,%ymm4
+ addl 32+128(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ vpalignr $4,%ymm1,%ymm2,%ymm7
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ vpsrld $7,%ymm4,%ymm6
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ vpaddd %ymm7,%ymm3,%ymm3
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r9d,%edi
+ vpshufd $250,%ymm2,%ymm7
+ xorl %r13d,%r14d
+ leal (%rdx,%rdi,1),%edx
+ movl %eax,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 36+128(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%r11d,%edi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ vpslld $11,%ymm5,%ymm5
+ andnl %ebx,%r11d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r11d,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%edi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%edi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ vpsrlq $17,%ymm7,%ymm7
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ vpaddd %ymm4,%ymm3,%ymm3
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 40+128(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ vpshufb %ymm8,%ymm6,%ymm6
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ vpaddd %ymm6,%ymm3,%ymm3
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ vpshufd $80,%ymm3,%ymm7
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ vpsrld $10,%ymm7,%ymm6
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %edx,%edi
+ vpsrlq $17,%ymm7,%ymm7
+ xorl %r13d,%r14d
+ leal (%rbx,%rdi,1),%ebx
+ movl %r10d,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 44+128(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%r9d,%edi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %r11d,%r9d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r9d,%r14d
+ vpshufb %ymm9,%ymm6,%ymm6
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%edi
+ vpaddd %ymm6,%ymm3,%ymm3
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%edi
+ vpaddd 96(%rbp),%ymm3,%ymm6
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ vmovdqa %ymm6,32(%rsp)
+ leaq 128(%rbp),%rbp
+ cmpb $0,3(%rbp)
+ jne .Lavx2_00_47
+ addl 0+64(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %ebx,%edi
+ xorl %r13d,%r14d
+ leal (%r11,%rdi,1),%r11d
+ movl %r8d,%r12d
+ addl 4+64(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%edi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%edi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%edi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8+64(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r11d,%edi
+ xorl %r13d,%r14d
+ leal (%r9,%rdi,1),%r9d
+ movl %ecx,%r12d
+ addl 12+64(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%edi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%edi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%edi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32+64(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r9d,%edi
+ xorl %r13d,%r14d
+ leal (%rdx,%rdi,1),%edx
+ movl %eax,%r12d
+ addl 36+64(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%edi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%edi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%edi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40+64(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %edx,%edi
+ xorl %r13d,%r14d
+ leal (%rbx,%rdi,1),%ebx
+ movl %r10d,%r12d
+ addl 44+64(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%edi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%edi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%edi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ addl 0(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %ebx,%edi
+ xorl %r13d,%r14d
+ leal (%r11,%rdi,1),%r11d
+ movl %r8d,%r12d
+ addl 4(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%edi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%edi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%edi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r11d,%edi
+ xorl %r13d,%r14d
+ leal (%r9,%rdi,1),%r9d
+ movl %ecx,%r12d
+ addl 12(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%edi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%edi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%edi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r9d,%edi
+ xorl %r13d,%r14d
+ leal (%rdx,%rdi,1),%edx
+ movl %eax,%r12d
+ addl 36(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%edi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%edi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%edi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %edx,%edi
+ xorl %r13d,%r14d
+ leal (%rbx,%rdi,1),%ebx
+ movl %r10d,%r12d
+ addl 44(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%edi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%edi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%edi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ movq 512(%rsp),%rdi
+ addl %r14d,%eax
+
+ leaq 448(%rsp),%rbp
+
+ addl 0(%rdi),%eax
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+
+ cmpq 80(%rbp),%rsi
+ je .Ldone_avx2
+
+ xorl %r14d,%r14d
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ movl %r9d,%r12d
+ jmp .Lower_avx2
+.align 16
+.Lower_avx2:
+ addl 0+16(%rbp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %ebx,%edi
+ xorl %r13d,%r14d
+ leal (%r11,%rdi,1),%r11d
+ movl %r8d,%r12d
+ addl 4+16(%rbp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%edi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%edi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%edi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8+16(%rbp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r11d,%edi
+ xorl %r13d,%r14d
+ leal (%r9,%rdi,1),%r9d
+ movl %ecx,%r12d
+ addl 12+16(%rbp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%edi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%edi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%edi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32+16(%rbp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %r9d,%edi
+ xorl %r13d,%r14d
+ leal (%rdx,%rdi,1),%edx
+ movl %eax,%r12d
+ addl 36+16(%rbp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%edi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%edi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%edi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40+16(%rbp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%edi
+ xorl %r12d,%r14d
+ xorl %edx,%edi
+ xorl %r13d,%r14d
+ leal (%rbx,%rdi,1),%ebx
+ movl %r10d,%r12d
+ addl 44+16(%rbp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%edi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %edi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%edi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%edi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %edi,%r15d
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ leaq -64(%rbp),%rbp
+ cmpq %rsp,%rbp
+ jae .Lower_avx2
+
+ movq 512(%rsp),%rdi
+ addl %r14d,%eax
+
+ leaq 448(%rsp),%rsp
+
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+
+ addl 0(%rdi),%eax
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ leaq 128(%rsi),%rsi
+ addl 24(%rdi),%r10d
+ movq %rsi,%r12
+ addl 28(%rdi),%r11d
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ cmoveq %rsp,%r12
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+
+ jbe .Loop_avx2
+ leaq (%rsp),%rbp
+
+
+.cfi_escape 0x0f,0x06,0x76,0xd8,0x00,0x06,0x23,0x08
+
+.Ldone_avx2:
+ movq 88(%rbp),%rsi
+.cfi_def_cfa %rsi,8
+ vzeroupper
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha256_block_data_order_avx2,.-sha256_block_data_order_avx2
diff --git a/secure/lib/libcrypto/amd64/sha512-x86_64.S b/secure/lib/libcrypto/amd64/sha512-x86_64.S
index a9b971a1b7cd..ae11a36e4821 100644
--- a/secure/lib/libcrypto/amd64/sha512-x86_64.S
+++ b/secure/lib/libcrypto/amd64/sha512-x86_64.S
@@ -8,6 +8,20 @@
.align 16
sha512_block_data_order:
.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%r11
+ movl 0(%r11),%r9d
+ movl 4(%r11),%r10d
+ movl 8(%r11),%r11d
+ testl $2048,%r10d
+ jnz .Lxop_shortcut
+ andl $296,%r11d
+ cmpl $296,%r11d
+ je .Lavx2_shortcut
+ andl $1073741824,%r9d
+ andl $268435968,%r10d
+ orl %r9d,%r10d
+ cmpl $1342177792,%r10d
+ je .Lavx_shortcut
movq %rsp,%rax
.cfi_def_cfa_register %rax
pushq %rbx
@@ -1801,3 +1815,3649 @@ K512:
.quad 0x0001020304050607,0x08090a0b0c0d0e0f
.quad 0x0001020304050607,0x08090a0b0c0d0e0f
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.type sha512_block_data_order_xop,@function
+.align 64
+sha512_block_data_order_xop:
+.cfi_startproc
+.Lxop_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $160,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ andq $-64,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %rax,152(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue_xop:
+
+ vzeroupper
+ movq 0(%rdi),%rax
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%rcx
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp .Lloop_xop
+.align 16
+.Lloop_xop:
+ vmovdqa K512+1280(%rip),%xmm11
+ vmovdqu 0(%rsi),%xmm0
+ leaq K512+128(%rip),%rbp
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vpshufb %xmm11,%xmm0,%xmm0
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm11,%xmm1,%xmm1
+ vmovdqu 64(%rsi),%xmm4
+ vpshufb %xmm11,%xmm2,%xmm2
+ vmovdqu 80(%rsi),%xmm5
+ vpshufb %xmm11,%xmm3,%xmm3
+ vmovdqu 96(%rsi),%xmm6
+ vpshufb %xmm11,%xmm4,%xmm4
+ vmovdqu 112(%rsi),%xmm7
+ vpshufb %xmm11,%xmm5,%xmm5
+ vpaddq -128(%rbp),%xmm0,%xmm8
+ vpshufb %xmm11,%xmm6,%xmm6
+ vpaddq -96(%rbp),%xmm1,%xmm9
+ vpshufb %xmm11,%xmm7,%xmm7
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ vpaddq -32(%rbp),%xmm3,%xmm11
+ vmovdqa %xmm8,0(%rsp)
+ vpaddq 0(%rbp),%xmm4,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ vpaddq 32(%rbp),%xmm5,%xmm9
+ vmovdqa %xmm10,32(%rsp)
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ vmovdqa %xmm11,48(%rsp)
+ vpaddq 96(%rbp),%xmm7,%xmm11
+ vmovdqa %xmm8,64(%rsp)
+ movq %rax,%r14
+ vmovdqa %xmm9,80(%rsp)
+ movq %rbx,%rdi
+ vmovdqa %xmm10,96(%rsp)
+ xorq %rcx,%rdi
+ vmovdqa %xmm11,112(%rsp)
+ movq %r8,%r13
+ jmp .Lxop_00_47
+
+.align 16
+.Lxop_00_47:
+ addq $256,%rbp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ rorq $23,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm4,%xmm5,%xmm11
+ movq %r9,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %rax,%r14
+ vpaddq %xmm11,%xmm0,%xmm0
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+.byte 143,72,120,195,209,7
+ xorq %r10,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %rbx,%r15
+ addq %r12,%r11
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,223,3
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm7,%xmm10
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpaddq %xmm8,%xmm0,%xmm0
+ movq %rdx,%r13
+ addq %r11,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%r11
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %r8,%r12
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpaddq %xmm11,%xmm0,%xmm0
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ rorq $6,%r14
+ vpaddq -128(%rbp),%xmm0,%xmm10
+ xorq %rax,%rdi
+ addq %r12,%r10
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ rorq $28,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,0(%rsp)
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ rorq $23,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm5,%xmm6,%xmm11
+ movq %rdx,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %r10,%r14
+ vpaddq %xmm11,%xmm1,%xmm1
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+.byte 143,72,120,195,209,7
+ xorq %r8,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %r11,%r15
+ addq %r12,%r9
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,216,3
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm0,%xmm10
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpaddq %xmm8,%xmm1,%xmm1
+ movq %rbx,%r13
+ addq %r9,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%r9
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %rcx,%r12
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpaddq %xmm11,%xmm1,%xmm1
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ rorq $6,%r14
+ vpaddq -96(%rbp),%xmm1,%xmm10
+ xorq %r10,%rdi
+ addq %r12,%r8
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ rorq $28,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,16(%rsp)
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ rorq $23,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm6,%xmm7,%xmm11
+ movq %rbx,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %r8,%r14
+ vpaddq %xmm11,%xmm2,%xmm2
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+.byte 143,72,120,195,209,7
+ xorq %rcx,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %r9,%r15
+ addq %r12,%rdx
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,217,3
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm1,%xmm10
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpaddq %xmm8,%xmm2,%xmm2
+ movq %r11,%r13
+ addq %rdx,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%rdx
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %rax,%r12
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpaddq %xmm11,%xmm2,%xmm2
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ rorq $6,%r14
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ rorq $28,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,32(%rsp)
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ rorq $23,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm7,%xmm0,%xmm11
+ movq %r11,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %rcx,%r14
+ vpaddq %xmm11,%xmm3,%xmm3
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+.byte 143,72,120,195,209,7
+ xorq %rax,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,218,3
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm2,%xmm10
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpaddq %xmm8,%xmm3,%xmm3
+ movq %r9,%r13
+ addq %rbx,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%rbx
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %r10,%r12
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpaddq %xmm11,%xmm3,%xmm3
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ rorq $6,%r14
+ vpaddq -32(%rbp),%xmm3,%xmm10
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ rorq $28,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,48(%rsp)
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ rorq $23,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm0,%xmm1,%xmm11
+ movq %r9,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %rax,%r14
+ vpaddq %xmm11,%xmm4,%xmm4
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+.byte 143,72,120,195,209,7
+ xorq %r10,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %rbx,%r15
+ addq %r12,%r11
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,219,3
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm3,%xmm10
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpaddq %xmm8,%xmm4,%xmm4
+ movq %rdx,%r13
+ addq %r11,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%r11
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %r8,%r12
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpaddq %xmm11,%xmm4,%xmm4
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ rorq $6,%r14
+ vpaddq 0(%rbp),%xmm4,%xmm10
+ xorq %rax,%rdi
+ addq %r12,%r10
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ rorq $28,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,64(%rsp)
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ rorq $23,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm1,%xmm2,%xmm11
+ movq %rdx,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %r10,%r14
+ vpaddq %xmm11,%xmm5,%xmm5
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+.byte 143,72,120,195,209,7
+ xorq %r8,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %r11,%r15
+ addq %r12,%r9
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,220,3
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm4,%xmm10
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpaddq %xmm8,%xmm5,%xmm5
+ movq %rbx,%r13
+ addq %r9,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%r9
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %rcx,%r12
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpaddq %xmm11,%xmm5,%xmm5
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ rorq $6,%r14
+ vpaddq 32(%rbp),%xmm5,%xmm10
+ xorq %r10,%rdi
+ addq %r12,%r8
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ rorq $28,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,80(%rsp)
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ rorq $23,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm2,%xmm3,%xmm11
+ movq %rbx,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %r8,%r14
+ vpaddq %xmm11,%xmm6,%xmm6
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+.byte 143,72,120,195,209,7
+ xorq %rcx,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %r9,%r15
+ addq %r12,%rdx
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,221,3
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm5,%xmm10
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpaddq %xmm8,%xmm6,%xmm6
+ movq %r11,%r13
+ addq %rdx,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%rdx
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %rax,%r12
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpaddq %xmm11,%xmm6,%xmm6
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ rorq $6,%r14
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ rorq $28,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,96(%rsp)
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ rorq $23,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm3,%xmm4,%xmm11
+ movq %r11,%r12
+ rorq $5,%r14
+.byte 143,72,120,195,200,56
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpsrlq $7,%xmm8,%xmm8
+ rorq $4,%r13
+ xorq %rcx,%r14
+ vpaddq %xmm11,%xmm7,%xmm7
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+.byte 143,72,120,195,209,7
+ xorq %rax,%r12
+ rorq $6,%r14
+ vpxor %xmm9,%xmm8,%xmm8
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ rorq $14,%r13
+ andq %r15,%rdi
+.byte 143,104,120,195,222,3
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ rorq $28,%r14
+ vpsrlq $6,%xmm6,%xmm10
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpaddq %xmm8,%xmm7,%xmm7
+ movq %r9,%r13
+ addq %rbx,%r14
+.byte 143,72,120,195,203,42
+ rorq $23,%r13
+ movq %r14,%rbx
+ vpxor %xmm10,%xmm11,%xmm11
+ movq %r10,%r12
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm9,%xmm11,%xmm11
+ rorq $4,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpaddq %xmm11,%xmm7,%xmm7
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ rorq $6,%r14
+ vpaddq 96(%rbp),%xmm7,%xmm10
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ rorq $28,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,112(%rsp)
+ cmpb $0,135(%rbp)
+ jne .Lxop_00_47
+ rorq $23,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ rorq $5,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ rorq $4,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ rorq $6,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ rorq $28,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ rorq $23,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ rorq $4,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ rorq $6,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ rorq $28,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ rorq $23,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ rorq $5,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ rorq $4,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ rorq $6,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ rorq $28,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ rorq $23,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ rorq $4,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ rorq $6,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ rorq $28,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ rorq $23,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ rorq $5,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ rorq $4,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ rorq $6,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ rorq $28,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ rorq $23,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ rorq $4,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ rorq $6,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ rorq $28,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ rorq $23,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ rorq $5,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ rorq $4,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ rorq $6,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ rorq $28,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ rorq $23,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ rorq $4,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ rorq $6,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ rorq $28,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ rorq $23,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ rorq $5,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ rorq $4,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ rorq $6,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ rorq $28,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ rorq $23,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ rorq $5,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ rorq $4,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ rorq $6,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ rorq $28,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ rorq $23,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ rorq $5,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ rorq $4,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ rorq $6,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ rorq $28,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ rorq $23,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ rorq $5,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ rorq $4,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ rorq $6,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ rorq $28,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ rorq $23,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ rorq $5,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ rorq $4,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ rorq $6,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ rorq $28,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ rorq $23,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ rorq $5,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ rorq $4,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ rorq $6,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ rorq $28,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ rorq $23,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ rorq $5,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ rorq $4,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ rorq $6,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ rorq $14,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ rorq $28,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ rorq $23,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ rorq $5,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ rorq $4,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ rorq $6,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ rorq $14,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ rorq $28,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ movq 128+0(%rsp),%rdi
+ movq %r14,%rax
+
+ addq 0(%rdi),%rax
+ leaq 128(%rsi),%rsi
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ cmpq 128+16(%rsp),%rsi
+
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+ jb .Lloop_xop
+
+ movq 152(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ vzeroupper
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_xop:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha512_block_data_order_xop,.-sha512_block_data_order_xop
+.type sha512_block_data_order_avx,@function /* SHA-512 block transform, 128-bit AVX path. As used below: rdi = pointer to 8 x 64-bit state words, rsi = input pointer, rdx = count of 128-byte blocks */
+.align 64
+sha512_block_data_order_avx:
+.cfi_startproc
+.Lavx_shortcut:
+ movq %rsp,%rax /* rax = stack pointer on entry; stored in the frame below and used to unwind */
+.cfi_def_cfa_register %rax
+ pushq %rbx /* save the six callee-saved registers this routine overwrites */
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx /* rdx = block count * 16 */
+ subq $160,%rsp /* reserve 160 bytes: 128 for W+K values, 32 for saved pointers */
+ leaq (%rsi,%rdx,8),%rdx /* rdx = rsi + block count * 128 = end of input */
+ andq $-64,%rsp /* align the frame to 64 bytes (the vmovdqa stores below need 16) */
+ movq %rdi,128+0(%rsp) /* frame+128: state pointer */
+ movq %rsi,128+8(%rsp) /* frame+136: input pointer */
+ movq %rdx,128+16(%rsp) /* frame+144: end-of-input pointer */
+ movq %rax,152(%rsp) /* frame+152: stack pointer on entry */
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 /* DW_CFA_def_cfa_expression: CFA = *(rsp+152) + 8 */
+.Lprologue_avx:
+
+ vzeroupper
+ movq 0(%rdi),%rax /* load working variables: a=rax b=rbx c=rcx d=rdx e=r8 f=r9 g=r10 h=r11 */
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%rcx
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp .Lloop_avx /* jump over any alignment padding */
+.align 16
+.Lloop_avx: /* per-block setup: one iteration per 128-byte input block */
+ vmovdqa K512+1280(%rip),%xmm11 /* byte-shuffle mask kept in the K512 table; presumably the big-endian byte swap, table contents are not visible in this chunk */
+ vmovdqu 0(%rsi),%xmm0 /* unaligned loads: xmm0..xmm7 = the 16 message qwords, two per register */
+ leaq K512+128(%rip),%rbp /* rbp = constant-table cursor, biased by +128 */
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vpshufb %xmm11,%xmm0,%xmm0
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm11,%xmm1,%xmm1
+ vmovdqu 64(%rsi),%xmm4
+ vpshufb %xmm11,%xmm2,%xmm2
+ vmovdqu 80(%rsi),%xmm5
+ vpshufb %xmm11,%xmm3,%xmm3
+ vmovdqu 96(%rsi),%xmm6
+ vpshufb %xmm11,%xmm4,%xmm4
+ vmovdqu 112(%rsi),%xmm7
+ vpshufb %xmm11,%xmm5,%xmm5
+ vpaddq -128(%rbp),%xmm0,%xmm8 /* W + K for the first 16 rounds; constants are read at a 32-byte stride */
+ vpshufb %xmm11,%xmm6,%xmm6
+ vpaddq -96(%rbp),%xmm1,%xmm9
+ vpshufb %xmm11,%xmm7,%xmm7 /* last use of the mask; xmm11 is reused as a temporary below */
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ vpaddq -32(%rbp),%xmm3,%xmm11
+ vmovdqa %xmm8,0(%rsp) /* frame+0..127 holds the 16 pending W+K qwords */
+ vpaddq 0(%rbp),%xmm4,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ vpaddq 32(%rbp),%xmm5,%xmm9
+ vmovdqa %xmm10,32(%rsp)
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ vmovdqa %xmm11,48(%rsp)
+ vpaddq 96(%rbp),%xmm7,%xmm11
+ vmovdqa %xmm8,64(%rsp)
+ movq %rax,%r14 /* r14 = a; each round leaves the next a in r14 */
+ vmovdqa %xmm9,80(%rsp)
+ movq %rbx,%rdi /* rdi = b ^ c, seed for the Maj computation */
+ vmovdqa %xmm10,96(%rsp)
+ xorq %rcx,%rdi
+ vmovdqa %xmm11,112(%rsp)
+ movq %r8,%r13 /* r13 = e, input to Sigma1 */
+ jmp .Lavx_00_47
+
+.align 16
+.Lavx_00_47: /* each pass: 16 scalar rounds interleaved with the message schedule for the next 16 words */
+ addq $256,%rbp /* advance the constant cursor by 8 slots of 32 bytes */
+ vpalignr $8,%xmm0,%xmm1,%xmm8 /* xmm8 = {W[1],W[2]} */
+ shrdq $23,%r13,%r13 /* shrd with identical source and destination is a rotate right */
+ movq %r14,%rax /* commit a from r14 */
+ vpalignr $8,%xmm4,%xmm5,%xmm11 /* xmm11 = {W[9],W[10]} */
+ movq %r9,%r12 /* r12 = f */
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10 /* x >> 1 */
+ xorq %r8,%r13
+ xorq %r10,%r12 /* f ^ g */
+ vpaddq %xmm11,%xmm0,%xmm0 /* W[0..1] += W[9..10] */
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ vpsrlq $7,%xmm8,%xmm11 /* x >> 7 */
+ andq %r8,%r12 /* (f ^ g) & e */
+ xorq %r8,%r13
+ vpsllq $56,%xmm8,%xmm9 /* x << 56 */
+ addq 0(%rsp),%r11 /* round 0: h += W+K */
+ movq %rax,%r15 /* r15 = a */
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r10,%r12 /* r12 = Ch(e,f,g) = ((f ^ g) & e) ^ g */
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10 /* x >> 8 */
+ xorq %rbx,%r15 /* r15 = a ^ b, reused as b ^ c by the next round */
+ addq %r12,%r11 /* h += Ch */
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13 /* r13 = Sigma1(e) = ror14 ^ ror18 ^ ror41 */
+ andq %r15,%rdi /* (a ^ b) & (b ^ c) */
+ vpsllq $7,%xmm9,%xmm9 /* x << 63 */
+ xorq %rax,%r14
+ addq %r13,%r11 /* h += Sigma1(e) */
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi /* rdi = Maj(a,b,c) */
+ shrdq $28,%r14,%r14 /* r14 = Sigma0(a) = ror28 ^ ror34 ^ ror39 */
+ vpsrlq $6,%xmm7,%xmm11 /* y >> 6, y = {W[14],W[15]} */
+ addq %r11,%rdx /* d += T1 */
+ addq %rdi,%r11 /* h = T1 + Maj */
+ vpxor %xmm9,%xmm8,%xmm8 /* xmm8 = sigma0(x) = ror1 ^ ror8 ^ shr7 */
+ movq %rdx,%r13 /* r13 = e for the next round */
+ addq %r11,%r14 /* r14 = T1 + Maj + Sigma0 = next a */
+ vpsllq $3,%xmm7,%xmm10 /* y << 3 */
+ shrdq $23,%r13,%r13
+ movq %r14,%r11 /* registers rotate: the old h now holds the new a */
+ vpaddq %xmm8,%xmm0,%xmm0 /* += sigma0 */
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm7,%xmm9 /* y >> 19 */
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ vpsllq $42,%xmm10,%xmm10 /* y << 45 */
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 8(%rsp),%r10 /* round 1: h += W+K */
+ movq %r11,%rdi /* Maj temporaries alternate between rdi and r15 from round to round */
+ vpsrlq $42,%xmm9,%xmm9 /* y >> 61 */
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rax,%rdi
+ addq %r12,%r10
+ vpxor %xmm9,%xmm11,%xmm11 /* xmm11 = sigma1(y) = ror19 ^ ror61 ^ shr6 */
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm0,%xmm0 /* xmm0 = the next two schedule words */
+ xorq %r11,%r14
+ addq %r13,%r10
+ vpaddq -128(%rbp),%xmm0,%xmm10 /* add the matching constants (rbp was already advanced) */
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,0(%rsp) /* replace the consumed W+K pair in slot 0 */
+ vpalignr $8,%xmm1,%xmm2,%xmm8 /* same schedule step for the next pair (xmm1) */
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm5,%xmm6,%xmm11
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpaddq %xmm11,%xmm1,%xmm1
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 16(%rsp),%r9 /* round 2 */
+ movq %r10,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r11,%r15
+ addq %r12,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm0,%xmm11
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rbx,%r13
+ addq %r9,%r14
+ vpsllq $3,%xmm0,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ vpaddq %xmm8,%xmm1,%xmm1
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm0,%xmm9
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 24(%rsp),%r8 /* round 3 */
+ movq %r9,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r10,%rdi
+ addq %r12,%r8
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm1,%xmm1
+ xorq %r9,%r14
+ addq %r13,%r8
+ vpaddq -96(%rbp),%xmm1,%xmm10
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,16(%rsp)
+ vpalignr $8,%xmm2,%xmm3,%xmm8 /* schedule step for xmm2 */
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm6,%xmm7,%xmm11
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpaddq %xmm11,%xmm2,%xmm2
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rax,%r12
+ xorq %rax,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 32(%rsp),%rdx /* round 4 */
+ movq %r8,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r9,%r15
+ addq %r12,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm1,%xmm11
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r11,%r13
+ addq %rdx,%r14
+ vpsllq $3,%xmm1,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ vpaddq %xmm8,%xmm2,%xmm2
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm1,%xmm9
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 40(%rsp),%rcx /* round 5 */
+ movq %rdx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm2,%xmm2
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,32(%rsp)
+ vpalignr $8,%xmm3,%xmm4,%xmm8 /* schedule step for xmm3 */
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm7,%xmm0,%xmm11
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpaddq %xmm11,%xmm3,%xmm3
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r10,%r12
+ xorq %r10,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 48(%rsp),%rbx /* round 6 */
+ movq %rcx,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm2,%xmm11
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r9,%r13
+ addq %rbx,%r14
+ vpsllq $3,%xmm2,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ vpaddq %xmm8,%xmm3,%xmm3
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm2,%xmm9
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 56(%rsp),%rax /* round 7 */
+ movq %rbx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm3,%xmm3
+ xorq %rbx,%r14
+ addq %r13,%rax
+ vpaddq -32(%rbp),%xmm3,%xmm10
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,48(%rsp)
+ vpalignr $8,%xmm4,%xmm5,%xmm8 /* schedule step for xmm4 */
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm0,%xmm1,%xmm11
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpaddq %xmm11,%xmm4,%xmm4
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r8,%r12
+ xorq %r8,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 64(%rsp),%r11 /* round 8 */
+ movq %rax,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rbx,%r15
+ addq %r12,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm3,%xmm11
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rdx,%r13
+ addq %r11,%r14
+ vpsllq $3,%xmm3,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ vpaddq %xmm8,%xmm4,%xmm4
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm3,%xmm9
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 72(%rsp),%r10 /* round 9 */
+ movq %r11,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rax,%rdi
+ addq %r12,%r10
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm4,%xmm4
+ xorq %r11,%r14
+ addq %r13,%r10
+ vpaddq 0(%rbp),%xmm4,%xmm10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,64(%rsp)
+ vpalignr $8,%xmm5,%xmm6,%xmm8 /* schedule step for xmm5 */
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm1,%xmm2,%xmm11
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpaddq %xmm11,%xmm5,%xmm5
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 80(%rsp),%r9 /* round 10 */
+ movq %r10,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r11,%r15
+ addq %r12,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm4,%xmm11
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rbx,%r13
+ addq %r9,%r14
+ vpsllq $3,%xmm4,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ vpaddq %xmm8,%xmm5,%xmm5
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm4,%xmm9
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 88(%rsp),%r8 /* round 11 */
+ movq %r9,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r10,%rdi
+ addq %r12,%r8
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm5,%xmm5
+ xorq %r9,%r14
+ addq %r13,%r8
+ vpaddq 32(%rbp),%xmm5,%xmm10
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,80(%rsp)
+ vpalignr $8,%xmm6,%xmm7,%xmm8 /* schedule step for xmm6 */
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm2,%xmm3,%xmm11
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpaddq %xmm11,%xmm6,%xmm6
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rax,%r12
+ xorq %rax,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 96(%rsp),%rdx /* round 12 */
+ movq %r8,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r9,%r15
+ addq %r12,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm5,%xmm11
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r11,%r13
+ addq %rdx,%r14
+ vpsllq $3,%xmm5,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ vpaddq %xmm8,%xmm6,%xmm6
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm5,%xmm9
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 104(%rsp),%rcx /* round 13 */
+ movq %rdx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm6,%xmm6
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,96(%rsp)
+ vpalignr $8,%xmm7,%xmm0,%xmm8 /* schedule step for xmm7 */
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm3,%xmm4,%xmm11
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpaddq %xmm11,%xmm7,%xmm7
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r10,%r12
+ xorq %r10,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 112(%rsp),%rbx /* round 14 */
+ movq %rcx,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm6,%xmm11
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r9,%r13
+ addq %rbx,%r14
+ vpsllq $3,%xmm6,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ vpaddq %xmm8,%xmm7,%xmm7
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm6,%xmm9
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 120(%rsp),%rax /* round 15 */
+ movq %rbx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm7,%xmm7
+ xorq %rbx,%r14
+ addq %r13,%rax
+ vpaddq 96(%rbp),%xmm7,%xmm10
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,112(%rsp)
+ cmpb $0,135(%rbp) /* test the table byte at rbp+135; presumably an end-of-constants sentinel, the table layout is not visible in this chunk */
+ jne .Lavx_00_47 /* non-zero: run another schedule-plus-rounds pass */
+ shrdq $23,%r13,%r13 /* last 16 rounds: consume the stored W+K values, no further scheduling */
+ movq %r14,%rax
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 0(%rsp),%r11 /* round 0 of the final 16 */
+ movq %rax,%r15
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 8(%rsp),%r10 /* round 1 */
+ movq %r11,%rdi
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 16(%rsp),%r9 /* round 2 */
+ movq %r10,%r15
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 24(%rsp),%r8 /* round 3 */
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 32(%rsp),%rdx /* round 4 */
+ movq %r8,%r15
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 40(%rsp),%rcx /* round 5 */
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 48(%rsp),%rbx /* round 6 */
+ movq %rcx,%r15
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 56(%rsp),%rax /* round 7 */
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 64(%rsp),%r11 /* round 8 */
+ movq %rax,%r15
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 72(%rsp),%r10 /* round 9 */
+ movq %r11,%rdi
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 80(%rsp),%r9 /* round 10 */
+ movq %r10,%r15
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 88(%rsp),%r8 /* round 11 */
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 96(%rsp),%rdx /* round 12 */
+ movq %r8,%r15
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 104(%rsp),%rcx /* round 13 */
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 112(%rsp),%rbx /* round 14 */
+ movq %rcx,%r15
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 120(%rsp),%rax /* round 15 */
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ movq 128+0(%rsp),%rdi /* reload the state pointer (rdi was used as a Maj temporary) */
+ movq %r14,%rax /* commit the final a */
+
+ addq 0(%rdi),%rax /* add the previous state words to the working variables */
+ leaq 128(%rsi),%rsi /* advance the input pointer by one 128-byte block */
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ cmpq 128+16(%rsp),%rsi /* compare with the end pointer; the mov stores below leave the flags intact */
+
+ movq %rax,0(%rdi) /* write the updated state back */
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+ jb .Lloop_avx /* unsigned compare: more input remains */
+
+ movq 152(%rsp),%rsi /* rsi = stack pointer value saved on entry */
+.cfi_def_cfa %rsi,8
+ vzeroupper
+ movq -48(%rsi),%r15 /* restore callee-saved registers from the slots the pushes filled */
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp /* release the frame: rsp = value on entry */
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3 /* encoded "rep ret" (F3 = REP prefix, C3 = RET) */
+.cfi_endproc
+.size sha512_block_data_order_avx,.-sha512_block_data_order_avx
+.type sha512_block_data_order_avx2,@function
+.align 64
+sha512_block_data_order_avx2: /* AVX2/rorx/andn code path; as used below: %rdi = eight 64-bit state words, %rsi = input, %rdx = number of 128-byte blocks */
+.cfi_startproc
+.Lavx2_shortcut:
+ movq %rsp,%rax /* %rax = entry %rsp; stored in the frame below and reloaded by the epilogue */
+.cfi_def_cfa_register %rax
+ pushq %rbx /* save the six callee-saved registers this routine overwrites */
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ subq $1312,%rsp /* reserve scratch space before aligning */
+ shlq $4,%rdx /* count*16; the scale-8 lea below completes count*128 */
+ andq $-2048,%rsp /* round %rsp down to a 2048-byte boundary */
+ leaq (%rsi,%rdx,8),%rdx /* %rdx = %rsi + 128*count = end of input */
+ addq $1152,%rsp /* frame base = boundary + 1152 (a multiple of 32, as the vmovdqa stack stores require); later code walks %rsp down and restores it with leaq 1152(%rsp) */
+ movq %rdi,128+0(%rsp) /* frame slot: state pointer */
+ movq %rsi,128+8(%rsp) /* frame slot: input pointer */
+ movq %rdx,128+16(%rsp) /* frame slot: end-of-input pointer */
+ movq %rax,152(%rsp) /* frame slot: entry %rsp; the CFA expression on the next line decodes to *(%rsp+152)+8 */
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue_avx2:
+
+ vzeroupper
+ subq $-128,%rsi /* %rsi += 128: from here on the input pointer is kept one block ahead */
+ movq 0(%rdi),%rax /* load the eight state words into %rax,%rbx,%rcx,%rdx,%r8-%r11 */
+ movq %rsi,%r12 /* %r12 = address of the second block of the pair (read into the high lanes at .Loop_avx2) */
+ movq 8(%rdi),%rbx
+ cmpq %rdx,%rsi /* advanced pointer == end: only one block is left */
+ movq 16(%rdi),%rcx
+ cmoveq %rsp,%r12 /* in that case take the second block from the stack frame instead of reading past the input */
+ movq 24(%rdi),%rdx /* %rdx now holds a state word; the end pointer stays in 128+16(%rsp) */
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp .Loop_avx2
+.align 16
+.Loop_avx2: /* entered once per pair of blocks: low 128-bit lanes come from %rsi-128, high lanes from %r12 */
+ vmovdqu -128(%rsi),%xmm0 /* low lanes: the 128 bytes of the block at %rsi-128, 16 bytes per register */
+ vmovdqu -128+16(%rsi),%xmm1
+ vmovdqu -128+32(%rsi),%xmm2
+ leaq K512+128(%rip),%rbp /* %rbp = K512 table pointer, offset by 128 so the entries used below are reached with -128..+96 */
+ vmovdqu -128+48(%rsi),%xmm3
+ vmovdqu -128+64(%rsi),%xmm4
+ vmovdqu -128+80(%rsi),%xmm5
+ vmovdqu -128+96(%rsi),%xmm6
+ vmovdqu -128+112(%rsi),%xmm7
+
+ vmovdqa 1152(%rbp),%ymm10 /* vpshufb control read from K512+1280; presumably the byte-swap mask (K512 is defined outside this excerpt - confirm) */
+ vinserti128 $1,(%r12),%ymm0,%ymm0 /* high lanes: the 128 bytes at %r12 (next block, or the stack frame when only one block is left) */
+ vinserti128 $1,16(%r12),%ymm1,%ymm1
+ vpshufb %ymm10,%ymm0,%ymm0 /* reorder the bytes of both lanes with the mask in %ymm10 */
+ vinserti128 $1,32(%r12),%ymm2,%ymm2
+ vpshufb %ymm10,%ymm1,%ymm1
+ vinserti128 $1,48(%r12),%ymm3,%ymm3
+ vpshufb %ymm10,%ymm2,%ymm2
+ vinserti128 $1,64(%r12),%ymm4,%ymm4
+ vpshufb %ymm10,%ymm3,%ymm3
+ vinserti128 $1,80(%r12),%ymm5,%ymm5
+ vpshufb %ymm10,%ymm4,%ymm4
+ vinserti128 $1,96(%r12),%ymm6,%ymm6
+ vpshufb %ymm10,%ymm5,%ymm5
+ vinserti128 $1,112(%r12),%ymm7,%ymm7
+
+ vpaddq -128(%rbp),%ymm0,%ymm8 /* message words + the first eight 32-byte table entries (K512+0 .. K512+224) */
+ vpshufb %ymm10,%ymm6,%ymm6
+ vpaddq -96(%rbp),%ymm1,%ymm9
+ vpshufb %ymm10,%ymm7,%ymm7 /* last use of the mask; %ymm10 is reused as a temporary on the next line */
+ vpaddq -64(%rbp),%ymm2,%ymm10
+ vpaddq -32(%rbp),%ymm3,%ymm11
+ vmovdqa %ymm8,0(%rsp) /* spill the sums to the stack; the scalar rounds read them back with addq N(%rsp) */
+ vpaddq 0(%rbp),%ymm4,%ymm8
+ vmovdqa %ymm9,32(%rsp)
+ vpaddq 32(%rbp),%ymm5,%ymm9
+ vmovdqa %ymm10,64(%rsp)
+ vpaddq 64(%rbp),%ymm6,%ymm10
+ vmovdqa %ymm11,96(%rsp)
+
+ movq 152(%rsp),%rdi /* %rdi = entry %rsp saved by the prologue; it is the CFA base while %rsp moves */
+.cfi_def_cfa %rdi,8
+ leaq -128(%rsp),%rsp /* step %rsp down to the next 128-byte slab */
+
+
+
+ movq %rdi,-8(%rsp) /* keep a copy of the entry %rsp just below the new %rsp; the CFA expression on the next line decodes to *(%rsp-8)+8 */
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ vpaddq 96(%rbp),%ymm7,%ymm11
+ vmovdqa %ymm8,0(%rsp) /* second slab of four sums */
+ xorq %r14,%r14 /* %r14 = 0; the first round adds %r14 into %rax via leaq (%rax,%r14,1) */
+ vmovdqa %ymm9,32(%rsp)
+ movq %rbx,%rdi /* %rdi = %rbx ^ %rcx (completed two lines down), consumed by the first round */
+ vmovdqa %ymm10,64(%rsp)
+ xorq %rcx,%rdi
+ vmovdqa %ymm11,96(%rsp)
+ movq %r9,%r12 /* %r12 = %r9, consumed by the first round */
+ addq $32*8,%rbp /* advance the table pointer past the eight entries just used */
+ jmp .Lavx2_00_47
+
+.align 16
+.Lavx2_00_47: /* loop that interleaves vector message-schedule updates with scalar rounds; this same stack step appears again halfway through the body */
+ leaq -128(%rsp),%rsp /* drop %rsp by one 128-byte slab; the saved entry-%rsp copy that was at -8(%rsp) is now at 120(%rsp) */
+.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
+
+ pushq 128-8(%rsp) /* copy that saved value to the new top of stack (the operand address is formed before the push adjusts %rsp) */
+.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08
+ leaq 8(%rsp),%rsp /* undo the push adjustment: the copy is again at -8(%rsp), matching the CFA expression on the next line */
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ vpalignr $8,%ymm0,%ymm1,%ymm8
+ addq 0+256(%rsp),%r11
+ andq %r8,%r12
+ rorxq $41,%r8,%r13
+ vpalignr $8,%ymm4,%ymm5,%ymm11
+ rorxq $18,%r8,%r15
+ leaq (%rax,%r14,1),%rax
+ leaq (%r11,%r12,1),%r11
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %r10,%r8,%r12
+ xorq %r15,%r13
+ rorxq $14,%r8,%r14
+ vpaddq %ymm11,%ymm0,%ymm0
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%r11,%r12,1),%r11
+ xorq %r14,%r13
+ movq %rax,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%rax,%r12
+ leaq (%r11,%r13,1),%r11
+ xorq %rbx,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%rax,%r14
+ rorxq $28,%rax,%r13
+ leaq (%rdx,%r11,1),%rdx
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rbx,%rdi
+ vpsrlq $6,%ymm7,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%r11,%rdi,1),%r11
+ movq %r8,%r12
+ vpsllq $3,%ymm7,%ymm10
+ vpaddq %ymm8,%ymm0,%ymm0
+ addq 8+256(%rsp),%r10
+ andq %rdx,%r12
+ rorxq $41,%rdx,%r13
+ vpsrlq $19,%ymm7,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%rdx,%rdi
+ leaq (%r11,%r14,1),%r11
+ leaq (%r10,%r12,1),%r10
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %r9,%rdx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rdx,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%r10,%r12,1),%r10
+ xorq %r14,%r13
+ movq %r11,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%r11,%r12
+ leaq (%r10,%r13,1),%r10
+ xorq %rax,%rdi
+ vpaddq %ymm11,%ymm0,%ymm0
+ rorxq $34,%r11,%r14
+ rorxq $28,%r11,%r13
+ leaq (%rcx,%r10,1),%rcx
+ vpaddq -128(%rbp),%ymm0,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rax,%r15
+ xorq %r13,%r14
+ leaq (%r10,%r15,1),%r10
+ movq %rdx,%r12
+ vmovdqa %ymm10,0(%rsp)
+ vpalignr $8,%ymm1,%ymm2,%ymm8
+ addq 32+256(%rsp),%r9
+ andq %rcx,%r12
+ rorxq $41,%rcx,%r13
+ vpalignr $8,%ymm5,%ymm6,%ymm11
+ rorxq $18,%rcx,%r15
+ leaq (%r10,%r14,1),%r10
+ leaq (%r9,%r12,1),%r9
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %r8,%rcx,%r12
+ xorq %r15,%r13
+ rorxq $14,%rcx,%r14
+ vpaddq %ymm11,%ymm1,%ymm1
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%r9,%r12,1),%r9
+ xorq %r14,%r13
+ movq %r10,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%r10,%r12
+ leaq (%r9,%r13,1),%r9
+ xorq %r11,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%r10,%r14
+ rorxq $28,%r10,%r13
+ leaq (%rbx,%r9,1),%rbx
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r11,%rdi
+ vpsrlq $6,%ymm0,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%r9,%rdi,1),%r9
+ movq %rcx,%r12
+ vpsllq $3,%ymm0,%ymm10
+ vpaddq %ymm8,%ymm1,%ymm1
+ addq 40+256(%rsp),%r8
+ andq %rbx,%r12
+ rorxq $41,%rbx,%r13
+ vpsrlq $19,%ymm0,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%rbx,%rdi
+ leaq (%r9,%r14,1),%r9
+ leaq (%r8,%r12,1),%r8
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %rdx,%rbx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rbx,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%r8,%r12,1),%r8
+ xorq %r14,%r13
+ movq %r9,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%r9,%r12
+ leaq (%r8,%r13,1),%r8
+ xorq %r10,%rdi
+ vpaddq %ymm11,%ymm1,%ymm1
+ rorxq $34,%r9,%r14
+ rorxq $28,%r9,%r13
+ leaq (%rax,%r8,1),%rax
+ vpaddq -96(%rbp),%ymm1,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r10,%r15
+ xorq %r13,%r14
+ leaq (%r8,%r15,1),%r8
+ movq %rbx,%r12
+ vmovdqa %ymm10,32(%rsp)
+ vpalignr $8,%ymm2,%ymm3,%ymm8
+ addq 64+256(%rsp),%rdx
+ andq %rax,%r12
+ rorxq $41,%rax,%r13
+ vpalignr $8,%ymm6,%ymm7,%ymm11
+ rorxq $18,%rax,%r15
+ leaq (%r8,%r14,1),%r8
+ leaq (%rdx,%r12,1),%rdx
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %rcx,%rax,%r12
+ xorq %r15,%r13
+ rorxq $14,%rax,%r14
+ vpaddq %ymm11,%ymm2,%ymm2
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%rdx,%r12,1),%rdx
+ xorq %r14,%r13
+ movq %r8,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%r8,%r12
+ leaq (%rdx,%r13,1),%rdx
+ xorq %r9,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%r8,%r14
+ rorxq $28,%r8,%r13
+ leaq (%r11,%rdx,1),%r11
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r9,%rdi
+ vpsrlq $6,%ymm1,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%rdx,%rdi,1),%rdx
+ movq %rax,%r12
+ vpsllq $3,%ymm1,%ymm10
+ vpaddq %ymm8,%ymm2,%ymm2
+ addq 72+256(%rsp),%rcx
+ andq %r11,%r12
+ rorxq $41,%r11,%r13
+ vpsrlq $19,%ymm1,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%r11,%rdi
+ leaq (%rdx,%r14,1),%rdx
+ leaq (%rcx,%r12,1),%rcx
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %rbx,%r11,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r11,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%rcx,%r12,1),%rcx
+ xorq %r14,%r13
+ movq %rdx,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%rdx,%r12
+ leaq (%rcx,%r13,1),%rcx
+ xorq %r8,%rdi
+ vpaddq %ymm11,%ymm2,%ymm2
+ rorxq $34,%rdx,%r14
+ rorxq $28,%rdx,%r13
+ leaq (%r10,%rcx,1),%r10
+ vpaddq -64(%rbp),%ymm2,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r8,%r15
+ xorq %r13,%r14
+ leaq (%rcx,%r15,1),%rcx
+ movq %r11,%r12
+ vmovdqa %ymm10,64(%rsp)
+ vpalignr $8,%ymm3,%ymm4,%ymm8
+ addq 96+256(%rsp),%rbx
+ andq %r10,%r12
+ rorxq $41,%r10,%r13
+ vpalignr $8,%ymm7,%ymm0,%ymm11
+ rorxq $18,%r10,%r15
+ leaq (%rcx,%r14,1),%rcx
+ leaq (%rbx,%r12,1),%rbx
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %rax,%r10,%r12
+ xorq %r15,%r13
+ rorxq $14,%r10,%r14
+ vpaddq %ymm11,%ymm3,%ymm3
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%rbx,%r12,1),%rbx
+ xorq %r14,%r13
+ movq %rcx,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%rcx,%r12
+ leaq (%rbx,%r13,1),%rbx
+ xorq %rdx,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%rcx,%r14
+ rorxq $28,%rcx,%r13
+ leaq (%r9,%rbx,1),%r9
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rdx,%rdi
+ vpsrlq $6,%ymm2,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%rbx,%rdi,1),%rbx
+ movq %r10,%r12
+ vpsllq $3,%ymm2,%ymm10
+ vpaddq %ymm8,%ymm3,%ymm3
+ addq 104+256(%rsp),%rax
+ andq %r9,%r12
+ rorxq $41,%r9,%r13
+ vpsrlq $19,%ymm2,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%r9,%rdi
+ leaq (%rbx,%r14,1),%rbx
+ leaq (%rax,%r12,1),%rax
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %r11,%r9,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r9,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%rax,%r12,1),%rax
+ xorq %r14,%r13
+ movq %rbx,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%rbx,%r12
+ leaq (%rax,%r13,1),%rax
+ xorq %rcx,%rdi
+ vpaddq %ymm11,%ymm3,%ymm3
+ rorxq $34,%rbx,%r14
+ rorxq $28,%rbx,%r13
+ leaq (%r8,%rax,1),%r8
+ vpaddq -32(%rbp),%ymm3,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rcx,%r15
+ xorq %r13,%r14
+ leaq (%rax,%r15,1),%rax
+ movq %r9,%r12
+ vmovdqa %ymm10,96(%rsp)
+ leaq -128(%rsp),%rsp
+.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
+
+ pushq 128-8(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08
+ leaq 8(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ vpalignr $8,%ymm4,%ymm5,%ymm8
+ addq 0+256(%rsp),%r11
+ andq %r8,%r12
+ rorxq $41,%r8,%r13
+ vpalignr $8,%ymm0,%ymm1,%ymm11
+ rorxq $18,%r8,%r15
+ leaq (%rax,%r14,1),%rax
+ leaq (%r11,%r12,1),%r11
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %r10,%r8,%r12
+ xorq %r15,%r13
+ rorxq $14,%r8,%r14
+ vpaddq %ymm11,%ymm4,%ymm4
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%r11,%r12,1),%r11
+ xorq %r14,%r13
+ movq %rax,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%rax,%r12
+ leaq (%r11,%r13,1),%r11
+ xorq %rbx,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%rax,%r14
+ rorxq $28,%rax,%r13
+ leaq (%rdx,%r11,1),%rdx
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rbx,%rdi
+ vpsrlq $6,%ymm3,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%r11,%rdi,1),%r11
+ movq %r8,%r12
+ vpsllq $3,%ymm3,%ymm10
+ vpaddq %ymm8,%ymm4,%ymm4
+ addq 8+256(%rsp),%r10
+ andq %rdx,%r12
+ rorxq $41,%rdx,%r13
+ vpsrlq $19,%ymm3,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%rdx,%rdi
+ leaq (%r11,%r14,1),%r11
+ leaq (%r10,%r12,1),%r10
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %r9,%rdx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rdx,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%r10,%r12,1),%r10
+ xorq %r14,%r13
+ movq %r11,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%r11,%r12
+ leaq (%r10,%r13,1),%r10
+ xorq %rax,%rdi
+ vpaddq %ymm11,%ymm4,%ymm4
+ rorxq $34,%r11,%r14
+ rorxq $28,%r11,%r13
+ leaq (%rcx,%r10,1),%rcx
+ vpaddq 0(%rbp),%ymm4,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rax,%r15
+ xorq %r13,%r14
+ leaq (%r10,%r15,1),%r10
+ movq %rdx,%r12
+ vmovdqa %ymm10,0(%rsp)
+ vpalignr $8,%ymm5,%ymm6,%ymm8
+ addq 32+256(%rsp),%r9
+ andq %rcx,%r12
+ rorxq $41,%rcx,%r13
+ vpalignr $8,%ymm1,%ymm2,%ymm11
+ rorxq $18,%rcx,%r15
+ leaq (%r10,%r14,1),%r10
+ leaq (%r9,%r12,1),%r9
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %r8,%rcx,%r12
+ xorq %r15,%r13
+ rorxq $14,%rcx,%r14
+ vpaddq %ymm11,%ymm5,%ymm5
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%r9,%r12,1),%r9
+ xorq %r14,%r13
+ movq %r10,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%r10,%r12
+ leaq (%r9,%r13,1),%r9
+ xorq %r11,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%r10,%r14
+ rorxq $28,%r10,%r13
+ leaq (%rbx,%r9,1),%rbx
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r11,%rdi
+ vpsrlq $6,%ymm4,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%r9,%rdi,1),%r9
+ movq %rcx,%r12
+ vpsllq $3,%ymm4,%ymm10
+ vpaddq %ymm8,%ymm5,%ymm5
+ addq 40+256(%rsp),%r8
+ andq %rbx,%r12
+ rorxq $41,%rbx,%r13
+ vpsrlq $19,%ymm4,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%rbx,%rdi
+ leaq (%r9,%r14,1),%r9
+ leaq (%r8,%r12,1),%r8
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %rdx,%rbx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rbx,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%r8,%r12,1),%r8
+ xorq %r14,%r13
+ movq %r9,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%r9,%r12
+ leaq (%r8,%r13,1),%r8
+ xorq %r10,%rdi
+ vpaddq %ymm11,%ymm5,%ymm5
+ rorxq $34,%r9,%r14
+ rorxq $28,%r9,%r13
+ leaq (%rax,%r8,1),%rax
+ vpaddq 32(%rbp),%ymm5,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r10,%r15
+ xorq %r13,%r14
+ leaq (%r8,%r15,1),%r8
+ movq %rbx,%r12
+ vmovdqa %ymm10,32(%rsp)
+ vpalignr $8,%ymm6,%ymm7,%ymm8
+ addq 64+256(%rsp),%rdx
+ andq %rax,%r12
+ rorxq $41,%rax,%r13
+ vpalignr $8,%ymm2,%ymm3,%ymm11
+ rorxq $18,%rax,%r15
+ leaq (%r8,%r14,1),%r8
+ leaq (%rdx,%r12,1),%rdx
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %rcx,%rax,%r12
+ xorq %r15,%r13
+ rorxq $14,%rax,%r14
+ vpaddq %ymm11,%ymm6,%ymm6
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%rdx,%r12,1),%rdx
+ xorq %r14,%r13
+ movq %r8,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%r8,%r12
+ leaq (%rdx,%r13,1),%rdx
+ xorq %r9,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%r8,%r14
+ rorxq $28,%r8,%r13
+ leaq (%r11,%rdx,1),%r11
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r9,%rdi
+ vpsrlq $6,%ymm5,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%rdx,%rdi,1),%rdx
+ movq %rax,%r12
+ vpsllq $3,%ymm5,%ymm10
+ vpaddq %ymm8,%ymm6,%ymm6
+ addq 72+256(%rsp),%rcx
+ andq %r11,%r12
+ rorxq $41,%r11,%r13
+ vpsrlq $19,%ymm5,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%r11,%rdi
+ leaq (%rdx,%r14,1),%rdx
+ leaq (%rcx,%r12,1),%rcx
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %rbx,%r11,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r11,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%rcx,%r12,1),%rcx
+ xorq %r14,%r13
+ movq %rdx,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%rdx,%r12
+ leaq (%rcx,%r13,1),%rcx
+ xorq %r8,%rdi
+ vpaddq %ymm11,%ymm6,%ymm6
+ rorxq $34,%rdx,%r14
+ rorxq $28,%rdx,%r13
+ leaq (%r10,%rcx,1),%r10
+ vpaddq 64(%rbp),%ymm6,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r8,%r15
+ xorq %r13,%r14
+ leaq (%rcx,%r15,1),%rcx
+ movq %r11,%r12
+ vmovdqa %ymm10,64(%rsp)
+ vpalignr $8,%ymm7,%ymm0,%ymm8
+ addq 96+256(%rsp),%rbx
+ andq %r10,%r12
+ rorxq $41,%r10,%r13
+ vpalignr $8,%ymm3,%ymm4,%ymm11
+ rorxq $18,%r10,%r15
+ leaq (%rcx,%r14,1),%rcx
+ leaq (%rbx,%r12,1),%rbx
+ vpsrlq $1,%ymm8,%ymm10
+ andnq %rax,%r10,%r12
+ xorq %r15,%r13
+ rorxq $14,%r10,%r14
+ vpaddq %ymm11,%ymm7,%ymm7
+ vpsrlq $7,%ymm8,%ymm11
+ leaq (%rbx,%r12,1),%rbx
+ xorq %r14,%r13
+ movq %rcx,%r15
+ vpsllq $56,%ymm8,%ymm9
+ vpxor %ymm10,%ymm11,%ymm8
+ rorxq $39,%rcx,%r12
+ leaq (%rbx,%r13,1),%rbx
+ xorq %rdx,%r15
+ vpsrlq $7,%ymm10,%ymm10
+ vpxor %ymm9,%ymm8,%ymm8
+ rorxq $34,%rcx,%r14
+ rorxq $28,%rcx,%r13
+ leaq (%r9,%rbx,1),%r9
+ vpsllq $7,%ymm9,%ymm9
+ vpxor %ymm10,%ymm8,%ymm8
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rdx,%rdi
+ vpsrlq $6,%ymm6,%ymm11
+ vpxor %ymm9,%ymm8,%ymm8
+ xorq %r13,%r14
+ leaq (%rbx,%rdi,1),%rbx
+ movq %r10,%r12
+ vpsllq $3,%ymm6,%ymm10
+ vpaddq %ymm8,%ymm7,%ymm7
+ addq 104+256(%rsp),%rax
+ andq %r9,%r12
+ rorxq $41,%r9,%r13
+ vpsrlq $19,%ymm6,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ rorxq $18,%r9,%rdi
+ leaq (%rbx,%r14,1),%rbx
+ leaq (%rax,%r12,1),%rax
+ vpsllq $42,%ymm10,%ymm10
+ vpxor %ymm9,%ymm11,%ymm11
+ andnq %r11,%r9,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r9,%r14
+ vpsrlq $42,%ymm9,%ymm9
+ vpxor %ymm10,%ymm11,%ymm11
+ leaq (%rax,%r12,1),%rax
+ xorq %r14,%r13
+ movq %rbx,%rdi
+ vpxor %ymm9,%ymm11,%ymm11
+ rorxq $39,%rbx,%r12
+ leaq (%rax,%r13,1),%rax
+ xorq %rcx,%rdi
+ vpaddq %ymm11,%ymm7,%ymm7
+ rorxq $34,%rbx,%r14
+ rorxq $28,%rbx,%r13
+ leaq (%r8,%rax,1),%r8
+ vpaddq 96(%rbp),%ymm7,%ymm10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rcx,%r15
+ xorq %r13,%r14
+ leaq (%rax,%r15,1),%rax
+ movq %r9,%r12
+ vmovdqa %ymm10,96(%rsp)
+ leaq 256(%rbp),%rbp /* advance the table pointer by the 256 bytes consumed in this pass */
+ cmpb $0,-121(%rbp) /* test one table byte at (new %rbp)-121; NOTE(review): presumably zero only once the pointer has passed the last constants - confirm against the K512 layout */
+ jne .Lavx2_00_47 /* non-zero byte: run another schedule+round pass */
+ addq 0+128(%rsp),%r11
+ andq %r8,%r12
+ rorxq $41,%r8,%r13
+ rorxq $18,%r8,%r15
+ leaq (%rax,%r14,1),%rax
+ leaq (%r11,%r12,1),%r11
+ andnq %r10,%r8,%r12
+ xorq %r15,%r13
+ rorxq $14,%r8,%r14
+ leaq (%r11,%r12,1),%r11
+ xorq %r14,%r13
+ movq %rax,%r15
+ rorxq $39,%rax,%r12
+ leaq (%r11,%r13,1),%r11
+ xorq %rbx,%r15
+ rorxq $34,%rax,%r14
+ rorxq $28,%rax,%r13
+ leaq (%rdx,%r11,1),%rdx
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rbx,%rdi
+ xorq %r13,%r14
+ leaq (%r11,%rdi,1),%r11
+ movq %r8,%r12
+ addq 8+128(%rsp),%r10
+ andq %rdx,%r12
+ rorxq $41,%rdx,%r13
+ rorxq $18,%rdx,%rdi
+ leaq (%r11,%r14,1),%r11
+ leaq (%r10,%r12,1),%r10
+ andnq %r9,%rdx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rdx,%r14
+ leaq (%r10,%r12,1),%r10
+ xorq %r14,%r13
+ movq %r11,%rdi
+ rorxq $39,%r11,%r12
+ leaq (%r10,%r13,1),%r10
+ xorq %rax,%rdi
+ rorxq $34,%r11,%r14
+ rorxq $28,%r11,%r13
+ leaq (%rcx,%r10,1),%rcx
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rax,%r15
+ xorq %r13,%r14
+ leaq (%r10,%r15,1),%r10
+ movq %rdx,%r12
+ addq 32+128(%rsp),%r9
+ andq %rcx,%r12
+ rorxq $41,%rcx,%r13
+ rorxq $18,%rcx,%r15
+ leaq (%r10,%r14,1),%r10
+ leaq (%r9,%r12,1),%r9
+ andnq %r8,%rcx,%r12
+ xorq %r15,%r13
+ rorxq $14,%rcx,%r14
+ leaq (%r9,%r12,1),%r9
+ xorq %r14,%r13
+ movq %r10,%r15
+ rorxq $39,%r10,%r12
+ leaq (%r9,%r13,1),%r9
+ xorq %r11,%r15
+ rorxq $34,%r10,%r14
+ rorxq $28,%r10,%r13
+ leaq (%rbx,%r9,1),%rbx
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r11,%rdi
+ xorq %r13,%r14
+ leaq (%r9,%rdi,1),%r9
+ movq %rcx,%r12
+ addq 40+128(%rsp),%r8
+ andq %rbx,%r12
+ rorxq $41,%rbx,%r13
+ rorxq $18,%rbx,%rdi
+ leaq (%r9,%r14,1),%r9
+ leaq (%r8,%r12,1),%r8
+ andnq %rdx,%rbx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rbx,%r14
+ leaq (%r8,%r12,1),%r8
+ xorq %r14,%r13
+ movq %r9,%rdi
+ rorxq $39,%r9,%r12
+ leaq (%r8,%r13,1),%r8
+ xorq %r10,%rdi
+ rorxq $34,%r9,%r14
+ rorxq $28,%r9,%r13
+ leaq (%rax,%r8,1),%rax
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r10,%r15
+ xorq %r13,%r14
+ leaq (%r8,%r15,1),%r8
+ movq %rbx,%r12
+ addq 64+128(%rsp),%rdx
+ andq %rax,%r12
+ rorxq $41,%rax,%r13
+ rorxq $18,%rax,%r15
+ leaq (%r8,%r14,1),%r8
+ leaq (%rdx,%r12,1),%rdx
+ andnq %rcx,%rax,%r12
+ xorq %r15,%r13
+ rorxq $14,%rax,%r14
+ leaq (%rdx,%r12,1),%rdx
+ xorq %r14,%r13
+ movq %r8,%r15
+ rorxq $39,%r8,%r12
+ leaq (%rdx,%r13,1),%rdx
+ xorq %r9,%r15
+ rorxq $34,%r8,%r14
+ rorxq $28,%r8,%r13
+ leaq (%r11,%rdx,1),%r11
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r9,%rdi
+ xorq %r13,%r14
+ leaq (%rdx,%rdi,1),%rdx
+ movq %rax,%r12
+ addq 72+128(%rsp),%rcx
+ andq %r11,%r12
+ rorxq $41,%r11,%r13
+ rorxq $18,%r11,%rdi
+ leaq (%rdx,%r14,1),%rdx
+ leaq (%rcx,%r12,1),%rcx
+ andnq %rbx,%r11,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r11,%r14
+ leaq (%rcx,%r12,1),%rcx
+ xorq %r14,%r13
+ movq %rdx,%rdi
+ rorxq $39,%rdx,%r12
+ leaq (%rcx,%r13,1),%rcx
+ xorq %r8,%rdi
+ rorxq $34,%rdx,%r14
+ rorxq $28,%rdx,%r13
+ leaq (%r10,%rcx,1),%r10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r8,%r15
+ xorq %r13,%r14
+ leaq (%rcx,%r15,1),%rcx
+ movq %r11,%r12
+ addq 96+128(%rsp),%rbx
+ andq %r10,%r12
+ rorxq $41,%r10,%r13
+ rorxq $18,%r10,%r15
+ leaq (%rcx,%r14,1),%rcx
+ leaq (%rbx,%r12,1),%rbx
+ andnq %rax,%r10,%r12
+ xorq %r15,%r13
+ rorxq $14,%r10,%r14
+ leaq (%rbx,%r12,1),%rbx
+ xorq %r14,%r13
+ movq %rcx,%r15
+ rorxq $39,%rcx,%r12
+ leaq (%rbx,%r13,1),%rbx
+ xorq %rdx,%r15
+ rorxq $34,%rcx,%r14
+ rorxq $28,%rcx,%r13
+ leaq (%r9,%rbx,1),%r9
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rdx,%rdi
+ xorq %r13,%r14
+ leaq (%rbx,%rdi,1),%rbx
+ movq %r10,%r12
+ addq 104+128(%rsp),%rax
+ andq %r9,%r12
+ rorxq $41,%r9,%r13
+ rorxq $18,%r9,%rdi
+ leaq (%rbx,%r14,1),%rbx
+ leaq (%rax,%r12,1),%rax
+ andnq %r11,%r9,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r9,%r14
+ leaq (%rax,%r12,1),%rax
+ xorq %r14,%r13
+ movq %rbx,%rdi
+ rorxq $39,%rbx,%r12
+ leaq (%rax,%r13,1),%rax
+ xorq %rcx,%rdi
+ rorxq $34,%rbx,%r14
+ rorxq $28,%rbx,%r13
+ leaq (%r8,%rax,1),%r8
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rcx,%r15
+ xorq %r13,%r14
+ leaq (%rax,%r15,1),%rax
+ movq %r9,%r12
+ addq 0(%rsp),%r11
+ andq %r8,%r12
+ rorxq $41,%r8,%r13
+ rorxq $18,%r8,%r15
+ leaq (%rax,%r14,1),%rax
+ leaq (%r11,%r12,1),%r11
+ andnq %r10,%r8,%r12
+ xorq %r15,%r13
+ rorxq $14,%r8,%r14
+ leaq (%r11,%r12,1),%r11
+ xorq %r14,%r13
+ movq %rax,%r15
+ rorxq $39,%rax,%r12
+ leaq (%r11,%r13,1),%r11
+ xorq %rbx,%r15
+ rorxq $34,%rax,%r14
+ rorxq $28,%rax,%r13
+ leaq (%rdx,%r11,1),%rdx
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rbx,%rdi
+ xorq %r13,%r14
+ leaq (%r11,%rdi,1),%r11
+ movq %r8,%r12
+ addq 8(%rsp),%r10
+ andq %rdx,%r12
+ rorxq $41,%rdx,%r13
+ rorxq $18,%rdx,%rdi
+ leaq (%r11,%r14,1),%r11
+ leaq (%r10,%r12,1),%r10
+ andnq %r9,%rdx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rdx,%r14
+ leaq (%r10,%r12,1),%r10
+ xorq %r14,%r13
+ movq %r11,%rdi
+ rorxq $39,%r11,%r12
+ leaq (%r10,%r13,1),%r10
+ xorq %rax,%rdi
+ rorxq $34,%r11,%r14
+ rorxq $28,%r11,%r13
+ leaq (%rcx,%r10,1),%rcx
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rax,%r15
+ xorq %r13,%r14
+ leaq (%r10,%r15,1),%r10
+ movq %rdx,%r12
+ addq 32(%rsp),%r9
+ andq %rcx,%r12
+ rorxq $41,%rcx,%r13
+ rorxq $18,%rcx,%r15
+ leaq (%r10,%r14,1),%r10
+ leaq (%r9,%r12,1),%r9
+ andnq %r8,%rcx,%r12
+ xorq %r15,%r13
+ rorxq $14,%rcx,%r14
+ leaq (%r9,%r12,1),%r9
+ xorq %r14,%r13
+ movq %r10,%r15
+ rorxq $39,%r10,%r12
+ leaq (%r9,%r13,1),%r9
+ xorq %r11,%r15
+ rorxq $34,%r10,%r14
+ rorxq $28,%r10,%r13
+ leaq (%rbx,%r9,1),%rbx
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r11,%rdi
+ xorq %r13,%r14
+ leaq (%r9,%rdi,1),%r9
+ movq %rcx,%r12
+ addq 40(%rsp),%r8
+ andq %rbx,%r12
+ rorxq $41,%rbx,%r13
+ rorxq $18,%rbx,%rdi
+ leaq (%r9,%r14,1),%r9
+ leaq (%r8,%r12,1),%r8
+ andnq %rdx,%rbx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rbx,%r14
+ leaq (%r8,%r12,1),%r8
+ xorq %r14,%r13
+ movq %r9,%rdi
+ rorxq $39,%r9,%r12
+ leaq (%r8,%r13,1),%r8
+ xorq %r10,%rdi
+ rorxq $34,%r9,%r14
+ rorxq $28,%r9,%r13
+ leaq (%rax,%r8,1),%rax
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r10,%r15
+ xorq %r13,%r14
+ leaq (%r8,%r15,1),%r8
+ movq %rbx,%r12
+ addq 64(%rsp),%rdx
+ andq %rax,%r12
+ rorxq $41,%rax,%r13
+ rorxq $18,%rax,%r15
+ leaq (%r8,%r14,1),%r8
+ leaq (%rdx,%r12,1),%rdx
+ andnq %rcx,%rax,%r12
+ xorq %r15,%r13
+ rorxq $14,%rax,%r14
+ leaq (%rdx,%r12,1),%rdx
+ xorq %r14,%r13
+ movq %r8,%r15
+ rorxq $39,%r8,%r12
+ leaq (%rdx,%r13,1),%rdx
+ xorq %r9,%r15
+ rorxq $34,%r8,%r14
+ rorxq $28,%r8,%r13
+ leaq (%r11,%rdx,1),%r11
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r9,%rdi
+ xorq %r13,%r14
+ leaq (%rdx,%rdi,1),%rdx
+ movq %rax,%r12
+ addq 72(%rsp),%rcx
+ andq %r11,%r12
+ rorxq $41,%r11,%r13
+ rorxq $18,%r11,%rdi
+ leaq (%rdx,%r14,1),%rdx
+ leaq (%rcx,%r12,1),%rcx
+ andnq %rbx,%r11,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r11,%r14
+ leaq (%rcx,%r12,1),%rcx
+ xorq %r14,%r13
+ movq %rdx,%rdi
+ rorxq $39,%rdx,%r12
+ leaq (%rcx,%r13,1),%rcx
+ xorq %r8,%rdi
+ rorxq $34,%rdx,%r14
+ rorxq $28,%rdx,%r13
+ leaq (%r10,%rcx,1),%r10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r8,%r15
+ xorq %r13,%r14
+ leaq (%rcx,%r15,1),%rcx
+ movq %r11,%r12
+ addq 96(%rsp),%rbx
+ andq %r10,%r12
+ rorxq $41,%r10,%r13
+ rorxq $18,%r10,%r15
+ leaq (%rcx,%r14,1),%rcx
+ leaq (%rbx,%r12,1),%rbx
+ andnq %rax,%r10,%r12
+ xorq %r15,%r13
+ rorxq $14,%r10,%r14
+ leaq (%rbx,%r12,1),%rbx
+ xorq %r14,%r13
+ movq %rcx,%r15
+ rorxq $39,%rcx,%r12
+ leaq (%rbx,%r13,1),%rbx
+ xorq %rdx,%r15
+ rorxq $34,%rcx,%r14
+ rorxq $28,%rcx,%r13
+ leaq (%r9,%rbx,1),%r9
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rdx,%rdi
+ xorq %r13,%r14
+ leaq (%rbx,%rdi,1),%rbx
+ movq %r10,%r12
+ addq 104(%rsp),%rax
+ andq %r9,%r12
+ rorxq $41,%r9,%r13
+ rorxq $18,%r9,%rdi
+ leaq (%rbx,%r14,1),%rbx
+ leaq (%rax,%r12,1),%rax
+ andnq %r11,%r9,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r9,%r14
+ leaq (%rax,%r12,1),%rax
+ xorq %r14,%r13
+ movq %rbx,%rdi
+ rorxq $39,%rbx,%r12
+ leaq (%rax,%r13,1),%rax
+ xorq %rcx,%rdi
+ rorxq $34,%rbx,%r14
+ rorxq $28,%rbx,%r13
+ leaq (%r8,%rax,1),%r8
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rcx,%r15
+ xorq %r13,%r14
+ leaq (%rax,%r15,1),%rax
+ movq %r9,%r12
+ movq 1280(%rsp),%rdi /* reload the state pointer: 1280 = 1152 + 128, i.e. slot 128 relative to the %rbp computed just below */
+ addq %r14,%rax /* fold the term still pending in %r14 into %rax */
+
+ leaq 1152(%rsp),%rbp /* %rbp = %rsp + 1152; 144(%rbp) and 152(%rbp) are used below and at .Ldone_avx2 like the prologue slots 128+16(%rsp) and 152(%rsp) */
+
+ addq 0(%rdi),%rax /* add the stored state to the eight working registers */
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ movq %rax,0(%rdi) /* write the updated state back */
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+
+ cmpq 144(%rbp),%rsi /* %rsi (kept 128 ahead) == end pointer: the pair's second block was the stack placeholder chosen by cmoveq */
+ je .Ldone_avx2 /* so there is nothing more to hash */
+
+ xorq %r14,%r14 /* same round-entry setup as before the first block: %r14 = 0, %rdi = %rbx ^ %rcx, %r12 = %r9 */
+ movq %rbx,%rdi
+ xorq %rcx,%rdi
+ movq %r9,%r12
+ jmp .Lower_avx2
+.align 16
+.Lower_avx2: /* rounds for the second block: each addq reads the upper 16 bytes (+16) of a saved 32-byte entry; eight rounds per pass over the 128-byte slab at %rbp */
+ addq 0+16(%rbp),%r11
+ andq %r8,%r12
+ rorxq $41,%r8,%r13
+ rorxq $18,%r8,%r15
+ leaq (%rax,%r14,1),%rax
+ leaq (%r11,%r12,1),%r11
+ andnq %r10,%r8,%r12
+ xorq %r15,%r13
+ rorxq $14,%r8,%r14
+ leaq (%r11,%r12,1),%r11
+ xorq %r14,%r13
+ movq %rax,%r15
+ rorxq $39,%rax,%r12
+ leaq (%r11,%r13,1),%r11
+ xorq %rbx,%r15
+ rorxq $34,%rax,%r14
+ rorxq $28,%rax,%r13
+ leaq (%rdx,%r11,1),%rdx
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rbx,%rdi
+ xorq %r13,%r14
+ leaq (%r11,%rdi,1),%r11
+ movq %r8,%r12
+ addq 8+16(%rbp),%r10
+ andq %rdx,%r12
+ rorxq $41,%rdx,%r13
+ rorxq $18,%rdx,%rdi
+ leaq (%r11,%r14,1),%r11
+ leaq (%r10,%r12,1),%r10
+ andnq %r9,%rdx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rdx,%r14
+ leaq (%r10,%r12,1),%r10
+ xorq %r14,%r13
+ movq %r11,%rdi
+ rorxq $39,%r11,%r12
+ leaq (%r10,%r13,1),%r10
+ xorq %rax,%rdi
+ rorxq $34,%r11,%r14
+ rorxq $28,%r11,%r13
+ leaq (%rcx,%r10,1),%rcx
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rax,%r15
+ xorq %r13,%r14
+ leaq (%r10,%r15,1),%r10
+ movq %rdx,%r12
+ addq 32+16(%rbp),%r9
+ andq %rcx,%r12
+ rorxq $41,%rcx,%r13
+ rorxq $18,%rcx,%r15
+ leaq (%r10,%r14,1),%r10
+ leaq (%r9,%r12,1),%r9
+ andnq %r8,%rcx,%r12
+ xorq %r15,%r13
+ rorxq $14,%rcx,%r14
+ leaq (%r9,%r12,1),%r9
+ xorq %r14,%r13
+ movq %r10,%r15
+ rorxq $39,%r10,%r12
+ leaq (%r9,%r13,1),%r9
+ xorq %r11,%r15
+ rorxq $34,%r10,%r14
+ rorxq $28,%r10,%r13
+ leaq (%rbx,%r9,1),%rbx
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r11,%rdi
+ xorq %r13,%r14
+ leaq (%r9,%rdi,1),%r9
+ movq %rcx,%r12
+ addq 40+16(%rbp),%r8
+ andq %rbx,%r12
+ rorxq $41,%rbx,%r13
+ rorxq $18,%rbx,%rdi
+ leaq (%r9,%r14,1),%r9
+ leaq (%r8,%r12,1),%r8
+ andnq %rdx,%rbx,%r12
+ xorq %rdi,%r13
+ rorxq $14,%rbx,%r14
+ leaq (%r8,%r12,1),%r8
+ xorq %r14,%r13
+ movq %r9,%rdi
+ rorxq $39,%r9,%r12
+ leaq (%r8,%r13,1),%r8
+ xorq %r10,%rdi
+ rorxq $34,%r9,%r14
+ rorxq $28,%r9,%r13
+ leaq (%rax,%r8,1),%rax
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r10,%r15
+ xorq %r13,%r14
+ leaq (%r8,%r15,1),%r8
+ movq %rbx,%r12
+ addq 64+16(%rbp),%rdx
+ andq %rax,%r12
+ rorxq $41,%rax,%r13
+ rorxq $18,%rax,%r15
+ leaq (%r8,%r14,1),%r8
+ leaq (%rdx,%r12,1),%rdx
+ andnq %rcx,%rax,%r12
+ xorq %r15,%r13
+ rorxq $14,%rax,%r14
+ leaq (%rdx,%r12,1),%rdx
+ xorq %r14,%r13
+ movq %r8,%r15
+ rorxq $39,%r8,%r12
+ leaq (%rdx,%r13,1),%rdx
+ xorq %r9,%r15
+ rorxq $34,%r8,%r14
+ rorxq $28,%r8,%r13
+ leaq (%r11,%rdx,1),%r11
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %r9,%rdi
+ xorq %r13,%r14
+ leaq (%rdx,%rdi,1),%rdx
+ movq %rax,%r12
+ addq 72+16(%rbp),%rcx
+ andq %r11,%r12
+ rorxq $41,%r11,%r13
+ rorxq $18,%r11,%rdi
+ leaq (%rdx,%r14,1),%rdx
+ leaq (%rcx,%r12,1),%rcx
+ andnq %rbx,%r11,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r11,%r14
+ leaq (%rcx,%r12,1),%rcx
+ xorq %r14,%r13
+ movq %rdx,%rdi
+ rorxq $39,%rdx,%r12
+ leaq (%rcx,%r13,1),%rcx
+ xorq %r8,%rdi
+ rorxq $34,%rdx,%r14
+ rorxq $28,%rdx,%r13
+ leaq (%r10,%rcx,1),%r10
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %r8,%r15
+ xorq %r13,%r14
+ leaq (%rcx,%r15,1),%rcx
+ movq %r11,%r12
+ addq 96+16(%rbp),%rbx
+ andq %r10,%r12
+ rorxq $41,%r10,%r13
+ rorxq $18,%r10,%r15
+ leaq (%rcx,%r14,1),%rcx
+ leaq (%rbx,%r12,1),%rbx
+ andnq %rax,%r10,%r12
+ xorq %r15,%r13
+ rorxq $14,%r10,%r14
+ leaq (%rbx,%r12,1),%rbx
+ xorq %r14,%r13
+ movq %rcx,%r15
+ rorxq $39,%rcx,%r12
+ leaq (%rbx,%r13,1),%rbx
+ xorq %rdx,%r15
+ rorxq $34,%rcx,%r14
+ rorxq $28,%rcx,%r13
+ leaq (%r9,%rbx,1),%r9
+ andq %r15,%rdi
+ xorq %r12,%r14
+ xorq %rdx,%rdi
+ xorq %r13,%r14
+ leaq (%rbx,%rdi,1),%rbx
+ movq %r10,%r12
+ addq 104+16(%rbp),%rax
+ andq %r9,%r12
+ rorxq $41,%r9,%r13
+ rorxq $18,%r9,%rdi
+ leaq (%rbx,%r14,1),%rbx
+ leaq (%rax,%r12,1),%rax
+ andnq %r11,%r9,%r12
+ xorq %rdi,%r13
+ rorxq $14,%r9,%r14
+ leaq (%rax,%r12,1),%rax
+ xorq %r14,%r13
+ movq %rbx,%rdi
+ rorxq $39,%rbx,%r12
+ leaq (%rax,%r13,1),%rax
+ xorq %rcx,%rdi
+ rorxq $34,%rbx,%r14
+ rorxq $28,%rbx,%r13
+ leaq (%r8,%rax,1),%r8
+ andq %rdi,%r15
+ xorq %r12,%r14
+ xorq %rcx,%r15
+ xorq %r13,%r14
+ leaq (%rax,%r15,1),%rax
+ movq %r9,%r12
+ leaq -128(%rbp),%rbp /* step down to the next 128-byte slab of saved values */
+ cmpq %rsp,%rbp
+ jae .Lower_avx2 /* unsigned compare: keep going while %rbp >= %rsp */
+
+ movq 1280(%rsp),%rdi /* reload the state pointer (1280 = 1152 + 128) */
+ addq %r14,%rax /* fold the term still pending in %r14 into %rax */
+
+ leaq 1152(%rsp),%rsp /* raise %rsp by 1152 so the 128+N(%rsp) and 152(%rsp) slots written by the prologue are addressed directly again */
+
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+
+ addq 0(%rdi),%rax /* add the stored state to the eight working registers */
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ leaq 256(%rsi),%rsi /* two 128-byte blocks consumed (lea leaves the flags alone) */
+ addq 48(%rdi),%r10
+ movq %rsi,%r12 /* %r12 = second block of the next pair */
+ addq 56(%rdi),%r11
+ cmpq 128+16(%rsp),%rsi /* compare with the end pointer; these flags feed both cmoveq and jbe below, and the movq stores in between do not change them */
+
+ movq %rax,0(%rdi) /* write the updated state back */
+ cmoveq %rsp,%r12 /* exactly one block left: take the high-lane data from the stack frame instead of reading past the input */
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+
+ jbe .Loop_avx2 /* unsigned %rsi <= end: at least one block remains (%rsi is kept 128 ahead) */
+ leaq (%rsp),%rbp /* %rbp = %rsp so that .Ldone_avx2 can use 152(%rbp) on this path as well as on the early-exit path */
+
+
+.cfi_escape 0x0f,0x06,0x76,0x98,0x01,0x06,0x23,0x08
+
+.Ldone_avx2:
+ movq 152(%rbp),%rsi /* %rsi = entry %rsp saved by the prologue */
+.cfi_def_cfa %rsi,8
+ vzeroupper
+ movq -48(%rsi),%r15 /* reload the callee-saved registers from the slots the prologue pushed below the entry %rsp */
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp /* %rsp = entry %rsp, pointing at the return address */
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ .byte 0xf3,0xc3 /* hand-encoded "rep ret" */
+.cfi_endproc
+.size sha512_block_data_order_avx2,.-sha512_block_data_order_avx2
diff --git a/secure/lib/libcrypto/amd64/x25519-x86_64.S b/secure/lib/libcrypto/amd64/x25519-x86_64.S
index 28063bf95b00..7448e866aaf6 100644
--- a/secure/lib/libcrypto/amd64/x25519-x86_64.S
+++ b/secure/lib/libcrypto/amd64/x25519-x86_64.S
@@ -397,32 +397,408 @@ x25519_fe51_mul121666:
.Lfe51_mul121666_epilogue:
.cfi_endproc
.size x25519_fe51_mul121666,.-x25519_fe51_mul121666
+
.globl x25519_fe64_eligible
.type x25519_fe64_eligible,@function
.align 32
x25519_fe64_eligible:
.cfi_startproc
+ movl OPENSSL_ia32cap_P+8(%rip),%ecx /* ecx = 32-bit word at byte offset 8 of the OPENSSL_ia32cap_P capability vector */
xorl %eax,%eax
+ andl $0x80100,%ecx /* keep only bits 8 and 19 (presumably BMI2 and ADX, matching the mulx/adcx/adox used by the fe64 routines - confirm against the OPENSSL_ia32cap layout) */
+ cmpl $0x80100,%ecx /* are both bits set? */
+ cmovel %ecx,%eax /* both set: return non-zero (0x80100); otherwise eax keeps the 0 from the xor */
.byte 0xf3,0xc3
.cfi_endproc
.size x25519_fe64_eligible,.-x25519_fe64_eligible
.globl x25519_fe64_mul
.type x25519_fe64_mul,@function
-.globl x25519_fe64_sqr
-.globl x25519_fe64_mul121666
-.globl x25519_fe64_add
-.globl x25519_fe64_sub
-.globl x25519_fe64_tobytes
+.align 32 /* x25519_fe64_mul: multiplies two 4x64-bit limb values and leaves the 8-limb product in r8..r15 for .Lreduce64 */
x25519_fe64_mul:
+.cfi_startproc /* in: rdi = output pointer (saved on the stack), rsi = first operand, rdx = second operand; each operand is read as four 64-bit words */
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ pushq %rdi /* output pointer is kept at 16(%rsp) after the lea below; .Lreduce64 reloads it from there */
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rdi,-64
+ leaq -16(%rsp),%rsp /* 16 bytes of scratch; (%rsp) is used to spill b[2] */
+.cfi_adjust_cfa_offset 16
+.Lfe64_mul_body:
+ /* naming below: a[] = words at (%rsi), b[] = words at the original (%rdx) */
+ movq %rdx,%rax /* free rdx: it is the implicit multiplicand of mulx */
+ movq 0(%rdx),%rbp /* rbp = b[0] */
+ movq 0(%rsi),%rdx /* rdx = a[0] */
+ movq 8(%rax),%rcx /* rcx = b[1] */
+ movq 16(%rax),%r14 /* r14 = b[2] */
+ movq 24(%rax),%r15 /* r15 = b[3] */
+ /* row 0: a[0] * b[0..3] -> r8..r12 */
+ mulxq %rbp,%r8,%rax
+ xorl %edi,%edi /* rdi = 0 (used as a zero addend); also clears CF and OF so both carry chains start clean */
+ mulxq %rcx,%r9,%rbx
+ adcxq %rax,%r9
+ mulxq %r14,%r10,%rax
+ adcxq %rbx,%r10
+ mulxq %r15,%r11,%r12
+ movq 8(%rsi),%rdx /* rdx = a[1] */
+ adcxq %rax,%r11
+ movq %r14,(%rsp) /* spill b[2]: r14 is overwritten as an accumulator in row 2 */
+ adcxq %rdi,%r12
+ /* row 1: a[1] * b[0..3]; low halves ride the OF chain (adox), high halves the CF chain (adcx) */
+ mulxq %rbp,%rax,%rbx
+ adoxq %rax,%r9
+ adcxq %rbx,%r10
+ mulxq %rcx,%rax,%rbx
+ adoxq %rax,%r10
+ adcxq %rbx,%r11
+ mulxq %r14,%rax,%rbx
+ adoxq %rax,%r11
+ adcxq %rbx,%r12
+ mulxq %r15,%rax,%r13
+ movq 16(%rsi),%rdx /* rdx = a[2] */
+ adoxq %rax,%r12
+ adcxq %rdi,%r13
+ adoxq %rdi,%r13
+ /* row 2: a[2] * b[0..3]; the roles of the two carry chains are swapped relative to row 1 */
+ mulxq %rbp,%rax,%rbx
+ adcxq %rax,%r10
+ adoxq %rbx,%r11
+ mulxq %rcx,%rax,%rbx
+ adcxq %rax,%r11
+ adoxq %rbx,%r12
+ mulxq %r14,%rax,%rbx
+ adcxq %rax,%r12
+ adoxq %rbx,%r13
+ mulxq %r15,%rax,%r14 /* r14 stops being b[2] here; row 3 reads b[2] from (%rsp) */
+ movq 24(%rsi),%rdx /* rdx = a[3] */
+ adcxq %rax,%r13
+ adoxq %rdi,%r14
+ adcxq %rdi,%r14
+ /* row 3: a[3] * b[0..3] */
+ mulxq %rbp,%rax,%rbx
+ adoxq %rax,%r11
+ adcxq %rbx,%r12
+ mulxq %rcx,%rax,%rbx
+ adoxq %rax,%r12
+ adcxq %rbx,%r13
+ mulxq (%rsp),%rax,%rbx /* b[2] from its spill slot */
+ adoxq %rax,%r13
+ adcxq %rbx,%r14
+ mulxq %r15,%rax,%r15
+ movl $38,%edx /* multiplier consumed by .Lreduce64 (presumably because 2^256 = 38 mod 2^255-19) */
+ adoxq %rax,%r14
+ adcxq %rdi,%r15
+ adoxq %rdi,%r15
+ /* r8..r15 now hold the 512-bit product; rdi is still 0 and rdx = 38 as .Lreduce64 expects */
+ jmp .Lreduce64 /* shared tail located inside x25519_fe64_sqr: reduces, stores the result, restores registers and returns */
+.Lfe64_mul_epilogue:
+.cfi_endproc
+.size x25519_fe64_mul,.-x25519_fe64_mul
+
+.globl x25519_fe64_sqr
+.type x25519_fe64_sqr,@function
+.align 32 /* x25519_fe64_sqr: squares a 4x64-bit limb value; also hosts .Lreduce64, the reduction/epilogue shared with x25519_fe64_mul */
x25519_fe64_sqr:
+.cfi_startproc /* in: rdi = output pointer (saved on the stack), rsi = operand read as four 64-bit words */
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ pushq %rdi /* output pointer ends up at 16(%rsp); .Lreduce64 reloads it from there */
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rdi,-64
+ leaq -16(%rsp),%rsp /* same frame shape as x25519_fe64_mul so both can share .Lreduce64 (CFA = rsp+80 from here on) */
+.cfi_adjust_cfa_offset 16
+.Lfe64_sqr_body:
+ /* naming below: a[] = the four words at (%rsi) */
+ movq 0(%rsi),%rdx /* rdx = a[0] */
+ movq 8(%rsi),%rcx /* rcx = a[1] */
+ movq 16(%rsi),%rbp /* rbp = a[2] */
+ movq 24(%rsi),%rsi /* rsi = a[3]; the input pointer is no longer needed */
+
+ /* cross products a[i]*a[j], i<j, accumulated once into r9..r14 */
+ mulxq %rdx,%r8,%r15 /* r15:r8 = a[0]^2 */
+ mulxq %rcx,%r9,%rax
+ xorl %edi,%edi /* rdi = 0 (zero addend); also clears CF and OF for the adcx/adox chains */
+ mulxq %rbp,%r10,%rbx
+ adcxq %rax,%r10
+ mulxq %rsi,%r11,%r12
+ movq %rcx,%rdx /* rdx = a[1] */
+ adcxq %rbx,%r11
+ adcxq %rdi,%r12
+
+
+ mulxq %rbp,%rax,%rbx
+ adoxq %rax,%r11
+ adcxq %rbx,%r12
+ mulxq %rsi,%rax,%r13
+ movq %rbp,%rdx /* rdx = a[2] */
+ adoxq %rax,%r12
+ adcxq %rdi,%r13
+
+
+ mulxq %rsi,%rax,%r14
+ movq %rcx,%rdx /* rdx = a[1], ready for a[1]^2 below */
+ adoxq %rax,%r13
+ adcxq %rdi,%r14
+ adoxq %rdi,%r14
+ /* double the cross products on the CF chain (adcx r,r) while adding the squares a[i]^2 on the OF chain (adox) */
+ adcxq %r9,%r9
+ adoxq %r15,%r9
+ adcxq %r10,%r10
+ mulxq %rdx,%rax,%rbx /* a[1]^2 */
+ movq %rbp,%rdx
+ adcxq %r11,%r11
+ adoxq %rax,%r10
+ adcxq %r12,%r12
+ adoxq %rbx,%r11
+ mulxq %rdx,%rax,%rbx /* a[2]^2 */
+ movq %rsi,%rdx
+ adcxq %r13,%r13
+ adoxq %rax,%r12
+ adcxq %r14,%r14
+ adoxq %rbx,%r13
+ mulxq %rdx,%rax,%r15 /* a[3]^2 */
+ movl $38,%edx /* multiplier consumed by .Lreduce64 (presumably because 2^256 = 38 mod 2^255-19) */
+ adoxq %rax,%r14
+ adcxq %rdi,%r15
+ adoxq %rdi,%r15
+ jmp .Lreduce64
+ /* .Lreduce64 expects: r8..r15 = 8-limb value, rdx = 38, rdi = 0, frame as built by the prologue above */
+.align 32
+.Lreduce64:
+ mulxq %r12,%rax,%rbx /* fold the upper four limbs (r12..r15), each times 38, into the lower four (r8..r11) */
+ adcxq %rax,%r8
+ adoxq %rbx,%r9
+ mulxq %r13,%rax,%rbx
+ adcxq %rax,%r9
+ adoxq %rbx,%r10
+ mulxq %r14,%rax,%rbx
+ adcxq %rax,%r10
+ adoxq %rbx,%r11
+ mulxq %r15,%rax,%r12 /* r12 becomes the overflow limb of the folded sum */
+ adcxq %rax,%r11
+ adoxq %rdi,%r12
+ adcxq %rdi,%r12
+
+ movq 16(%rsp),%rdi /* reload the output pointer pushed by the prologue */
+ imulq %rdx,%r12 /* r12 = overflow limb * 38 */
+
+ addq %r12,%r8 /* second fold */
+ adcq $0,%r9
+ adcq $0,%r10
+ adcq $0,%r11
+
+ sbbq %rax,%rax /* rax = -1 if that addition carried out of r11, else 0 */
+ andq $38,%rax /* ... turned into 38 or 0 without branching */
+
+ addq %rax,%r8 /* fold the last carry back in */
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r8,0(%rdi)
+ /* restore callee-saved registers from the slots written by the prologue pushes */
+ movq 24(%rsp),%r15
+.cfi_restore %r15
+ movq 32(%rsp),%r14
+.cfi_restore %r14
+ movq 40(%rsp),%r13
+.cfi_restore %r13
+ movq 48(%rsp),%r12
+.cfi_restore %r12
+ movq 56(%rsp),%rbx
+.cfi_restore %rbx
+ movq 64(%rsp),%rbp
+.cfi_restore %rbp
+ leaq 72(%rsp),%rsp /* release 16 bytes of scratch plus the seven pushed slots */
+.cfi_adjust_cfa_offset -72 /* CFA offset goes 80 -> 8, matching the 72 bytes just released (was +88, which mis-described the frame at the return instruction) */
+.Lfe64_sqr_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size x25519_fe64_sqr,.-x25519_fe64_sqr
+
+.globl x25519_fe64_mul121666
+.type x25519_fe64_mul121666,@function
+.align 32 /* x25519_fe64_mul121666: multiplies a 4x64-bit limb value by the constant 121666 and stores the folded 4-limb result */
x25519_fe64_mul121666:
+.Lfe64_mul121666_body:
+.cfi_startproc /* in: rdi = output pointer, rsi = operand; leaf routine, uses only rax, rcx, rdx, r8-r11 and no stack */
+ movl $121666,%edx /* implicit mulx multiplicand */
+ mulxq 0(%rsi),%r8,%rcx
+ mulxq 8(%rsi),%r9,%rax
+ addq %rcx,%r9 /* chain each high half into the next low half */
+ mulxq 16(%rsi),%r10,%rcx
+ adcq %rax,%r10
+ mulxq 24(%rsi),%r11,%rax
+ adcq %rcx,%r11
+ adcq $0,%rax /* rax = fifth (overflow) limb of the product */
+
+ imulq $38,%rax,%rax /* fold the overflow limb back into the low end, times 38 */
+
+ addq %rax,%r8
+ adcq $0,%r9
+ adcq $0,%r10
+ adcq $0,%r11
+
+ sbbq %rax,%rax /* rax = -1 if the fold carried out of r11, else 0 */
+ andq $38,%rax /* ... turned into 38 or 0 without branching */
+
+ addq %rax,%r8 /* fold that final carry in */
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r8,0(%rdi)
+
+.Lfe64_mul121666_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size x25519_fe64_mul121666,.-x25519_fe64_mul121666
+
+.globl x25519_fe64_add
+.type x25519_fe64_add,@function
+.align 32 /* x25519_fe64_add: adds two 4x64-bit limb values, folding any carry out of the top limb back in as +38 */
x25519_fe64_add:
+.Lfe64_add_body:
+.cfi_startproc /* in: rdi = output pointer, rsi = first operand, rdx = second operand; leaf routine, no stack use */
+ movq 0(%rsi),%r8
+ movq 8(%rsi),%r9
+ movq 16(%rsi),%r10
+ movq 24(%rsi),%r11
+
+ addq 0(%rdx),%r8 /* 256-bit add with carry propagation */
+ adcq 8(%rdx),%r9
+ adcq 16(%rdx),%r10
+ adcq 24(%rdx),%r11
+
+ sbbq %rax,%rax /* rax = -1 if the add carried out of r11, else 0 */
+ andq $38,%rax /* ... turned into 38 or 0 without branching */
+
+ addq %rax,%r8 /* first fold; stores are interleaved with the remaining carry propagation */
+ adcq $0,%r9
+ adcq $0,%r10
+ movq %r9,8(%rdi)
+ adcq $0,%r11
+ movq %r10,16(%rdi)
+ sbbq %rax,%rax /* the fold itself may carry out: capture that as a mask too (mov does not touch flags) */
+ movq %r11,24(%rdi)
+ andq $38,%rax
+
+ addq %rax,%r8 /* second fold */
+ movq %r8,0(%rdi)
+
+.Lfe64_add_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size x25519_fe64_add,.-x25519_fe64_add
+
+.globl x25519_fe64_sub
+.type x25519_fe64_sub,@function
+.align 32 /* x25519_fe64_sub: subtracts two 4x64-bit limb values, folding any borrow out of the top limb back in as -38 */
x25519_fe64_sub:
+.Lfe64_sub_body:
+.cfi_startproc /* in: rdi = output pointer, rsi = minuend, rdx = subtrahend; leaf routine, no stack use */
+ movq 0(%rsi),%r8
+ movq 8(%rsi),%r9
+ movq 16(%rsi),%r10
+ movq 24(%rsi),%r11
+
+ subq 0(%rdx),%r8 /* 256-bit subtract with borrow propagation */
+ sbbq 8(%rdx),%r9
+ sbbq 16(%rdx),%r10
+ sbbq 24(%rdx),%r11
+
+ sbbq %rax,%rax /* rax = -1 if the subtract borrowed out of r11, else 0 */
+ andq $38,%rax /* ... turned into 38 or 0 without branching */
+
+ subq %rax,%r8 /* first fold; stores are interleaved with the remaining borrow propagation */
+ sbbq $0,%r9
+ sbbq $0,%r10
+ movq %r9,8(%rdi)
+ sbbq $0,%r11
+ movq %r10,16(%rdi)
+ sbbq %rax,%rax /* the fold itself may borrow: capture that as a mask too (mov does not touch flags) */
+ movq %r11,24(%rdi)
+ andq $38,%rax
+
+ subq %rax,%r8 /* second fold */
+ movq %r8,0(%rdi)
+
+.Lfe64_sub_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size x25519_fe64_sub,.-x25519_fe64_sub
+
+.globl x25519_fe64_tobytes
+.type x25519_fe64_tobytes,@function
+.align 32 /* x25519_fe64_tobytes: branch-free final reduction of a 4x64-bit limb value (in: rsi) into 32 output bytes (out: rdi); bit 255 of the output is always clear */
x25519_fe64_tobytes:
+.Lfe64_to_body:
.cfi_startproc
-.byte 0x0f,0x0b
+ movq 0(%rsi),%r8
+ movq 8(%rsi),%r9
+ movq 16(%rsi),%r10
+ movq 24(%rsi),%r11
+
+ /* step 1: clear bit 255 and add 19, plus another 19 if bit 255 was set */
+ leaq (%r11,%r11,1),%rax /* rax = top limb shifted left by one (drops bit 63) */
+ sarq $63,%r11 /* r11 = all-ones if bit 255 was set, else 0 */
+ shrq $1,%rax /* rax = top limb with bit 63 cleared */
+ andq $19,%r11
+ addq $19,%r11 /* r11 = 38 if bit 255 was set, else 19 */
+
+ addq %r11,%r8
+ adcq $0,%r9
+ adcq $0,%r10
+ adcq $0,%rax
+ /* step 2: clear bit 255 of the sum; if it was NOT set, take the 19 back out */
+ leaq (%rax,%rax,1),%r11
+ sarq $63,%rax /* rax = all-ones if the sum reached bit 255, else 0 */
+ shrq $1,%r11 /* r11 = top limb of the sum with bit 63 cleared */
+ notq %rax
+ andq $19,%rax /* rax = 19 if bit 255 of the sum was clear, else 0 */
+
+ subq %rax,%r8
+ sbbq $0,%r9
+ sbbq $0,%r10
+ sbbq $0,%r11
+
+ movq %r8,0(%rdi)
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+
+.Lfe64_to_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
-.size x25519_fe64_mul,.-x25519_fe64_mul
+.size x25519_fe64_tobytes,.-x25519_fe64_tobytes
.byte 88,50,53,53,49,57,32,112,114,105,109,105,116,105,118,101,115,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
diff --git a/secure/lib/libcrypto/amd64/x86_64-mont.S b/secure/lib/libcrypto/amd64/x86_64-mont.S
index 2fd4d2f46006..015a87c446b7 100644
--- a/secure/lib/libcrypto/amd64/x86_64-mont.S
+++ b/secure/lib/libcrypto/amd64/x86_64-mont.S
@@ -16,6 +16,7 @@ bn_mul_mont:
jnz .Lmul_enter
cmpl $8,%r9d
jb .Lmul_enter
+ movl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpq %rsi,%rdx
jne .Lmul4x_enter
testl $7,%r9d
@@ -264,6 +265,9 @@ bn_mul4x_mont:
movq %rsp,%rax
.cfi_def_cfa_register %rax
.Lmul4x_enter:
+ andl $0x80100,%r11d
+ cmpl $0x80100,%r11d
+ je .Lmulx4x_enter
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
@@ -689,6 +693,7 @@ bn_mul4x_mont:
.size bn_mul4x_mont,.-bn_mul4x_mont
+
.type bn_sqr8x_mont,@function
.align 32
bn_sqr8x_mont:
@@ -770,6 +775,25 @@ bn_sqr8x_mont:
pxor %xmm0,%xmm0
.byte 102,72,15,110,207
.byte 102,73,15,110,218
+ movl OPENSSL_ia32cap_P+8(%rip),%eax
+ andl $0x80100,%eax
+ cmpl $0x80100,%eax
+ jne .Lsqr8x_nox
+
+ call bn_sqrx8x_internal
+
+
+
+
+ leaq (%r8,%rcx,1),%rbx
+ movq %rcx,%r9
+ movq %rcx,%rdx
+.byte 102,72,15,126,207
+ sarq $3+2,%rcx
+ jmp .Lsqr8x_sub
+
+.align 32
+.Lsqr8x_nox:
call bn_sqr8x_internal
@@ -857,5 +881,361 @@ bn_sqr8x_mont:
.byte 0xf3,0xc3
.cfi_endproc
.size bn_sqr8x_mont,.-bn_sqr8x_mont
+.type bn_mulx4x_mont,@function /* MULX/ADCX/ADOX flavour of the 4-words-per-step Montgomery multiply; returns 1 in rax */
+.align 32
+bn_mulx4x_mont:
+.cfi_startproc
+ movq %rsp,%rax /* rax = stack pointer at entry; CFA base until it is parked at 40(%rsp) */
+.cfi_def_cfa_register %rax
+.Lmulx4x_enter: /* also reached by the `je .Lmulx4x_enter` added to bn_mul4x_mont in this diff, with rax already holding the entry rsp */
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+.Lmulx4x_prologue:
+ /* register use visible below: rdi = result, rsi = multiplicand words, rdx = multiplier words, rcx = words subtracted at the end (presumably the modulus), r8 = pointer to a 64-bit constant, r9d = word count */
+ shll $3,%r9d /* r9 = word count * 8 (length in bytes) */
+ xorq %r10,%r10
+ subq %r9,%r10 /* r10 = -length */
+ movq (%r8),%r8 /* r8 = the 64-bit constant itself (presumably Montgomery n0 - confirm with caller) */
+ leaq -72(%rsp,%r10,1),%rbp /* rbp = rsp - 72 - length: requested bottom of the frame */
+ andq $-128,%rbp /* align it down to 128 bytes */
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp /* rsp = rbp + a multiple of 4096, no higher than the old rsp */
+ movq (%rsp),%r10 /* touch the page */
+ cmpq %rbp,%rsp
+ ja .Lmulx4x_page_walk
+ jmp .Lmulx4x_page_walk_done
+
+.align 16
+.Lmulx4x_page_walk: /* step down one 4096-byte page at a time, reading each, until rsp reaches rbp */
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
+
+ leaq (%rdx,%r9,1),%r10 /* r10 = end of the multiplier array */
+ /* frame layout (established by the stores below and at .Lmulx4x_body): */
+ /*   0(%rsp)  length in bytes */
+ /*   8(%rsp)  pointer to the next multiplier word */
+ /*  16(%rsp)  end of the multiplier array */
+ /*  24(%rsp)  64-bit constant loaded from (%r8) */
+ /*  32(%rsp)  result pointer (rdi) */
+ /*  40(%rsp)  stack pointer at entry (rax) */
+ /*  48(%rsp)  inner-loop count = length/32 - 1 */
+ /*  64(%rsp)  start of the temporary result vector */
+
+
+
+ movq %r9,0(%rsp)
+ shrq $5,%r9
+ movq %r10,16(%rsp)
+ subq $1,%r9
+ movq %r8,24(%rsp)
+ movq %rdi,32(%rsp)
+ movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 /* DW_CFA_def_cfa_expression: CFA = *(rsp+40) + 8 */
+ movq %r9,48(%rsp)
+ jmp .Lmulx4x_body
+
+.align 32
+.Lmulx4x_body: /* first pass: first multiplier word times the first four multiplicand words */
+ leaq 8(%rdx),%rdi
+ movq (%rdx),%rdx /* rdx = first multiplier word (implicit mulx multiplicand) */
+ leaq 64+32(%rsp),%rbx /* rbx runs 32 bytes ahead of the temporary-vector slot being written */
+ movq %rdx,%r9 /* r9 keeps the multiplier word while rdx alternates with the reduction factor in r8 */
+
+ mulxq 0(%rsi),%r8,%rax
+ mulxq 8(%rsi),%r11,%r14
+ addq %rax,%r11
+ movq %rdi,8(%rsp)
+ mulxq 16(%rsi),%r12,%r13
+ adcq %r14,%r12
+ adcq $0,%r13
+
+ movq %r8,%rdi
+ imulq 24(%rsp),%r8 /* r8 = low product word * constant at 24(%rsp): the per-pass reduction factor */
+ xorq %rbp,%rbp /* rbp = 0; also clears CF and OF for the adcx/adox chains */
+
+ mulxq 24(%rsi),%rax,%r14
+ movq %r8,%rdx
+ leaq 32(%rsi),%rsi
+ adcxq %rax,%r13
+ adcxq %rbp,%r14
+
+ mulxq 0(%rcx),%rax,%r10
+ adcxq %rax,%rdi
+ adoxq %r11,%r10
+ mulxq 8(%rcx),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 /* hand-encoded mulxq 16(%rcx),%rax,%r12 using a 32-bit displacement */
+ movq 48(%rsp),%rdi /* rdi = inner-loop count */
+ movq %r10,-32(%rbx)
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r11,-24(%rbx)
+ adcxq %rax,%r12
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r12,-16(%rbx)
+
+ jmp .Lmulx4x_1st
+
+.align 32
+.Lmulx4x_1st: /* remaining 4-word groups of the first pass; rdi counts groups */
+ adcxq %rbp,%r15
+ mulxq 0(%rsi),%r10,%rax
+ adcxq %r14,%r10
+ mulxq 8(%rsi),%r11,%r14
+ adcxq %rax,%r11
+ mulxq 16(%rsi),%r12,%rax
+ adcxq %r14,%r12
+ mulxq 24(%rsi),%r13,%r14
+.byte 0x67,0x67 /* two address-size prefixes glued onto the next instruction (presumably padding for code alignment) */
+ movq %r8,%rdx
+ adcxq %rax,%r13
+ adcxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ leaq 32(%rbx),%rbx
+
+ adoxq %r15,%r10
+ mulxq 0(%rcx),%rax,%r15
+ adcxq %rax,%r10
+ adoxq %r15,%r11
+ mulxq 8(%rcx),%rax,%r15
+ adcxq %rax,%r11
+ adoxq %r15,%r12
+ mulxq 16(%rcx),%rax,%r15
+ movq %r10,-40(%rbx)
+ adcxq %rax,%r12
+ movq %r11,-32(%rbx)
+ adoxq %r15,%r13
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r12,-24(%rbx)
+ adcxq %rax,%r13
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r13,-16(%rbx)
+
+ decq %rdi
+ jnz .Lmulx4x_1st
+
+ movq 0(%rsp),%rax /* rax = length in bytes */
+ movq 8(%rsp),%rdi /* rdi = pointer to the next multiplier word */
+ adcq %rbp,%r15
+ addq %r15,%r14
+ sbbq %r15,%r15 /* r15 = 0 or -1: carry out of this pass, stored at the top of the next one */
+ movq %r14,-8(%rbx)
+ jmp .Lmulx4x_outer
+
+.align 32
+.Lmulx4x_outer: /* one pass per remaining multiplier word, accumulating into the temporary vector */
+ movq (%rdi),%rdx
+ leaq 8(%rdi),%rdi
+ subq %rax,%rsi /* rewind the multiplicand pointer by the operand length */
+ movq %r15,(%rbx)
+ leaq 64+32(%rsp),%rbx
+ subq %rax,%rcx /* rewind the rcx array pointer likewise */
+
+ mulxq 0(%rsi),%r8,%r11
+ xorl %ebp,%ebp /* rbp = 0; clears CF and OF */
+ movq %rdx,%r9
+ mulxq 8(%rsi),%r14,%r12
+ adoxq -32(%rbx),%r8
+ adcxq %r14,%r11
+ mulxq 16(%rsi),%r15,%r13
+ adoxq -24(%rbx),%r11
+ adcxq %r15,%r12
+ adoxq -16(%rbx),%r12
+ adcxq %rbp,%r13
+ adoxq %rbp,%r13
+
+ movq %rdi,8(%rsp)
+ movq %r8,%r15
+ imulq 24(%rsp),%r8 /* reduction factor for this pass */
+ xorl %ebp,%ebp
+
+ mulxq 24(%rsi),%rax,%r14
+ movq %r8,%rdx
+ adcxq %rax,%r13
+ adoxq -8(%rbx),%r13
+ adcxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ adoxq %rbp,%r14
+
+ mulxq 0(%rcx),%rax,%r10
+ adcxq %rax,%r15
+ adoxq %r11,%r10
+ mulxq 8(%rcx),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+ mulxq 16(%rcx),%rax,%r12
+ movq %r10,-32(%rbx)
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r11,-24(%rbx)
+ leaq 32(%rcx),%rcx
+ adcxq %rax,%r12
+ adoxq %rbp,%r15
+ movq 48(%rsp),%rdi
+ movq %r12,-16(%rbx)
+
+ jmp .Lmulx4x_inner
+
+.align 32
+.Lmulx4x_inner: /* remaining 4-word groups of this pass; previous contents of the temporary vector are added in via adcx from (%rbx) */
+ mulxq 0(%rsi),%r10,%rax
+ adcxq %rbp,%r15
+ adoxq %r14,%r10
+ mulxq 8(%rsi),%r11,%r14
+ adcxq 0(%rbx),%r10
+ adoxq %rax,%r11
+ mulxq 16(%rsi),%r12,%rax
+ adcxq 8(%rbx),%r11
+ adoxq %r14,%r12
+ mulxq 24(%rsi),%r13,%r14
+ movq %r8,%rdx
+ adcxq 16(%rbx),%r12
+ adoxq %rax,%r13
+ adcxq 24(%rbx),%r13
+ adoxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ leaq 32(%rbx),%rbx
+ adcxq %rbp,%r14
+
+ adoxq %r15,%r10
+ mulxq 0(%rcx),%rax,%r15
+ adcxq %rax,%r10
+ adoxq %r15,%r11
+ mulxq 8(%rcx),%rax,%r15
+ adcxq %rax,%r11
+ adoxq %r15,%r12
+ mulxq 16(%rcx),%rax,%r15
+ movq %r10,-40(%rbx)
+ adcxq %rax,%r12
+ adoxq %r15,%r13
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r11,-32(%rbx)
+ movq %r12,-24(%rbx)
+ adcxq %rax,%r13
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r13,-16(%rbx)
+
+ decq %rdi
+ jnz .Lmulx4x_inner
+
+ movq 0(%rsp),%rax
+ movq 8(%rsp),%rdi
+ adcq %rbp,%r15
+ subq 0(%rbx),%rbp /* rbp is 0 here, so this sets CF exactly when the word at (%rbx) is non-zero; rbp is re-zeroed before its next use */
+ adcq %r15,%r14
+ sbbq %r15,%r15
+ movq %r14,-8(%rbx)
+
+ cmpq 16(%rsp),%rdi /* all multiplier words consumed? */
+ jne .Lmulx4x_outer
+
+ leaq 64(%rsp),%rbx /* rbx = start of the temporary vector */
+ subq %rax,%rcx
+ negq %r15
+ movq %rax,%rdx /* rdx = length in bytes (used as the byte counter in the copy loop) */
+ shrq $3+2,%rax /* rax = number of 4-word groups */
+ movq 32(%rsp),%rdi /* rdi = result pointer */
+ jmp .Lmulx4x_sub
+
+.align 32
+.Lmulx4x_sub: /* result = temporary vector - words at rcx, four words per iteration; CF carries the borrow across iterations (lea/mov/dec leave CF alone) */
+ movq 0(%rbx),%r11
+ movq 8(%rbx),%r12
+ movq 16(%rbx),%r13
+ movq 24(%rbx),%r14
+ leaq 32(%rbx),%rbx
+ sbbq 0(%rcx),%r11
+ sbbq 8(%rcx),%r12
+ sbbq 16(%rcx),%r13
+ sbbq 24(%rcx),%r14
+ leaq 32(%rcx),%rcx
+ movq %r11,0(%rdi)
+ movq %r12,8(%rdi)
+ movq %r13,16(%rdi)
+ movq %r14,24(%rdi)
+ leaq 32(%rdi),%rdi
+ decq %rax
+ jnz .Lmulx4x_sub
+
+ sbbq $0,%r15 /* fold the final borrow into r15; its low dword becomes the selection mask below */
+ leaq 64(%rsp),%rbx
+ subq %rdx,%rdi /* rewind the result pointer */
+
+.byte 102,73,15,110,207 /* hand-encoded movq %r15,%xmm1 */
+ pxor %xmm0,%xmm0
+ pshufd $0,%xmm1,%xmm1 /* broadcast the low dword of r15 to all four lanes */
+ movq 40(%rsp),%rsi /* rsi = stack pointer at entry */
+.cfi_def_cfa %rsi,8
+ jmp .Lmulx4x_cond_copy
+
+.align 32
+.Lmulx4x_cond_copy: /* branch-free select: result = (temporary AND xmm1) OR (result AND xmm0), where xmm0 is xmm1 compared against zero; the temporary vector is zeroed as it is read */
+ movdqa 0(%rbx),%xmm2
+ movdqa 16(%rbx),%xmm3
+ leaq 32(%rbx),%rbx
+ movdqu 0(%rdi),%xmm4
+ movdqu 16(%rdi),%xmm5
+ leaq 32(%rdi),%rdi
+ movdqa %xmm0,-32(%rbx) /* xmm0 is zero at this point: wipe the temporary */
+ movdqa %xmm0,-16(%rbx)
+ pcmpeqd %xmm1,%xmm0 /* xmm0 = all-ones in lanes where xmm1 is zero */
+ pand %xmm1,%xmm2
+ pand %xmm1,%xmm3
+ pand %xmm0,%xmm4
+ pand %xmm0,%xmm5
+ pxor %xmm0,%xmm0 /* back to zero for the next iteration */
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqu %xmm4,-32(%rdi)
+ movdqu %xmm5,-16(%rdi)
+ subq $32,%rdx
+ jnz .Lmulx4x_cond_copy
+
+ movq %rdx,(%rbx) /* rdx is 0: also clear the word just past the copied range */
+
+ movq $1,%rax /* return value */
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lmulx4x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_mulx4x_mont,.-bn_mulx4x_mont
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 16
diff --git a/secure/lib/libcrypto/amd64/x86_64-mont5.S b/secure/lib/libcrypto/amd64/x86_64-mont5.S
index b69366fa905d..cb2528c08dd2 100644
--- a/secure/lib/libcrypto/amd64/x86_64-mont5.S
+++ b/secure/lib/libcrypto/amd64/x86_64-mont5.S
@@ -14,6 +14,7 @@ bn_mul_mont_gather5:
.cfi_def_cfa_register %rax
testl $7,%r9d
jnz .Lmul_enter
+ movl OPENSSL_ia32cap_P+8(%rip),%r11d
jmp .Lmul4x_enter
.align 16
@@ -450,6 +451,9 @@ bn_mul4x_mont_gather5:
movq %rsp,%rax
.cfi_def_cfa_register %rax
.Lmul4x_enter:
+ andl $0x80108,%r11d
+ cmpl $0x80108,%r11d
+ je .Lmulx4x_enter
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
@@ -1079,6 +1083,10 @@ bn_power5:
.cfi_startproc
movq %rsp,%rax
.cfi_def_cfa_register %rax
+ movl OPENSSL_ia32cap_P+8(%rip),%r11d
+ andl $0x80108,%r11d
+ cmpl $0x80108,%r11d
+ je .Lpowerx5_enter
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
@@ -2168,6 +2176,21 @@ bn_from_mont8x:
.byte 0x67
movq %rcx,%rbp
.byte 102,73,15,110,218
+ movl OPENSSL_ia32cap_P+8(%rip),%r11d
+ andl $0x80108,%r11d
+ cmpl $0x80108,%r11d
+ jne .Lfrom_mont_nox
+
+ leaq (%rax,%r9,1),%rdi
+ call __bn_sqrx8x_reduction
+ call __bn_postx4x_internal
+
+ pxor %xmm0,%xmm0
+ leaq 48(%rsp),%rax
+ jmp .Lfrom_mont_zero
+
+.align 32
+.Lfrom_mont_nox:
call __bn_sqr8x_reduction
call __bn_post4x_internal
@@ -2206,6 +2229,1348 @@ bn_from_mont8x:
.byte 0xf3,0xc3
.cfi_endproc
.size bn_from_mont8x,.-bn_from_mont8x
+.type bn_mulx4x_mont_gather5,@function /* frame setup and teardown around mulx4x_internal; returns 1 in rax */
+.align 32
+bn_mulx4x_mont_gather5:
+.cfi_startproc
+ movq %rsp,%rax /* rax = stack pointer at entry; CFA base until it is parked at 40(%rsp) */
+.cfi_def_cfa_register %rax
+.Lmulx4x_enter: /* also reached by the `je .Lmulx4x_enter` added to bn_mul4x_mont_gather5 in this diff */
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+.Lmulx4x_prologue:
+
+ shll $3,%r9d /* r9 = word count * 8 (length in bytes) */
+ leaq (%r9,%r9,2),%r10 /* r10 = 3 * length */
+ negq %r9 /* r9 = -length from here on (mulx4x_internal expects it negated) */
+ movq (%r8),%r8 /* r8 = 64-bit constant behind the fifth argument (presumably Montgomery n0 - confirm with caller) */
+ /* frame placement: start from rsp - 320 - 2*length, then shift it based on its distance from rdi modulo 4096 */
+ /* NOTE(review): presumably this keeps the frame from aliasing the output buffer modulo the page size - confirm against the perlasm source */
+
+
+
+
+
+
+
+
+ leaq -320(%rsp,%r9,2),%r11 /* r11 = rsp - 320 - 2*length */
+ movq %rsp,%rbp
+ subq %rdi,%r11
+ andq $4095,%r11 /* r11 = (candidate frame - rdi) mod 4096 */
+ cmpq %r11,%r10
+ jb .Lmulx4xsp_alt
+ subq %r11,%rbp /* 3*length >= r11: lower the frame by r11 first */
+ leaq -320(%rbp,%r9,2),%rbp
+ jmp .Lmulx4xsp_done
+
+.Lmulx4xsp_alt:
+ leaq 4096-320(,%r9,2),%r10 /* r10 = 4096 - 320 - 2*length */
+ leaq -320(%rbp,%r9,2),%rbp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11 /* clamp the adjustment at 0 if the subtraction borrowed */
+ subq %r11,%rbp
+.Lmulx4xsp_done:
+ andq $-64,%rbp /* align the frame down to 64 bytes */
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp /* rsp = rbp + a multiple of 4096, no higher than the old rsp */
+ movq (%rsp),%r10 /* touch the page */
+ cmpq %rbp,%rsp
+ ja .Lmulx4x_page_walk
+ jmp .Lmulx4x_page_walk_done
+
+.Lmulx4x_page_walk: /* step down one 4096-byte page at a time, reading each, until rsp reaches rbp */
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
+ /* slots written here: 32(%rsp) = 64-bit constant, 40(%rsp) = stack pointer at entry */
+ /* mulx4x_internal addresses the same frame with an extra +8 because of the return address pushed by the call */
+
+
+
+
+
+
+
+
+
+
+
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 /* DW_CFA_def_cfa_expression: CFA = *(rsp+40) + 8 */
+.Lmulx4x_body:
+ call mulx4x_internal /* rax still holds the entry rsp; the callee reads a stack argument through it */
+
+ movq 40(%rsp),%rsi /* rsi = stack pointer at entry */
+.cfi_def_cfa %rsi,8
+ movq $1,%rax /* return value */
+
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lmulx4x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
+
+.type mulx4x_internal,@function /* worker shared by bn_mulx4x_mont_gather5 and bn_powerx5: masked table lookup of each multiplier word plus MULX-based multiply/reduce; leaves through a jump to .Lsqrx4x_sub_entry */
+.align 32
+mulx4x_internal:
+.cfi_startproc /* on entry: r9 = -length in bytes, rax = caller's entry rsp, rdx = table base, rsi/rcx = operand arrays, rdi = result pointer; frame offsets carry +8 for the return address */
+ movq %r9,8(%rsp) /* 0+8(%rsp) = -length, added to pointers later to rewind them */
+ movq %r9,%r10
+ negq %r9 /* r9 = length */
+ shlq $5,%r9 /* r9 = length * 32 */
+ negq %r10 /* r10 = length */
+ leaq 128(%rdx,%r9,1),%r13 /* r13 = limit for the table cursor (table base + 32*length + 128) */
+ shrq $5+5,%r9 /* r9 = length / 32 */
+ movd 8(%rax),%xmm5 /* xmm5 = 32-bit stack argument at 8(entry rsp): the table index to select */
+ subq $1,%r9 /* inner-loop count = length/32 - 1 */
+ leaq .Linc(%rip),%rax /* .Linc is defined outside this hunk (presumably counter seed/increment vectors) */
+ movq %r13,16+8(%rsp)
+ movq %r9,24+8(%rsp)
+ movq %rdi,56+8(%rsp) /* park the result pointer; rdi becomes the table cursor */
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 88-112(%rsp,%r10,1),%r10 /* r10 = base such that the 16 masks land at 112(%r10)..352(%r10) */
+ leaq 128(%rdx),%rdi /* table cursor, biased by 128 so entries are reached with offsets -128..112 */
+
+ pshufd $0,%xmm5,%xmm5 /* broadcast the index to all four lanes */
+ movdqa %xmm1,%xmm4
+.byte 0x67
+ movdqa %xmm1,%xmm2
+.byte 0x67
+ paddd %xmm0,%xmm1 /* from here: keep stepping counter vectors with paddd and compare each against the index (pcmpeqd), storing 16 mask vectors */
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+.byte 0x67
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+ /* gather the first multiplier word: every one of the 16 table vectors is read and ANDed with its mask, then all are combined, so the loads do not depend on the index */
+ pand 64(%rdi),%xmm0 /* the last four masks are still live in xmm0..xmm3 */
+ pand 80(%rdi),%xmm1
+ pand 96(%rdi),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%rdi),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%rdi),%xmm4
+ movdqa -112(%rdi),%xmm5
+ movdqa -96(%rdi),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%rdi),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%rdi),%xmm4
+ movdqa -48(%rdi),%xmm5
+ movdqa -32(%rdi),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%rdi),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%rdi),%xmm4
+ movdqa 16(%rdi),%xmm5
+ movdqa 32(%rdi),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%rdi),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ pxor %xmm1,%xmm0
+ pshufd $0x4e,%xmm0,%xmm1 /* swap the 64-bit halves so the selected word reaches the low quadword */
+ por %xmm1,%xmm0
+ leaq 256(%rdi),%rdi /* advance the table cursor to the next row of 16 vectors */
+.byte 102,72,15,126,194 /* hand-encoded movq %xmm0,%rdx: rdx = gathered multiplier word */
+ leaq 64+32+8(%rsp),%rbx /* rbx runs 32 bytes ahead of the temporary-vector slot being written */
+
+ movq %rdx,%r9 /* r9 keeps the multiplier word while rdx alternates with the reduction factor in r8 */
+ mulxq 0(%rsi),%r8,%rax
+ mulxq 8(%rsi),%r11,%r12
+ addq %rax,%r11
+ mulxq 16(%rsi),%rax,%r13
+ adcq %rax,%r12
+ adcq $0,%r13
+ mulxq 24(%rsi),%rax,%r14
+
+ movq %r8,%r15
+ imulq 32+8(%rsp),%r8 /* r8 = low product word * constant stored by the caller at its 32(%rsp): the per-pass reduction factor */
+ xorq %rbp,%rbp /* rbp = 0; also clears CF and OF for the adcx/adox chains */
+ movq %r8,%rdx
+
+ movq %rdi,8+8(%rsp) /* save the table cursor */
+
+ leaq 32(%rsi),%rsi
+ adcxq %rax,%r13
+ adcxq %rbp,%r14
+
+ mulxq 0(%rcx),%rax,%r10
+ adcxq %rax,%r15
+ adoxq %r11,%r10
+ mulxq 8(%rcx),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+ mulxq 16(%rcx),%rax,%r12
+ movq 24+8(%rsp),%rdi /* rdi = inner-loop count */
+ movq %r10,-32(%rbx)
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r11,-24(%rbx)
+ adcxq %rax,%r12
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r12,-16(%rbx)
+ jmp .Lmulx4x_1st
+
+.align 32
+.Lmulx4x_1st: /* remaining 4-word groups of the first pass; rdi counts groups */
+ adcxq %rbp,%r15
+ mulxq 0(%rsi),%r10,%rax
+ adcxq %r14,%r10
+ mulxq 8(%rsi),%r11,%r14
+ adcxq %rax,%r11
+ mulxq 16(%rsi),%r12,%rax
+ adcxq %r14,%r12
+ mulxq 24(%rsi),%r13,%r14
+.byte 0x67,0x67 /* two address-size prefixes glued onto the next instruction (presumably padding for code alignment) */
+ movq %r8,%rdx
+ adcxq %rax,%r13
+ adcxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ leaq 32(%rbx),%rbx
+
+ adoxq %r15,%r10
+ mulxq 0(%rcx),%rax,%r15
+ adcxq %rax,%r10
+ adoxq %r15,%r11
+ mulxq 8(%rcx),%rax,%r15
+ adcxq %rax,%r11
+ adoxq %r15,%r12
+ mulxq 16(%rcx),%rax,%r15
+ movq %r10,-40(%rbx)
+ adcxq %rax,%r12
+ movq %r11,-32(%rbx)
+ adoxq %r15,%r13
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r12,-24(%rbx)
+ adcxq %rax,%r13
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r13,-16(%rbx)
+
+ decq %rdi
+ jnz .Lmulx4x_1st
+
+ movq 8(%rsp),%rax /* rax = -length */
+ adcq %rbp,%r15
+ leaq (%rsi,%rax,1),%rsi /* rewind the multiplicand pointer */
+ addq %r15,%r14
+ movq 8+8(%rsp),%rdi /* rdi = table cursor */
+ adcq %rbp,%rbp /* rbp = carry out of this pass (0 or 1), stored at the top of the next one */
+ movq %r14,-8(%rbx)
+ jmp .Lmulx4x_outer
+
+.align 32
+.Lmulx4x_outer: /* one pass per remaining table row: gather the next multiplier word with the stored masks, then multiply-accumulate and reduce */
+ leaq 16-256(%rbx),%r10 /* r10 = base for reading the 16 stored masks at 256(%r10)..496(%r10) */
+ pxor %xmm4,%xmm4
+.byte 0x67,0x67
+ pxor %xmm5,%xmm5
+ movdqa -128(%rdi),%xmm0
+ movdqa -112(%rdi),%xmm1
+ movdqa -96(%rdi),%xmm2
+ pand 256(%r10),%xmm0
+ movdqa -80(%rdi),%xmm3
+ pand 272(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand 288(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand 304(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%rdi),%xmm0
+ movdqa -48(%rdi),%xmm1
+ movdqa -32(%rdi),%xmm2
+ pand 320(%r10),%xmm0
+ movdqa -16(%rdi),%xmm3
+ pand 336(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand 352(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand 368(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%rdi),%xmm0
+ movdqa 16(%rdi),%xmm1
+ movdqa 32(%rdi),%xmm2
+ pand 384(%r10),%xmm0
+ movdqa 48(%rdi),%xmm3
+ pand 400(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand 416(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand 432(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%rdi),%xmm0
+ movdqa 80(%rdi),%xmm1
+ movdqa 96(%rdi),%xmm2
+ pand 448(%r10),%xmm0
+ movdqa 112(%rdi),%xmm3
+ pand 464(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand 480(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand 496(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%rdi),%rdi
+.byte 102,72,15,126,194 /* hand-encoded movq %xmm0,%rdx: rdx = gathered multiplier word */
+
+ movq %rbp,(%rbx) /* store the carry left by the previous pass */
+ leaq 32(%rbx,%rax,1),%rbx /* rax = -length: rewind rbx to 32 bytes past the start of the temporary vector */
+ mulxq 0(%rsi),%r8,%r11
+ xorq %rbp,%rbp /* rbp = 0; clears CF and OF */
+ movq %rdx,%r9
+ mulxq 8(%rsi),%r14,%r12
+ adoxq -32(%rbx),%r8
+ adcxq %r14,%r11
+ mulxq 16(%rsi),%r15,%r13
+ adoxq -24(%rbx),%r11
+ adcxq %r15,%r12
+ mulxq 24(%rsi),%rdx,%r14
+ adoxq -16(%rbx),%r12
+ adcxq %rdx,%r13
+ leaq (%rcx,%rax,1),%rcx /* rewind the rcx array pointer */
+ leaq 32(%rsi),%rsi
+ adoxq -8(%rbx),%r13
+ adcxq %rbp,%r14
+ adoxq %rbp,%r14
+
+ movq %r8,%r15
+ imulq 32+8(%rsp),%r8 /* reduction factor for this pass */
+
+ movq %r8,%rdx
+ xorq %rbp,%rbp
+ movq %rdi,8+8(%rsp) /* save the table cursor */
+
+ mulxq 0(%rcx),%rax,%r10
+ adcxq %rax,%r15
+ adoxq %r11,%r10
+ mulxq 8(%rcx),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+ mulxq 16(%rcx),%rax,%r12
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq 24+8(%rsp),%rdi /* rdi = inner-loop count */
+ movq %r10,-32(%rbx)
+ adcxq %rax,%r12
+ movq %r11,-24(%rbx)
+ adoxq %rbp,%r15
+ movq %r12,-16(%rbx)
+ leaq 32(%rcx),%rcx
+ jmp .Lmulx4x_inner
+
+.align 32
+.Lmulx4x_inner: /* remaining 4-word groups of this pass; previous contents of the temporary vector are added in via adcx from (%rbx) */
+ mulxq 0(%rsi),%r10,%rax
+ adcxq %rbp,%r15
+ adoxq %r14,%r10
+ mulxq 8(%rsi),%r11,%r14
+ adcxq 0(%rbx),%r10
+ adoxq %rax,%r11
+ mulxq 16(%rsi),%r12,%rax
+ adcxq 8(%rbx),%r11
+ adoxq %r14,%r12
+ mulxq 24(%rsi),%r13,%r14
+ movq %r8,%rdx
+ adcxq 16(%rbx),%r12
+ adoxq %rax,%r13
+ adcxq 24(%rbx),%r13
+ adoxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ leaq 32(%rbx),%rbx
+ adcxq %rbp,%r14
+
+ adoxq %r15,%r10
+ mulxq 0(%rcx),%rax,%r15
+ adcxq %rax,%r10
+ adoxq %r15,%r11
+ mulxq 8(%rcx),%rax,%r15
+ adcxq %rax,%r11
+ adoxq %r15,%r12
+ mulxq 16(%rcx),%rax,%r15
+ movq %r10,-40(%rbx)
+ adcxq %rax,%r12
+ adoxq %r15,%r13
+ movq %r11,-32(%rbx)
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ leaq 32(%rcx),%rcx
+ movq %r12,-24(%rbx)
+ adcxq %rax,%r13
+ adoxq %rbp,%r15
+ movq %r13,-16(%rbx)
+
+ decq %rdi
+ jnz .Lmulx4x_inner
+
+ movq 0+8(%rsp),%rax /* rax = -length */
+ adcq %rbp,%r15
+ subq 0(%rbx),%rdi /* rdi is 0 after the loop, so only CF matters here (set when the word at (%rbx) is non-zero); rdi is reloaded on the next line */
+ movq 8+8(%rsp),%rdi /* rdi = table cursor */
+ movq 16+8(%rsp),%r10 /* r10 = table cursor limit */
+ adcq %r15,%r14
+ leaq (%rsi,%rax,1),%rsi
+ adcq %rbp,%rbp
+ movq %r14,-8(%rbx)
+
+ cmpq %r10,%rdi /* more table rows to process? */
+ jb .Lmulx4x_outer
+ /* set up registers for the final subtraction code at .Lsqrx4x_sub_entry, which lies outside this hunk; its register contract cannot be verified from here */
+ movq -8(%rcx),%r10
+ movq %rbp,%r8
+ movq (%rcx,%rax,1),%r12
+ leaq (%rcx,%rax,1),%rbp /* rbp = rcx rewound by length */
+ movq %rax,%rcx
+ leaq (%rbx,%rax,1),%rdi /* rdi = rbx rewound by length */
+ xorl %eax,%eax
+ xorq %r15,%r15
+ subq %r14,%r10
+ adcq %r15,%r15
+ orq %r15,%r8
+ sarq $3+2,%rcx /* rcx = -length / 32 */
+ subq %r8,%rax /* rax = 0 or -1 */
+ movq 56+8(%rsp),%rdx /* rdx = result pointer parked at entry */
+ decq %r12
+ movq 8(%rbp),%r13
+ xorq %r8,%r8
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp .Lsqrx4x_sub_entry /* tail jump: that code performs the return on behalf of this routine */
+.cfi_endproc
+.size mulx4x_internal,.-mulx4x_internal
+.type bn_powerx5,@function /* MULX/ADX flavour of bn_power5: five square+post-process rounds followed by one mulx4x_internal call; returns 1 in rax */
+.align 32
+bn_powerx5:
+.cfi_startproc
+ movq %rsp,%rax /* rax = stack pointer at entry; CFA base until it is parked at 40(%rsp) */
+.cfi_def_cfa_register %rax
+.Lpowerx5_enter: /* also reached by the `je .Lpowerx5_enter` added to bn_power5 in this diff */
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+.Lpowerx5_prologue:
+
+ shll $3,%r9d /* r9 = word count * 8 (length in bytes) */
+ leaq (%r9,%r9,2),%r10 /* r10 = 3 * length */
+ negq %r9 /* r9 = -length */
+ movq (%r8),%r8 /* r8 = 64-bit constant behind the fifth argument (presumably Montgomery n0 - confirm with caller) */
+ /* frame placement: same arithmetic as bn_mulx4x_mont_gather5 (start at rsp - 320 - 2*length, shift by distance from rdi modulo 4096) */
+
+
+
+
+
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
+ subq %rdi,%r11
+ andq $4095,%r11 /* r11 = (candidate frame - rdi) mod 4096 */
+ cmpq %r11,%r10
+ jb .Lpwrx_sp_alt
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
+ jmp .Lpwrx_sp_done
+
+.align 32
+.Lpwrx_sp_alt:
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rbp,%r9,2),%rbp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11 /* clamp the adjustment at 0 if the subtraction borrowed */
+ subq %r11,%rbp
+.Lpwrx_sp_done:
+ andq $-64,%rbp /* align the frame down to 64 bytes */
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp /* rsp = rbp + a multiple of 4096, no higher than the old rsp */
+ movq (%rsp),%r10 /* touch the page */
+ cmpq %rbp,%rsp
+ ja .Lpwrx_page_walk
+ jmp .Lpwrx_page_walk_done
+
+.Lpwrx_page_walk: /* step down one 4096-byte page at a time, reading each, until rsp reaches rbp */
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lpwrx_page_walk
+.Lpwrx_page_walk_done:
+
+ movq %r9,%r10 /* r10 = -length */
+ negq %r9 /* r9 = +length */
+ /* slots written below: 32(%rsp) = 64-bit constant, 40(%rsp) = stack pointer at entry */
+ /* rdi, rcx, r10 and rdx are parked in xmm1..xmm4; rcx and rdx are read back before the final multiply */
+ /* NOTE(review): xmm1 and xmm3 are presumably consumed by the helpers called below - they are not read again in this routine */
+
+
+
+
+
+
+
+
+
+ pxor %xmm0,%xmm0
+.byte 102,72,15,110,207 /* hand-encoded movq %rdi,%xmm1 */
+.byte 102,72,15,110,209 /* hand-encoded movq %rcx,%xmm2 */
+.byte 102,73,15,110,218 /* hand-encoded movq %r10,%xmm3 */
+.byte 102,72,15,110,226 /* hand-encoded movq %rdx,%xmm4 */
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 /* DW_CFA_def_cfa_expression: CFA = *(rsp+40) + 8 */
+.Lpowerx5_body:
+ /* five rounds of square + post-processing (helpers defined elsewhere in this file) */
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ /* set up the arguments mulx4x_internal expects */
+ movq %r10,%r9 /* NOTE(review): mulx4x_internal expects -length in r9, so the helpers presumably leave that in r10 - confirm */
+ movq %rsi,%rdi
+.byte 102,72,15,126,209 /* hand-encoded movq %xmm2,%rcx */
+.byte 102,72,15,126,226 /* hand-encoded movq %xmm4,%rdx */
+ movq 40(%rsp),%rax /* rax = entry rsp; the callee reads a stack argument through it */
+
+ call mulx4x_internal
+
+ movq 40(%rsp),%rsi /* rsi = stack pointer at entry */
+.cfi_def_cfa %rsi,8
+ movq $1,%rax /* return value */
+
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpowerx5_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_powerx5,.-bn_powerx5
+
+.globl bn_sqrx8x_internal
+.hidden bn_sqrx8x_internal
+.type bn_sqrx8x_internal,@function
+.align 32
+bn_sqrx8x_internal:
+__bn_sqrx8x_internal:
+.cfi_startproc
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ leaq 48+8(%rsp),%rdi
+ leaq (%rsi,%r9,1),%rbp
+ movq %r9,0+8(%rsp)
+ movq %rbp,8+8(%rsp)
+ jmp .Lsqr8x_zero_start
+
+.align 32
+.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
+.Lsqrx8x_zero:
+.byte 0x3e
+ movdqa %xmm0,0(%rdi)
+ movdqa %xmm0,16(%rdi)
+ movdqa %xmm0,32(%rdi)
+ movdqa %xmm0,48(%rdi)
+.Lsqr8x_zero_start:
+ movdqa %xmm0,64(%rdi)
+ movdqa %xmm0,80(%rdi)
+ movdqa %xmm0,96(%rdi)
+ movdqa %xmm0,112(%rdi)
+ leaq 128(%rdi),%rdi
+ subq $64,%r9
+ jnz .Lsqrx8x_zero
+
+ movq 0(%rsi),%rdx
+
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %r12,%r12
+ xorq %r13,%r13
+ xorq %r14,%r14
+ xorq %r15,%r15
+ leaq 48+8(%rsp),%rdi
+ xorq %rbp,%rbp
+ jmp .Lsqrx8x_outer_loop
+
+.align 32
+.Lsqrx8x_outer_loop:
+ mulxq 8(%rsi),%r8,%rax
+ adcxq %r9,%r8
+ adoxq %rax,%r10
+ mulxq 16(%rsi),%r9,%rax
+ adcxq %r10,%r9
+ adoxq %rax,%r11
+.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
+ adcxq %r11,%r10
+ adoxq %rax,%r12
+.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
+ adcxq %r12,%r11
+ adoxq %rax,%r13
+ mulxq 40(%rsi),%r12,%rax
+ adcxq %r13,%r12
+ adoxq %rax,%r14
+ mulxq 48(%rsi),%r13,%rax
+ adcxq %r14,%r13
+ adoxq %r15,%rax
+ mulxq 56(%rsi),%r14,%r15
+ movq 8(%rsi),%rdx
+ adcxq %rax,%r14
+ adoxq %rbp,%r15
+ adcq 64(%rdi),%r15
+ movq %r8,8(%rdi)
+ movq %r9,16(%rdi)
+ sbbq %rcx,%rcx
+ xorq %rbp,%rbp
+
+
+ mulxq 16(%rsi),%r8,%rbx
+ mulxq 24(%rsi),%r9,%rax
+ adcxq %r10,%r8
+ adoxq %rbx,%r9
+ mulxq 32(%rsi),%r10,%rbx
+ adcxq %r11,%r9
+ adoxq %rax,%r10
+.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
+ adcxq %r12,%r10
+ adoxq %rbx,%r11
+.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
+ adcxq %r13,%r11
+ adoxq %r14,%r12
+.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
+ movq 16(%rsi),%rdx
+ adcxq %rax,%r12
+ adoxq %rbx,%r13
+ adcxq %r15,%r13
+ adoxq %rbp,%r14
+ adcxq %rbp,%r14
+
+ movq %r8,24(%rdi)
+ movq %r9,32(%rdi)
+
+ mulxq 24(%rsi),%r8,%rbx
+ mulxq 32(%rsi),%r9,%rax
+ adcxq %r10,%r8
+ adoxq %rbx,%r9
+ mulxq 40(%rsi),%r10,%rbx
+ adcxq %r11,%r9
+ adoxq %rax,%r10
+.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
+ adcxq %r12,%r10
+ adoxq %r13,%r11
+.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
+.byte 0x3e
+ movq 24(%rsi),%rdx
+ adcxq %rbx,%r11
+ adoxq %rax,%r12
+ adcxq %r14,%r12
+ movq %r8,40(%rdi)
+ movq %r9,48(%rdi)
+ mulxq 32(%rsi),%r8,%rax
+ adoxq %rbp,%r13
+ adcxq %rbp,%r13
+
+ mulxq 40(%rsi),%r9,%rbx
+ adcxq %r10,%r8
+ adoxq %rax,%r9
+ mulxq 48(%rsi),%r10,%rax
+ adcxq %r11,%r9
+ adoxq %r12,%r10
+ mulxq 56(%rsi),%r11,%r12
+ movq 32(%rsi),%rdx
+ movq 40(%rsi),%r14
+ adcxq %rbx,%r10
+ adoxq %rax,%r11
+ movq 48(%rsi),%r15
+ adcxq %r13,%r11
+ adoxq %rbp,%r12
+ adcxq %rbp,%r12
+
+ movq %r8,56(%rdi)
+ movq %r9,64(%rdi)
+
+ mulxq %r14,%r9,%rax
+ movq 56(%rsi),%r8
+ adcxq %r10,%r9
+ mulxq %r15,%r10,%rbx
+ adoxq %rax,%r10
+ adcxq %r11,%r10
+ mulxq %r8,%r11,%rax
+ movq %r14,%rdx
+ adoxq %rbx,%r11
+ adcxq %r12,%r11
+
+ adcxq %rbp,%rax
+
+ mulxq %r15,%r14,%rbx
+ mulxq %r8,%r12,%r13
+ movq %r15,%rdx
+ leaq 64(%rsi),%rsi
+ adcxq %r14,%r11
+ adoxq %rbx,%r12
+ adcxq %rax,%r12
+ adoxq %rbp,%r13
+
+.byte 0x67,0x67
+ mulxq %r8,%r8,%r14
+ adcxq %r8,%r13
+ adcxq %rbp,%r14
+
+ cmpq 8+8(%rsp),%rsi
+ je .Lsqrx8x_outer_break
+
+ negq %rcx
+ movq $-8,%rcx
+ movq %rbp,%r15
+ movq 64(%rdi),%r8
+ adcxq 72(%rdi),%r9
+ adcxq 80(%rdi),%r10
+ adcxq 88(%rdi),%r11
+ adcq 96(%rdi),%r12
+ adcq 104(%rdi),%r13
+ adcq 112(%rdi),%r14
+ adcq 120(%rdi),%r15
+ leaq (%rsi),%rbp
+ leaq 128(%rdi),%rdi
+ sbbq %rax,%rax
+
+ movq -64(%rsi),%rdx
+ movq %rax,16+8(%rsp)
+ movq %rdi,24+8(%rsp)
+
+
+ xorl %eax,%eax
+ jmp .Lsqrx8x_loop
+
+.align 32
+.Lsqrx8x_loop:
+ movq %r8,%rbx
+ mulxq 0(%rbp),%rax,%r8
+ adcxq %rax,%rbx
+ adoxq %r9,%r8
+
+ mulxq 8(%rbp),%rax,%r9
+ adcxq %rax,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rbp),%rax,%r10
+ adcxq %rax,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rbp),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+
+.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+
+ mulxq 40(%rbp),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+ mulxq 48(%rbp),%rax,%r14
+ movq %rbx,(%rdi,%rcx,8)
+ movl $0,%ebx
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
+ movq 8(%rsi,%rcx,8),%rdx
+ adcxq %rax,%r14
+ adoxq %rbx,%r15
+ adcxq %rbx,%r15
+
+.byte 0x67
+ incq %rcx
+ jnz .Lsqrx8x_loop
+
+ leaq 64(%rbp),%rbp
+ movq $-8,%rcx
+ cmpq 8+8(%rsp),%rbp
+ je .Lsqrx8x_break
+
+ subq 16+8(%rsp),%rbx
+.byte 0x66
+ movq -64(%rsi),%rdx
+ adcxq 0(%rdi),%r8
+ adcxq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ leaq 64(%rdi),%rdi
+.byte 0x67
+ sbbq %rax,%rax
+ xorl %ebx,%ebx
+ movq %rax,16+8(%rsp)
+ jmp .Lsqrx8x_loop
+
+.align 32
+.Lsqrx8x_break:
+ xorq %rbp,%rbp
+ subq 16+8(%rsp),%rbx
+ adcxq %rbp,%r8
+ movq 24+8(%rsp),%rcx
+ adcxq %rbp,%r9
+ movq 0(%rsi),%rdx
+ adcq $0,%r10
+ movq %r8,0(%rdi)
+ adcq $0,%r11
+ adcq $0,%r12
+ adcq $0,%r13
+ adcq $0,%r14
+ adcq $0,%r15
+ cmpq %rcx,%rdi
+ je .Lsqrx8x_outer_loop
+
+ movq %r9,8(%rdi)
+ movq 8(%rcx),%r9
+ movq %r10,16(%rdi)
+ movq 16(%rcx),%r10
+ movq %r11,24(%rdi)
+ movq 24(%rcx),%r11
+ movq %r12,32(%rdi)
+ movq 32(%rcx),%r12
+ movq %r13,40(%rdi)
+ movq 40(%rcx),%r13
+ movq %r14,48(%rdi)
+ movq 48(%rcx),%r14
+ movq %r15,56(%rdi)
+ movq 56(%rcx),%r15
+ movq %rcx,%rdi
+ jmp .Lsqrx8x_outer_loop
+
+.align 32
+.Lsqrx8x_outer_break:
+ movq %r9,72(%rdi)
+.byte 102,72,15,126,217
+ movq %r10,80(%rdi)
+ movq %r11,88(%rdi)
+ movq %r12,96(%rdi)
+ movq %r13,104(%rdi)
+ movq %r14,112(%rdi)
+ leaq 48+8(%rsp),%rdi
+ movq (%rsi,%rcx,1),%rdx
+
+ movq 8(%rdi),%r11
+ xorq %r10,%r10
+ movq 0+8(%rsp),%r9
+ adoxq %r11,%r11
+ movq 16(%rdi),%r12
+ movq 24(%rdi),%r13
+
+
+.align 32
+.Lsqrx4x_shift_n_add:
+ mulxq %rdx,%rax,%rbx
+ adoxq %r12,%r12
+ adcxq %r10,%rax
+.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
+.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
+ adoxq %r13,%r13
+ adcxq %r11,%rbx
+ movq 40(%rdi),%r11
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+
+ mulxq %rdx,%rax,%rbx
+ adoxq %r10,%r10
+ adcxq %r12,%rax
+ movq 16(%rsi,%rcx,1),%rdx
+ movq 48(%rdi),%r12
+ adoxq %r11,%r11
+ adcxq %r13,%rbx
+ movq 56(%rdi),%r13
+ movq %rax,16(%rdi)
+ movq %rbx,24(%rdi)
+
+ mulxq %rdx,%rax,%rbx
+ adoxq %r12,%r12
+ adcxq %r10,%rax
+ movq 24(%rsi,%rcx,1),%rdx
+ leaq 32(%rcx),%rcx
+ movq 64(%rdi),%r10
+ adoxq %r13,%r13
+ adcxq %r11,%rbx
+ movq 72(%rdi),%r11
+ movq %rax,32(%rdi)
+ movq %rbx,40(%rdi)
+
+ mulxq %rdx,%rax,%rbx
+ adoxq %r10,%r10
+ adcxq %r12,%rax
+ jrcxz .Lsqrx4x_shift_n_add_break
+.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
+ adoxq %r11,%r11
+ adcxq %r13,%rbx
+ movq 80(%rdi),%r12
+ movq 88(%rdi),%r13
+ movq %rax,48(%rdi)
+ movq %rbx,56(%rdi)
+ leaq 64(%rdi),%rdi
+ nop
+ jmp .Lsqrx4x_shift_n_add
+
+.align 32
+.Lsqrx4x_shift_n_add_break:
+ adcxq %r13,%rbx
+ movq %rax,48(%rdi)
+ movq %rbx,56(%rdi)
+ leaq 64(%rdi),%rdi
+.byte 102,72,15,126,213
+__bn_sqrx8x_reduction:
+ xorl %eax,%eax
+ movq 32+8(%rsp),%rbx
+ movq 48+8(%rsp),%rdx
+ leaq -64(%rbp,%r9,1),%rcx
+
+ movq %rcx,0+8(%rsp)
+ movq %rdi,8+8(%rsp)
+
+ leaq 48+8(%rsp),%rdi
+ jmp .Lsqrx8x_reduction_loop
+
+.align 32
+.Lsqrx8x_reduction_loop:
+ movq 8(%rdi),%r9
+ movq 16(%rdi),%r10
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r12
+ movq %rdx,%r8
+ imulq %rbx,%rdx
+ movq 40(%rdi),%r13
+ movq 48(%rdi),%r14
+ movq 56(%rdi),%r15
+ movq %rax,24+8(%rsp)
+
+ leaq 64(%rdi),%rdi
+ xorq %rsi,%rsi
+ movq $-8,%rcx
+ jmp .Lsqrx8x_reduce
+
+.align 32
+.Lsqrx8x_reduce:
+ movq %r8,%rbx
+ mulxq 0(%rbp),%rax,%r8
+ adcxq %rbx,%rax
+ adoxq %r9,%r8
+
+ mulxq 8(%rbp),%rbx,%r9
+ adcxq %rbx,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rbp),%rbx,%r10
+ adcxq %rbx,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rbp),%rbx,%r11
+ adcxq %rbx,%r10
+ adoxq %r12,%r11
+
+.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
+ movq %rdx,%rax
+ movq %r8,%rdx
+ adcxq %rbx,%r11
+ adoxq %r13,%r12
+
+ mulxq 32+8(%rsp),%rbx,%rdx
+ movq %rax,%rdx
+ movq %rax,64+48+8(%rsp,%rcx,8)
+
+ mulxq 40(%rbp),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+ mulxq 48(%rbp),%rax,%r14
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+ mulxq 56(%rbp),%rax,%r15
+ movq %rbx,%rdx
+ adcxq %rax,%r14
+ adoxq %rsi,%r15
+ adcxq %rsi,%r15
+
+.byte 0x67,0x67,0x67
+ incq %rcx
+ jnz .Lsqrx8x_reduce
+
+ movq %rsi,%rax
+ cmpq 0+8(%rsp),%rbp
+ jae .Lsqrx8x_no_tail
+
+ movq 48+8(%rsp),%rdx
+ addq 0(%rdi),%r8
+ leaq 64(%rbp),%rbp
+ movq $-8,%rcx
+ adcxq 8(%rdi),%r9
+ adcxq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ leaq 64(%rdi),%rdi
+ sbbq %rax,%rax
+
+ xorq %rsi,%rsi
+ movq %rax,16+8(%rsp)
+ jmp .Lsqrx8x_tail
+
+.align 32
+.Lsqrx8x_tail:
+ movq %r8,%rbx
+ mulxq 0(%rbp),%rax,%r8
+ adcxq %rax,%rbx
+ adoxq %r9,%r8
+
+ mulxq 8(%rbp),%rax,%r9
+ adcxq %rax,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rbp),%rax,%r10
+ adcxq %rax,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rbp),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+
+.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+
+ mulxq 40(%rbp),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+ mulxq 48(%rbp),%rax,%r14
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+ mulxq 56(%rbp),%rax,%r15
+ movq 72+48+8(%rsp,%rcx,8),%rdx
+ adcxq %rax,%r14
+ adoxq %rsi,%r15
+ movq %rbx,(%rdi,%rcx,8)
+ movq %r8,%rbx
+ adcxq %rsi,%r15
+
+ incq %rcx
+ jnz .Lsqrx8x_tail
+
+ cmpq 0+8(%rsp),%rbp
+ jae .Lsqrx8x_tail_done
+
+ subq 16+8(%rsp),%rsi
+ movq 48+8(%rsp),%rdx
+ leaq 64(%rbp),%rbp
+ adcq 0(%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ leaq 64(%rdi),%rdi
+ sbbq %rax,%rax
+ subq $8,%rcx
+
+ xorq %rsi,%rsi
+ movq %rax,16+8(%rsp)
+ jmp .Lsqrx8x_tail
+
+.align 32
+.Lsqrx8x_tail_done:
+ xorq %rax,%rax
+ addq 24+8(%rsp),%r8
+ adcq $0,%r9
+ adcq $0,%r10
+ adcq $0,%r11
+ adcq $0,%r12
+ adcq $0,%r13
+ adcq $0,%r14
+ adcq $0,%r15
+ adcq $0,%rax
+
+ subq 16+8(%rsp),%rsi
+.Lsqrx8x_no_tail:
+ adcq 0(%rdi),%r8
+.byte 102,72,15,126,217
+ adcq 8(%rdi),%r9
+ movq 56(%rbp),%rsi
+.byte 102,72,15,126,213
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ adcq $0,%rax
+
+ movq 32+8(%rsp),%rbx
+ movq 64(%rdi,%rcx,1),%rdx
+
+ movq %r8,0(%rdi)
+ leaq 64(%rdi),%r8
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r12,32(%rdi)
+ movq %r13,40(%rdi)
+ movq %r14,48(%rdi)
+ movq %r15,56(%rdi)
+
+ leaq 64(%rdi,%rcx,1),%rdi
+ cmpq 8+8(%rsp),%r8
+ jb .Lsqrx8x_reduction_loop
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
+/*
+ * __bn_postx4x_internal: masked "add the two's complement" pass, 4 qwords per iteration.
+ *
+ * As used by the code below:
+ *   %rbp  -> array whose words are complemented and masked before being added
+ *   %rdi  -> array that is added to the masked words
+ *   %xmm1 -> low qword is the destination pointer (copied to %rdx and %rsi)
+ *   %rax  -> mask seed, negated on entry (presumably 0 or 1 from the caller, giving 0 or all-ones; confirm)
+ *   %rcx  -> loop count source; shifted right by 5 and counted up to zero (presumably a negative byte count; confirm)
+ * On return: %r9 = -(entry %rcx), %r10 = entry %rcx, %rsi = destination base, %rdx/%rdi/%rbp advanced.
+ * Clobbers %r8, %r12-%r15 and flags.
+ */
+.type __bn_postx4x_internal,@function /* type the symbol as a function; it already has a .size directive */
+.align 32
+__bn_postx4x_internal:
+.cfi_startproc
+ movq 0(%rbp),%r12 /* first word of the array at %rbp */
+ movq %rcx,%r10 /* keep two copies of the entry value of %rcx */
+ movq %rcx,%r9
+ negq %rax /* turn the seed into the AND mask used by andn below */
+ sarq $3+2,%rcx /* arithmetic shift by 5: one count per 32-byte (4-qword) pass */
+
+.byte 102,72,15,126,202 /* movq %xmm1,%rdx: destination pointer */
+.byte 102,72,15,126,206 /* movq %xmm1,%rsi: second copy of the same pointer */
+ decq %r12 /* ~(x-1) == -x, so the complement below yields the negated low word */
+ movq 8(%rbp),%r13
+ xorq %r8,%r8 /* %r8 carries CF between passes; start with no carry */
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp .Lsqrx4x_sub_entry /* first pass already has its four words loaded */
+
+.align 16
+.Lsqrx4x_sub:
+ movq 0(%rbp),%r12 /* load the next four words */
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+.Lsqrx4x_sub_entry:
+ andnq %rax,%r12,%r12 /* %r12 = ~%r12 & mask */
+ leaq 32(%rbp),%rbp /* advance the source by four words */
+ andnq %rax,%r13,%r13
+ andnq %rax,%r14,%r14
+ andnq %rax,%r15,%r15
+
+ negq %r8 /* CF = (%r8 != 0): reload the carry saved by the previous pass */
+ adcq 0(%rdi),%r12 /* add the words at %rdi with carry propagation */
+ adcq 8(%rdi),%r13
+ adcq 16(%rdi),%r14
+ adcq 24(%rdi),%r15
+ movq %r12,0(%rdx) /* mov and lea leave CF intact until the sbb below */
+ leaq 32(%rdi),%rdi
+ movq %r13,8(%rdx)
+ sbbq %r8,%r8 /* save CF as 0 or -1 for the next pass */
+ movq %r14,16(%rdx)
+ movq %r15,24(%rdx)
+ leaq 32(%rdx),%rdx
+
+ incq %rcx /* count up toward zero */
+ jnz .Lsqrx4x_sub
+
+ negq %r9 /* %r9 = -(entry %rcx) */
+
+ .byte 0xf3,0xc3 /* rep ret */
+.cfi_endproc
+.size __bn_postx4x_internal,.-__bn_postx4x_internal
.globl bn_get_bits5
.type bn_get_bits5,@function
.align 16
diff --git a/secure/lib/libcrypto/i386/chacha-x86.S b/secure/lib/libcrypto/i386/chacha-x86.S
index 566285310e06..d6b2936a5381 100644
--- a/secure/lib/libcrypto/i386/chacha-x86.S
+++ b/secure/lib/libcrypto/i386/chacha-x86.S
@@ -385,6 +385,8 @@ ChaCha20_ssse3:
pushl %esi
pushl %edi
.Lssse3_shortcut:
+ testl $2048,4(%ebp)
+ jnz .Lxop_shortcut
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ecx
@@ -528,6 +530,484 @@ ChaCha20_ssse3:
.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte 114,103,62,0
+.globl ChaCha20_xop
+.type ChaCha20_xop,@function
+.align 16
+ChaCha20_xop: /* XOP variant: 256 bytes per pass in the wide path, 64 bytes per pass in the 1x path, byte loop for the remainder */
+.L_ChaCha20_xop_begin:
+ pushl %ebp /* save the four registers restored by the popl sequence at .L017done */
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+.Lxop_shortcut: /* also reached by a jump from .Lssse3_shortcut, with the same four registers already pushed */
+ movl 20(%esp),%edi /* arg 1 (16 bytes of pushes + return address): pointer that all output stores go through */
+ movl 24(%esp),%esi /* arg 2: pointer to the bytes that are XORed with the generated block */
+ movl 28(%esp),%ecx /* arg 3: byte count */
+ movl 32(%esp),%edx /* arg 4: pointer to 32 bytes of state input (presumably the key; confirm) */
+ movl 36(%esp),%ebx /* arg 5: pointer to 16 bytes of state input (presumably counter and nonce; confirm) */
+ vzeroupper
+ movl %esp,%ebp /* remember the incoming stack pointer */
+ subl $524,%esp /* scratch frame */
+ andl $-64,%esp /* align the frame to 64 bytes */
+ movl %ebp,512(%esp) /* saved stack pointer, reloaded at .L017done */
+ leal .Lssse3_data-.Lpic_point(%eax),%eax /* %eax is not set in this function; presumably the jump source left the address of .Lpic_point in it (TODO confirm) */
+ vmovdqu (%ebx),%xmm3 /* 16-byte block from arg 5 */
+ cmpl $256,%ecx
+ jb .L0141x /* fewer than 256 bytes: go straight to the 1x path */
+ movl %edx,516(%esp) /* keep args 4 and 5 for the 1x path that may follow the wide loop */
+ movl %ebx,520(%esp)
+ subl $256,%ecx /* bias the count so the wide loop can test the borrow */
+ leal 384(%esp),%ebp /* %ebp = centre of the read-only template area (-128..+127) */
+ vmovdqu (%edx),%xmm7 /* first 16 bytes from arg 4 */
+ vpshufd $0,%xmm3,%xmm0 /* broadcast dword 0 to all four lanes */
+ vpshufd $85,%xmm3,%xmm1 /* broadcast dword 1 */
+ vpshufd $170,%xmm3,%xmm2 /* broadcast dword 2 */
+ vpshufd $255,%xmm3,%xmm3 /* broadcast dword 3 */
+ vpaddd 48(%eax),%xmm0,%xmm0 /* add the per-lane constants at table offset 48 (presumably lane offsets 0..3; confirm) */
+ vpshufd $0,%xmm7,%xmm4
+ vpshufd $85,%xmm7,%xmm5
+ vpsubd 64(%eax),%xmm0,%xmm0 /* pre-subtract the table entry that .L015outer_loop adds back on every pass */
+ vpshufd $170,%xmm7,%xmm6
+ vpshufd $255,%xmm7,%xmm7
+ vmovdqa %xmm0,64(%ebp) /* template rows 64..112: the four broadcast dwords of the arg 5 block */
+ vmovdqa %xmm1,80(%ebp)
+ vmovdqa %xmm2,96(%ebp)
+ vmovdqa %xmm3,112(%ebp)
+ vmovdqu 16(%edx),%xmm3 /* second 16 bytes from arg 4 */
+ vmovdqa %xmm4,-64(%ebp) /* template rows -64..-16: first 16 bytes of arg 4, one dword per row */
+ vmovdqa %xmm5,-48(%ebp)
+ vmovdqa %xmm6,-32(%ebp)
+ vmovdqa %xmm7,-16(%ebp)
+ vmovdqa 32(%eax),%xmm7 /* 16-byte constant at table offset 32 */
+ leal 128(%esp),%ebx /* %ebx = centre of the working copy updated by .L016loop */
+ vpshufd $0,%xmm3,%xmm0
+ vpshufd $85,%xmm3,%xmm1
+ vpshufd $170,%xmm3,%xmm2
+ vpshufd $255,%xmm3,%xmm3
+ vpshufd $0,%xmm7,%xmm4
+ vpshufd $85,%xmm7,%xmm5
+ vpshufd $170,%xmm7,%xmm6
+ vpshufd $255,%xmm7,%xmm7
+ vmovdqa %xmm0,(%ebp) /* template rows 0..48: second 16 bytes of arg 4 */
+ vmovdqa %xmm1,16(%ebp)
+ vmovdqa %xmm2,32(%ebp)
+ vmovdqa %xmm3,48(%ebp)
+ vmovdqa %xmm4,-128(%ebp) /* template rows -128..-80: the table constant */
+ vmovdqa %xmm5,-112(%ebp)
+ vmovdqa %xmm6,-96(%ebp)
+ vmovdqa %xmm7,-80(%ebp)
+ leal 128(%esi),%esi /* bias both data pointers by +128 so -128..+64 displacements can be used */
+ leal 128(%edi),%edi
+ jmp .L015outer_loop
+.align 32
+.L015outer_loop: /* one pass = 256 bytes: copy the template into the working area, run .L016loop, emit */
+ vmovdqa -112(%ebp),%xmm1
+ vmovdqa -96(%ebp),%xmm2
+ vmovdqa -80(%ebp),%xmm3
+ vmovdqa -48(%ebp),%xmm5
+ vmovdqa -32(%ebp),%xmm6
+ vmovdqa -16(%ebp),%xmm7
+ vmovdqa %xmm1,-112(%ebx)
+ vmovdqa %xmm2,-96(%ebx)
+ vmovdqa %xmm3,-80(%ebx)
+ vmovdqa %xmm5,-48(%ebx)
+ vmovdqa %xmm6,-32(%ebx)
+ vmovdqa %xmm7,-16(%ebx)
+ vmovdqa 32(%ebp),%xmm2
+ vmovdqa 48(%ebp),%xmm3
+ vmovdqa 64(%ebp),%xmm4
+ vmovdqa 80(%ebp),%xmm5
+ vmovdqa 96(%ebp),%xmm6
+ vmovdqa 112(%ebp),%xmm7
+ vpaddd 64(%eax),%xmm4,%xmm4 /* step row 64 by the table entry that the setup pre-subtracted */
+ vmovdqa %xmm2,32(%ebx)
+ vmovdqa %xmm3,48(%ebx)
+ vmovdqa %xmm4,64(%ebx)
+ vmovdqa %xmm5,80(%ebx)
+ vmovdqa %xmm6,96(%ebx)
+ vmovdqa %xmm7,112(%ebx)
+ vmovdqa %xmm4,64(%ebp) /* write the stepped row back into the template for the next pass */
+ vmovdqa -128(%ebp),%xmm0 /* rows -128, -64, 0, 16 and the stepped row 64 stay in registers on loop entry */
+ vmovdqa %xmm4,%xmm6
+ vmovdqa -64(%ebp),%xmm3
+ vmovdqa (%ebp),%xmm4
+ vmovdqa 16(%ebp),%xmm5
+ movl $10,%edx /* .L016loop runs 10 times */
+ nop
+.align 32
+.L016loop: /* add / xor / rotate by 16, 12, 8, 7 on rows of the working area, with loads and stores interleaved */
+ vpaddd %xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,246,16 /* XOP vprotd $16,%xmm6,%xmm6 */
+ vpaddd %xmm6,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm2 /* three-operand form: result moves from %xmm3 to %xmm2 */
+ vmovdqa -112(%ebx),%xmm1
+.byte 143,232,120,194,210,12 /* XOP vprotd $12,%xmm2,%xmm2 */
+ vmovdqa -48(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 80(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8 /* XOP vprotd $8,%xmm6,%xmm6 */
+ vmovdqa %xmm0,-128(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,64(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7 /* XOP vprotd $7,%xmm2,%xmm2 */
+ vmovdqa %xmm4,(%ebx)
+.byte 143,232,120,194,255,16 /* XOP vprotd $16,%xmm7,%xmm7 */
+ vmovdqa %xmm2,-64(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa 32(%ebx),%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -96(%ebx),%xmm0
+.byte 143,232,120,194,219,12 /* XOP vprotd $12,%xmm3,%xmm3 */
+ vmovdqa -32(%ebx),%xmm2
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 96(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8 /* XOP vprotd $8,%xmm7,%xmm7 */
+ vmovdqa %xmm1,-112(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,80(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,219,7 /* XOP vprotd $7,%xmm3,%xmm3 */
+ vmovdqa %xmm5,16(%ebx)
+.byte 143,232,120,194,246,16 /* XOP vprotd $16,%xmm6,%xmm6 */
+ vmovdqa %xmm3,-48(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa 48(%ebx),%xmm5
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -80(%ebx),%xmm1
+.byte 143,232,120,194,210,12 /* XOP vprotd $12,%xmm2,%xmm2 */
+ vmovdqa -16(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 112(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8 /* XOP vprotd $8,%xmm6,%xmm6 */
+ vmovdqa %xmm0,-96(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,96(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7 /* XOP vprotd $7,%xmm2,%xmm2 */
+.byte 143,232,120,194,255,16 /* XOP vprotd $16,%xmm7,%xmm7 */
+ vmovdqa %xmm2,-32(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -128(%ebx),%xmm0
+.byte 143,232,120,194,219,12 /* XOP vprotd $12,%xmm3,%xmm3 */
+ vmovdqa -48(%ebx),%xmm2 /* from here the second operand rows are offset by one: -128 pairs with -48, -112 with -32, and so on */
+ vpaddd %xmm3,%xmm1,%xmm1
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8 /* XOP vprotd $8,%xmm7,%xmm7 */
+ vmovdqa %xmm1,-80(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm7,%xmm6 /* three-operand form: %xmm7 is consumed directly and the result lands in %xmm6 */
+.byte 143,232,120,194,219,7 /* XOP vprotd $7,%xmm3,%xmm3 */
+.byte 143,232,120,194,246,16 /* XOP vprotd $16,%xmm6,%xmm6 */
+ vmovdqa %xmm3,-16(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -112(%ebx),%xmm1
+.byte 143,232,120,194,210,12 /* XOP vprotd $12,%xmm2,%xmm2 */
+ vmovdqa -32(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 64(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8 /* XOP vprotd $8,%xmm6,%xmm6 */
+ vmovdqa %xmm0,-128(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,112(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7 /* XOP vprotd $7,%xmm2,%xmm2 */
+ vmovdqa %xmm4,32(%ebx)
+.byte 143,232,120,194,255,16 /* XOP vprotd $16,%xmm7,%xmm7 */
+ vmovdqa %xmm2,-48(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa (%ebx),%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -96(%ebx),%xmm0
+.byte 143,232,120,194,219,12 /* XOP vprotd $12,%xmm3,%xmm3 */
+ vmovdqa -16(%ebx),%xmm2
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 80(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8 /* XOP vprotd $8,%xmm7,%xmm7 */
+ vmovdqa %xmm1,-112(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,64(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,219,7 /* XOP vprotd $7,%xmm3,%xmm3 */
+ vmovdqa %xmm5,48(%ebx)
+.byte 143,232,120,194,246,16 /* XOP vprotd $16,%xmm6,%xmm6 */
+ vmovdqa %xmm3,-32(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa 16(%ebx),%xmm5
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -80(%ebx),%xmm1
+.byte 143,232,120,194,210,12 /* XOP vprotd $12,%xmm2,%xmm2 */
+ vmovdqa -64(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 96(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8 /* XOP vprotd $8,%xmm6,%xmm6 */
+ vmovdqa %xmm0,-96(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,80(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7 /* XOP vprotd $7,%xmm2,%xmm2 */
+.byte 143,232,120,194,255,16 /* XOP vprotd $16,%xmm7,%xmm7 */
+ vmovdqa %xmm2,-16(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -128(%ebx),%xmm0 /* reload the rows that must be live in registers at the top of the loop */
+.byte 143,232,120,194,219,12 /* XOP vprotd $12,%xmm3,%xmm3 */
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 64(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,255,8 /* XOP vprotd $8,%xmm7,%xmm7 */
+ vmovdqa %xmm1,-80(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,96(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+.byte 143,232,120,194,219,7 /* XOP vprotd $7,%xmm3,%xmm3 */
+ decl %edx
+ jnz .L016loop
+ vmovdqa %xmm3,-64(%ebx) /* flush the rows that were still only in registers */
+ vmovdqa %xmm4,(%ebx)
+ vmovdqa %xmm5,16(%ebx)
+ vmovdqa %xmm6,64(%ebx)
+ vmovdqa %xmm7,96(%ebx)
+ vmovdqa -112(%ebx),%xmm1
+ vmovdqa -96(%ebx),%xmm2
+ vmovdqa -80(%ebx),%xmm3
+ vpaddd -128(%ebp),%xmm0,%xmm0 /* add the template rows back onto the worked rows (rows -128..-80) */
+ vpaddd -112(%ebp),%xmm1,%xmm1
+ vpaddd -96(%ebp),%xmm2,%xmm2
+ vpaddd -80(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6 /* 4x4 dword transpose of the four rows into %xmm1, %xmm6, %xmm7, %xmm3 */
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4 /* XOR with input at four positions 64 bytes apart */
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi /* step to the next 16-byte column of each 64-byte block */
+ vmovdqa -64(%ebx),%xmm0
+ vmovdqa -48(%ebx),%xmm1
+ vmovdqa -32(%ebx),%xmm2
+ vmovdqa -16(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd -64(%ebp),%xmm0,%xmm0 /* same add, transpose, XOR and store for rows -64..-16 */
+ vpaddd -48(%ebp),%xmm1,%xmm1
+ vpaddd -32(%ebp),%xmm2,%xmm2
+ vpaddd -16(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi
+ vmovdqa (%ebx),%xmm0
+ vmovdqa 16(%ebx),%xmm1
+ vmovdqa 32(%ebx),%xmm2
+ vmovdqa 48(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd (%ebp),%xmm0,%xmm0 /* rows 0..48 */
+ vpaddd 16(%ebp),%xmm1,%xmm1
+ vpaddd 32(%ebp),%xmm2,%xmm2
+ vpaddd 48(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi
+ vmovdqa 64(%ebx),%xmm0
+ vmovdqa 80(%ebx),%xmm1
+ vmovdqa 96(%ebx),%xmm2
+ vmovdqa 112(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd 64(%ebp),%xmm0,%xmm0 /* rows 64..112 */
+ vpaddd 80(%ebp),%xmm1,%xmm1
+ vpaddd 96(%ebp),%xmm2,%xmm2
+ vpaddd 112(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 208(%esi),%esi /* 16+16+16+208 = 256: net advance of one full pass */
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 208(%edi),%edi
+ subl $256,%ecx
+ jnc .L015outer_loop /* no borrow: at least another 256 bytes remain */
+ addl $256,%ecx /* undo the bias; %ecx = bytes left (0..255) */
+ jz .L017done
+ movl 520(%esp),%ebx /* reload arg 5 saved in the prologue */
+ leal -128(%esi),%esi /* remove the +128 pointer bias */
+ movl 516(%esp),%edx /* reload arg 4 saved in the prologue */
+ leal -128(%edi),%edi
+ vmovd 64(%ebp),%xmm2 /* low dword of template row 64 as left by the last wide pass */
+ vmovdqu (%ebx),%xmm3 /* original 16-byte block from arg 5 */
+ vpaddd 96(%eax),%xmm2,%xmm2 /* adjust by the table entry at offset 96 (value not visible here) */
+ vpand 112(%eax),%xmm3,%xmm3 /* mask from table offset 112 (presumably clears the dword being replaced; confirm) */
+ vpor %xmm2,%xmm3,%xmm3 /* merge the adjusted dword into the block for the 1x path */
+.L0141x: /* 1x path: one 64-byte block per pass, state rows in %xmm0-%xmm3 */
+ vmovdqa 32(%eax),%xmm0 /* row 0: 16-byte table constant */
+ vmovdqu (%edx),%xmm1 /* rows 1 and 2: 32 bytes from arg 4 */
+ vmovdqu 16(%edx),%xmm2
+ vmovdqa (%eax),%xmm6 /* %xmm6 and %xmm7 are not referenced again in the 1x code below */
+ vmovdqa 16(%eax),%xmm7
+ movl %ebp,48(%esp) /* this dword is overwritten by the 16-byte store to 48(%esp) three instructions below */
+ vmovdqa %xmm0,(%esp) /* keep the input state at 0..63(%esp) for the final add */
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ movl $10,%edx /* .L018loop1x runs 10 times */
+ jmp .L018loop1x
+.align 16
+.L019outer1x: /* next 64-byte block: step row 3 and reload rows 0..2 */
+ vmovdqa 80(%eax),%xmm3 /* table entry at offset 80 (presumably the per-block counter increment; confirm) */
+ vmovdqa (%esp),%xmm0
+ vmovdqa 16(%esp),%xmm1
+ vmovdqa 32(%esp),%xmm2
+ vpaddd 48(%esp),%xmm3,%xmm3
+ movl $10,%edx
+ vmovdqa %xmm3,48(%esp) /* stepped row 3 becomes the saved input state */
+ jmp .L018loop1x
+.align 16
+.L018loop1x: /* add / xor / rotate sequence twice per pass, with lane rotations between the halves */
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,16 /* XOP vprotd $16,%xmm3,%xmm3 */
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,12 /* XOP vprotd $12,%xmm1,%xmm1 */
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,8 /* XOP vprotd $8,%xmm3,%xmm3 */
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,7 /* XOP vprotd $7,%xmm1,%xmm1 */
+ vpshufd $78,%xmm2,%xmm2 /* rotate lanes by two */
+ vpshufd $57,%xmm1,%xmm1 /* rotate lanes by one (lane i takes lane i+1) */
+ vpshufd $147,%xmm3,%xmm3 /* rotate lanes by three (lane i takes lane i-1) */
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,16 /* XOP vprotd $16,%xmm3,%xmm3 */
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,12 /* XOP vprotd $12,%xmm1,%xmm1 */
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,8 /* XOP vprotd $8,%xmm3,%xmm3 */
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,7 /* XOP vprotd $7,%xmm1,%xmm1 */
+ vpshufd $78,%xmm2,%xmm2 /* inverse lane rotations: restore the original lane order */
+ vpshufd $147,%xmm1,%xmm1
+ vpshufd $57,%xmm3,%xmm3
+ decl %edx
+ jnz .L018loop1x
+ vpaddd (%esp),%xmm0,%xmm0 /* add the saved input state */
+ vpaddd 16(%esp),%xmm1,%xmm1
+ vpaddd 32(%esp),%xmm2,%xmm2
+ vpaddd 48(%esp),%xmm3,%xmm3
+ cmpl $64,%ecx
+ jb .L020tail /* fewer than 64 bytes left: byte-wise tail */
+ vpxor (%esi),%xmm0,%xmm0 /* full block: XOR 64 input bytes and store */
+ vpxor 16(%esi),%xmm1,%xmm1
+ vpxor 32(%esi),%xmm2,%xmm2
+ vpxor 48(%esi),%xmm3,%xmm3
+ leal 64(%esi),%esi
+ vmovdqu %xmm0,(%edi)
+ vmovdqu %xmm1,16(%edi)
+ vmovdqu %xmm2,32(%edi)
+ vmovdqu %xmm3,48(%edi)
+ leal 64(%edi),%edi
+ subl $64,%ecx
+ jnz .L019outer1x
+ jmp .L017done
+.L020tail: /* NOTE(review): the loop below decrements before testing, so it assumes %ecx != 0 here; confirm callers never pass a zero length */
+ vmovdqa %xmm0,(%esp) /* spill the 64-byte block so it can be read one byte at a time */
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ xorl %eax,%eax
+ xorl %edx,%edx
+ xorl %ebp,%ebp /* %ebp = byte index */
+.L021tail_loop:
+ movb (%esp,%ebp,1),%al
+ movb (%esi,%ebp,1),%dl
+ leal 1(%ebp),%ebp /* lea keeps the flags untouched while bumping the index */
+ xorb %dl,%al
+ movb %al,-1(%edi,%ebp,1) /* -1 compensates for the index having been incremented already */
+ decl %ecx
+ jnz .L021tail_loop
+.L017done:
+ vzeroupper
+ movl 512(%esp),%esp /* restore the stack pointer saved in the prologue */
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size ChaCha20_xop,.-.L_ChaCha20_xop_begin
.comm OPENSSL_ia32cap_P,16,4
#else
.text
@@ -914,6 +1394,8 @@ ChaCha20_ssse3:
pushl %esi
pushl %edi
.Lssse3_shortcut:
+ testl $2048,4(%ebp)
+ jnz .Lxop_shortcut
movl 20(%esp),%edi
movl 24(%esp),%esi
movl 28(%esp),%ecx
@@ -1057,5 +1539,483 @@ ChaCha20_ssse3:
.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte 114,103,62,0
+.globl ChaCha20_xop
+.type ChaCha20_xop,@function
+.align 16
+ChaCha20_xop: /* XOP variant: 256 bytes per pass in the wide path, 64 bytes per pass in the 1x path, byte loop for the remainder */
+.L_ChaCha20_xop_begin:
+ pushl %ebp /* save the four registers restored by the popl sequence at .L017done */
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+.Lxop_shortcut: /* also reached by a jump from .Lssse3_shortcut, with the same four registers already pushed */
+ movl 20(%esp),%edi /* arg 1 (16 bytes of pushes + return address): pointer that all output stores go through */
+ movl 24(%esp),%esi /* arg 2: pointer to the bytes that are XORed with the generated block */
+ movl 28(%esp),%ecx /* arg 3: byte count */
+ movl 32(%esp),%edx /* arg 4: pointer to 32 bytes of state input (presumably the key; confirm) */
+ movl 36(%esp),%ebx /* arg 5: pointer to 16 bytes of state input (presumably counter and nonce; confirm) */
+ vzeroupper
+ movl %esp,%ebp /* remember the incoming stack pointer */
+ subl $524,%esp /* scratch frame */
+ andl $-64,%esp /* align the frame to 64 bytes */
+ movl %ebp,512(%esp) /* saved stack pointer, reloaded at .L017done */
+ leal .Lssse3_data-.Lpic_point(%eax),%eax /* %eax is not set in this function; presumably the jump source left the address of .Lpic_point in it (TODO confirm) */
+ vmovdqu (%ebx),%xmm3 /* 16-byte block from arg 5 */
+ cmpl $256,%ecx
+ jb .L0141x /* fewer than 256 bytes: go straight to the 1x path */
+ movl %edx,516(%esp) /* keep args 4 and 5 for the 1x path that may follow the wide loop */
+ movl %ebx,520(%esp)
+ subl $256,%ecx /* bias the count so the wide loop can test the borrow */
+ leal 384(%esp),%ebp /* %ebp = centre of the read-only template area (-128..+127) */
+ vmovdqu (%edx),%xmm7 /* first 16 bytes from arg 4 */
+ vpshufd $0,%xmm3,%xmm0 /* broadcast dword 0 to all four lanes */
+ vpshufd $85,%xmm3,%xmm1 /* broadcast dword 1 */
+ vpshufd $170,%xmm3,%xmm2 /* broadcast dword 2 */
+ vpshufd $255,%xmm3,%xmm3 /* broadcast dword 3 */
+ vpaddd 48(%eax),%xmm0,%xmm0 /* add the per-lane constants at table offset 48 (presumably lane offsets 0..3; confirm) */
+ vpshufd $0,%xmm7,%xmm4
+ vpshufd $85,%xmm7,%xmm5
+ vpsubd 64(%eax),%xmm0,%xmm0 /* pre-subtract the table entry that .L015outer_loop adds back on every pass */
+ vpshufd $170,%xmm7,%xmm6
+ vpshufd $255,%xmm7,%xmm7
+ vmovdqa %xmm0,64(%ebp) /* template rows 64..112: the four broadcast dwords of the arg 5 block */
+ vmovdqa %xmm1,80(%ebp)
+ vmovdqa %xmm2,96(%ebp)
+ vmovdqa %xmm3,112(%ebp)
+ vmovdqu 16(%edx),%xmm3 /* second 16 bytes from arg 4 */
+ vmovdqa %xmm4,-64(%ebp) /* template rows -64..-16: first 16 bytes of arg 4, one dword per row */
+ vmovdqa %xmm5,-48(%ebp)
+ vmovdqa %xmm6,-32(%ebp)
+ vmovdqa %xmm7,-16(%ebp)
+ vmovdqa 32(%eax),%xmm7 /* 16-byte constant at table offset 32 */
+ leal 128(%esp),%ebx /* %ebx = centre of the working copy updated by .L016loop */
+ vpshufd $0,%xmm3,%xmm0
+ vpshufd $85,%xmm3,%xmm1
+ vpshufd $170,%xmm3,%xmm2
+ vpshufd $255,%xmm3,%xmm3
+ vpshufd $0,%xmm7,%xmm4
+ vpshufd $85,%xmm7,%xmm5
+ vpshufd $170,%xmm7,%xmm6
+ vpshufd $255,%xmm7,%xmm7
+ vmovdqa %xmm0,(%ebp) /* template rows 0..48: second 16 bytes of arg 4 */
+ vmovdqa %xmm1,16(%ebp)
+ vmovdqa %xmm2,32(%ebp)
+ vmovdqa %xmm3,48(%ebp)
+ vmovdqa %xmm4,-128(%ebp) /* template rows -128..-80: the table constant */
+ vmovdqa %xmm5,-112(%ebp)
+ vmovdqa %xmm6,-96(%ebp)
+ vmovdqa %xmm7,-80(%ebp)
+ leal 128(%esi),%esi /* bias both data pointers by +128 so -128..+64 displacements can be used */
+ leal 128(%edi),%edi
+ jmp .L015outer_loop
+.align 32
+.L015outer_loop: /* one pass = 256 bytes: copy the template into the working area, run .L016loop, emit */
+ vmovdqa -112(%ebp),%xmm1
+ vmovdqa -96(%ebp),%xmm2
+ vmovdqa -80(%ebp),%xmm3
+ vmovdqa -48(%ebp),%xmm5
+ vmovdqa -32(%ebp),%xmm6
+ vmovdqa -16(%ebp),%xmm7
+ vmovdqa %xmm1,-112(%ebx)
+ vmovdqa %xmm2,-96(%ebx)
+ vmovdqa %xmm3,-80(%ebx)
+ vmovdqa %xmm5,-48(%ebx)
+ vmovdqa %xmm6,-32(%ebx)
+ vmovdqa %xmm7,-16(%ebx)
+ vmovdqa 32(%ebp),%xmm2
+ vmovdqa 48(%ebp),%xmm3
+ vmovdqa 64(%ebp),%xmm4
+ vmovdqa 80(%ebp),%xmm5
+ vmovdqa 96(%ebp),%xmm6
+ vmovdqa 112(%ebp),%xmm7
+ vpaddd 64(%eax),%xmm4,%xmm4 /* step row 64 by the table entry that the setup pre-subtracted */
+ vmovdqa %xmm2,32(%ebx)
+ vmovdqa %xmm3,48(%ebx)
+ vmovdqa %xmm4,64(%ebx)
+ vmovdqa %xmm5,80(%ebx)
+ vmovdqa %xmm6,96(%ebx)
+ vmovdqa %xmm7,112(%ebx)
+ vmovdqa %xmm4,64(%ebp) /* write the stepped row back into the template for the next pass */
+ vmovdqa -128(%ebp),%xmm0 /* rows -128, -64, 0, 16 and the stepped row 64 stay in registers on loop entry */
+ vmovdqa %xmm4,%xmm6
+ vmovdqa -64(%ebp),%xmm3
+ vmovdqa (%ebp),%xmm4
+ vmovdqa 16(%ebp),%xmm5
+ movl $10,%edx /* .L016loop runs 10 times */
+ nop
+.align 32
+.L016loop: /* add / xor / rotate by 16, 12, 8, 7 on rows of the working area, with loads and stores interleaved */
+ vpaddd %xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,246,16 /* XOP vprotd $16,%xmm6,%xmm6 */
+ vpaddd %xmm6,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm2 /* three-operand form: result moves from %xmm3 to %xmm2 */
+ vmovdqa -112(%ebx),%xmm1
+.byte 143,232,120,194,210,12 /* XOP vprotd $12,%xmm2,%xmm2 */
+ vmovdqa -48(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 80(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8 /* XOP vprotd $8,%xmm6,%xmm6 */
+ vmovdqa %xmm0,-128(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,64(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7 /* XOP vprotd $7,%xmm2,%xmm2 */
+ vmovdqa %xmm4,(%ebx)
+.byte 143,232,120,194,255,16 /* XOP vprotd $16,%xmm7,%xmm7 */
+ vmovdqa %xmm2,-64(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa 32(%ebx),%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -96(%ebx),%xmm0
+.byte 143,232,120,194,219,12 /* XOP vprotd $12,%xmm3,%xmm3 */
+ vmovdqa -32(%ebx),%xmm2
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 96(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8 /* XOP vprotd $8,%xmm7,%xmm7 */
+ vmovdqa %xmm1,-112(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,80(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,219,7 /* XOP vprotd $7,%xmm3,%xmm3 */
+ vmovdqa %xmm5,16(%ebx)
+.byte 143,232,120,194,246,16 /* XOP vprotd $16,%xmm6,%xmm6 */
+ vmovdqa %xmm3,-48(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa 48(%ebx),%xmm5
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -80(%ebx),%xmm1
+.byte 143,232,120,194,210,12 /* XOP vprotd $12,%xmm2,%xmm2 */
+ vmovdqa -16(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 112(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8 /* XOP vprotd $8,%xmm6,%xmm6 */
+ vmovdqa %xmm0,-96(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,96(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7 /* XOP vprotd $7,%xmm2,%xmm2 */
+.byte 143,232,120,194,255,16 /* XOP vprotd $16,%xmm7,%xmm7 */
+ vmovdqa %xmm2,-32(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -128(%ebx),%xmm0
+.byte 143,232,120,194,219,12 /* XOP vprotd $12,%xmm3,%xmm3 */
+ vmovdqa -48(%ebx),%xmm2 /* from here the second operand rows are offset by one: -128 pairs with -48, -112 with -32, and so on */
+ vpaddd %xmm3,%xmm1,%xmm1
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8 /* XOP vprotd $8,%xmm7,%xmm7 */
+ vmovdqa %xmm1,-80(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm7,%xmm6 /* three-operand form: %xmm7 is consumed directly and the result lands in %xmm6 */
+.byte 143,232,120,194,219,7 /* XOP vprotd $7,%xmm3,%xmm3 */
+.byte 143,232,120,194,246,16 /* XOP vprotd $16,%xmm6,%xmm6 */
+ vmovdqa %xmm3,-16(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -112(%ebx),%xmm1
+.byte 143,232,120,194,210,12 /* XOP vprotd $12,%xmm2,%xmm2 */
+ vmovdqa -32(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 64(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8 /* XOP vprotd $8,%xmm6,%xmm6 */
+ vmovdqa %xmm0,-128(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,112(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7 /* XOP vprotd $7,%xmm2,%xmm2 */
+ vmovdqa %xmm4,32(%ebx)
+.byte 143,232,120,194,255,16 /* XOP vprotd $16,%xmm7,%xmm7 */
+ vmovdqa %xmm2,-48(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa (%ebx),%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -96(%ebx),%xmm0
+.byte 143,232,120,194,219,12 /* XOP vprotd $12,%xmm3,%xmm3 */
+ vmovdqa -16(%ebx),%xmm2
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 80(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+ vpaddd %xmm2,%xmm0,%xmm0
+.byte 143,232,120,194,255,8 /* XOP vprotd $8,%xmm7,%xmm7 */
+ vmovdqa %xmm1,-112(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,64(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+ vpxor %xmm0,%xmm6,%xmm6
+.byte 143,232,120,194,219,7 /* XOP vprotd $7,%xmm3,%xmm3 */
+ vmovdqa %xmm5,48(%ebx)
+.byte 143,232,120,194,246,16 /* XOP vprotd $16,%xmm6,%xmm6 */
+ vmovdqa %xmm3,-32(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa 16(%ebx),%xmm5
+ vpxor %xmm4,%xmm2,%xmm2
+ vmovdqa -80(%ebx),%xmm1
+.byte 143,232,120,194,210,12 /* XOP vprotd $12,%xmm2,%xmm2 */
+ vmovdqa -64(%ebx),%xmm3
+ vpaddd %xmm2,%xmm0,%xmm0
+ vmovdqa 96(%ebx),%xmm7
+ vpxor %xmm0,%xmm6,%xmm6
+ vpaddd %xmm3,%xmm1,%xmm1
+.byte 143,232,120,194,246,8 /* XOP vprotd $8,%xmm6,%xmm6 */
+ vmovdqa %xmm0,-96(%ebx)
+ vpaddd %xmm6,%xmm4,%xmm4
+ vmovdqa %xmm6,80(%ebx)
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,210,7 /* XOP vprotd $7,%xmm2,%xmm2 */
+.byte 143,232,120,194,255,16 /* XOP vprotd $16,%xmm7,%xmm7 */
+ vmovdqa %xmm2,-16(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqa -128(%ebx),%xmm0 /* reload the rows that must be live in registers at the top of the loop */
+.byte 143,232,120,194,219,12 /* XOP vprotd $12,%xmm3,%xmm3 */
+ vpaddd %xmm3,%xmm1,%xmm1
+ vmovdqa 64(%ebx),%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+.byte 143,232,120,194,255,8 /* XOP vprotd $8,%xmm7,%xmm7 */
+ vmovdqa %xmm1,-80(%ebx)
+ vpaddd %xmm7,%xmm5,%xmm5
+ vmovdqa %xmm7,96(%ebx)
+ vpxor %xmm5,%xmm3,%xmm3
+.byte 143,232,120,194,219,7 /* XOP vprotd $7,%xmm3,%xmm3 */
+ decl %edx
+ jnz .L016loop
+ vmovdqa %xmm3,-64(%ebx) /* flush the rows that were still only in registers */
+ vmovdqa %xmm4,(%ebx)
+ vmovdqa %xmm5,16(%ebx)
+ vmovdqa %xmm6,64(%ebx)
+ vmovdqa %xmm7,96(%ebx)
+ vmovdqa -112(%ebx),%xmm1
+ vmovdqa -96(%ebx),%xmm2
+ vmovdqa -80(%ebx),%xmm3
+ vpaddd -128(%ebp),%xmm0,%xmm0 /* add the template rows back onto the worked rows (rows -128..-80) */
+ vpaddd -112(%ebp),%xmm1,%xmm1
+ vpaddd -96(%ebp),%xmm2,%xmm2
+ vpaddd -80(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6 /* 4x4 dword transpose of the four rows into %xmm1, %xmm6, %xmm7, %xmm3 */
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4 /* XOR with input at four positions 64 bytes apart */
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi /* step to the next 16-byte column of each 64-byte block */
+ vmovdqa -64(%ebx),%xmm0
+ vmovdqa -48(%ebx),%xmm1
+ vmovdqa -32(%ebx),%xmm2
+ vmovdqa -16(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd -64(%ebp),%xmm0,%xmm0 /* same add, transpose, XOR and store for rows -64..-16 */
+ vpaddd -48(%ebp),%xmm1,%xmm1
+ vpaddd -32(%ebp),%xmm2,%xmm2
+ vpaddd -16(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi
+ vmovdqa (%ebx),%xmm0
+ vmovdqa 16(%ebx),%xmm1
+ vmovdqa 32(%ebx),%xmm2
+ vmovdqa 48(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd (%ebp),%xmm0,%xmm0 /* rows 0..48 */
+ vpaddd 16(%ebp),%xmm1,%xmm1
+ vpaddd 32(%ebp),%xmm2,%xmm2
+ vpaddd 48(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 16(%esi),%esi
+ vmovdqa 64(%ebx),%xmm0
+ vmovdqa 80(%ebx),%xmm1
+ vmovdqa 96(%ebx),%xmm2
+ vmovdqa 112(%ebx),%xmm3
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ vpaddd 64(%ebp),%xmm0,%xmm0 /* rows 64..112 */
+ vpaddd 80(%ebp),%xmm1,%xmm1
+ vpaddd 96(%ebp),%xmm2,%xmm2
+ vpaddd 112(%ebp),%xmm3,%xmm3
+ vpunpckldq %xmm1,%xmm0,%xmm6
+ vpunpckldq %xmm3,%xmm2,%xmm7
+ vpunpckhdq %xmm1,%xmm0,%xmm0
+ vpunpckhdq %xmm3,%xmm2,%xmm2
+ vpunpcklqdq %xmm7,%xmm6,%xmm1
+ vpunpckhqdq %xmm7,%xmm6,%xmm6
+ vpunpcklqdq %xmm2,%xmm0,%xmm7
+ vpunpckhqdq %xmm2,%xmm0,%xmm3
+ vpxor -128(%esi),%xmm1,%xmm4
+ vpxor -64(%esi),%xmm6,%xmm5
+ vpxor (%esi),%xmm7,%xmm6
+ vpxor 64(%esi),%xmm3,%xmm7
+ leal 208(%esi),%esi /* 16+16+16+208 = 256: net advance of one full pass */
+ vmovdqu %xmm4,-128(%edi)
+ vmovdqu %xmm5,-64(%edi)
+ vmovdqu %xmm6,(%edi)
+ vmovdqu %xmm7,64(%edi)
+ leal 208(%edi),%edi
+ subl $256,%ecx
+ jnc .L015outer_loop /* no borrow: at least another 256 bytes remain */
+ addl $256,%ecx /* undo the bias; %ecx = bytes left (0..255) */
+ jz .L017done
+ movl 520(%esp),%ebx /* reload arg 5 saved in the prologue */
+ leal -128(%esi),%esi /* remove the +128 pointer bias */
+ movl 516(%esp),%edx /* reload arg 4 saved in the prologue */
+ leal -128(%edi),%edi
+ vmovd 64(%ebp),%xmm2 /* low dword of template row 64 as left by the last wide pass */
+ vmovdqu (%ebx),%xmm3 /* original 16-byte block from arg 5 */
+ vpaddd 96(%eax),%xmm2,%xmm2 /* adjust by the table entry at offset 96 (value not visible here) */
+ vpand 112(%eax),%xmm3,%xmm3 /* mask from table offset 112 (presumably clears the dword being replaced; confirm) */
+ vpor %xmm2,%xmm3,%xmm3 /* merge the adjusted dword into the block for the 1x path */
+.L0141x: /* 1x path: one 64-byte block per pass, state rows in %xmm0-%xmm3 */
+ vmovdqa 32(%eax),%xmm0 /* row 0: 16-byte table constant */
+ vmovdqu (%edx),%xmm1 /* rows 1 and 2: 32 bytes from arg 4 */
+ vmovdqu 16(%edx),%xmm2
+ vmovdqa (%eax),%xmm6 /* %xmm6 and %xmm7 are not referenced again in the 1x code below */
+ vmovdqa 16(%eax),%xmm7
+ movl %ebp,48(%esp) /* this dword is overwritten by the 16-byte store to 48(%esp) three instructions below */
+ vmovdqa %xmm0,(%esp) /* keep the input state at 0..63(%esp) for the final add */
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ movl $10,%edx /* .L018loop1x runs 10 times */
+ jmp .L018loop1x
+.align 16
+.L019outer1x: /* next 64-byte block: step row 3 and reload rows 0..2 */
+ vmovdqa 80(%eax),%xmm3 /* table entry at offset 80 (presumably the per-block counter increment; confirm) */
+ vmovdqa (%esp),%xmm0
+ vmovdqa 16(%esp),%xmm1
+ vmovdqa 32(%esp),%xmm2
+ vpaddd 48(%esp),%xmm3,%xmm3
+ movl $10,%edx
+ vmovdqa %xmm3,48(%esp) /* stepped row 3 becomes the saved input state */
+ jmp .L018loop1x
+.align 16
+.L018loop1x: /* add / xor / rotate sequence twice per pass, with lane rotations between the halves */
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,16 /* XOP vprotd $16,%xmm3,%xmm3 */
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,12 /* XOP vprotd $12,%xmm1,%xmm1 */
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,8 /* XOP vprotd $8,%xmm3,%xmm3 */
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,7 /* XOP vprotd $7,%xmm1,%xmm1 */
+ vpshufd $78,%xmm2,%xmm2 /* rotate lanes by two */
+ vpshufd $57,%xmm1,%xmm1 /* rotate lanes by one (lane i takes lane i+1) */
+ vpshufd $147,%xmm3,%xmm3 /* rotate lanes by three (lane i takes lane i-1) */
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,16 /* XOP vprotd $16,%xmm3,%xmm3 */
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,12 /* XOP vprotd $12,%xmm1,%xmm1 */
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+.byte 143,232,120,194,219,8 /* XOP vprotd $8,%xmm3,%xmm3 */
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+.byte 143,232,120,194,201,7 /* XOP vprotd $7,%xmm1,%xmm1 */
+ vpshufd $78,%xmm2,%xmm2 /* inverse lane rotations: restore the original lane order */
+ vpshufd $147,%xmm1,%xmm1
+ vpshufd $57,%xmm3,%xmm3
+ decl %edx
+ jnz .L018loop1x
+ vpaddd (%esp),%xmm0,%xmm0 /* add the saved input state */
+ vpaddd 16(%esp),%xmm1,%xmm1
+ vpaddd 32(%esp),%xmm2,%xmm2
+ vpaddd 48(%esp),%xmm3,%xmm3
+ cmpl $64,%ecx
+ jb .L020tail /* fewer than 64 bytes left: byte-wise tail */
+ vpxor (%esi),%xmm0,%xmm0 /* full block: XOR 64 input bytes and store */
+ vpxor 16(%esi),%xmm1,%xmm1
+ vpxor 32(%esi),%xmm2,%xmm2
+ vpxor 48(%esi),%xmm3,%xmm3
+ leal 64(%esi),%esi
+ vmovdqu %xmm0,(%edi)
+ vmovdqu %xmm1,16(%edi)
+ vmovdqu %xmm2,32(%edi)
+ vmovdqu %xmm3,48(%edi)
+ leal 64(%edi),%edi
+ subl $64,%ecx
+ jnz .L019outer1x
+ jmp .L017done
+.L020tail: /* NOTE(review): the loop below decrements before testing, so it assumes %ecx != 0 here; confirm callers never pass a zero length */
+ vmovdqa %xmm0,(%esp) /* spill the 64-byte block so it can be read one byte at a time */
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ xorl %eax,%eax
+ xorl %edx,%edx
+ xorl %ebp,%ebp /* %ebp = byte index */
+.L021tail_loop:
+ movb (%esp,%ebp,1),%al
+ movb (%esi,%ebp,1),%dl
+ leal 1(%ebp),%ebp /* lea keeps the flags untouched while bumping the index */
+ xorb %dl,%al
+ movb %al,-1(%edi,%ebp,1) /* -1 compensates for the index having been incremented already */
+ decl %ecx
+ jnz .L021tail_loop
+.L017done:
+ vzeroupper
+ movl 512(%esp),%esp /* restore the stack pointer saved in the prologue */
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size ChaCha20_xop,.-.L_ChaCha20_xop_begin
.comm OPENSSL_ia32cap_P,16,4
#endif
diff --git a/secure/lib/libcrypto/i386/poly1305-x86.S b/secure/lib/libcrypto/i386/poly1305-x86.S
index b394500278d5..100deee40bf2 100644
--- a/secure/lib/libcrypto/i386/poly1305-x86.S
+++ b/secure/lib/libcrypto/i386/poly1305-x86.S
@@ -36,6 +36,10 @@ poly1305_init:
jne .L002no_sse2
leal _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
leal _poly1305_emit_sse2-.L001pic_point(%ebx),%edx
+ movl 8(%edi),%ecx
+ testl $32,%ecx
+ jz .L002no_sse2
+ leal _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
.L002no_sse2:
movl 20(%esp),%edi
movl %eax,(%ebp)
@@ -1344,6 +1348,557 @@ _poly1305_emit_sse2:
popl %ebp
ret
.size _poly1305_emit_sse2,.-_poly1305_emit_sse2
+.align 32
+.type _poly1305_init_avx2,@function
+.align 16
+_poly1305_init_avx2: /* Internal helper: expects %edi = context and %ebx = constant table; reads 16 bytes at 24(%edi) and writes a 144-byte table at 48(%edi). Clobbers %ebp, %ecx, %edx, %xmm0-%xmm7. */
+ vmovdqu 24(%edi),%xmm4 /* 128-bit input value (presumably the key r - confirm against poly1305_init) */
+ leal 48(%edi),%edi /* %edi now points at the output table; undone before ret */
+ movl %esp,%ebp /* keep the caller stack pointer in %ebp (not saved: caller must not rely on %ebp) */
+ subl $224,%esp
+ andl $-16,%esp /* 16-byte aligned scratch area for the vmovdqa spills below */
+ vmovdqa 64(%ebx),%xmm7 /* limb mask from the constant table (presumably 2^26-1 per lane - the table entry is not visible here) */
+ vpand %xmm7,%xmm4,%xmm0 /* limb 0: bits 0.. */
+ vpsrlq $26,%xmm4,%xmm1
+ vpsrldq $6,%xmm4,%xmm3 /* byte shift by 6 = 48 bits, so the following 4/30-bit shifts reach bits 52 and 78 */
+ vpand %xmm7,%xmm1,%xmm1 /* limb 1: bits 26.. */
+ vpsrlq $4,%xmm3,%xmm2
+ vpsrlq $30,%xmm3,%xmm3
+ vpand %xmm7,%xmm2,%xmm2 /* limb 2: bits 52.. */
+ vpand %xmm7,%xmm3,%xmm3 /* limb 3: bits 78.. */
+ vpsrldq $13,%xmm4,%xmm4 /* limb 4: bits 104.. (byte shift by 13) */
+ leal 144(%esp),%edx /* %edx = second scratch area holding the broadcast multiplier limbs */
+ movl $2,%ecx /* two multiplication passes */
+.L018square:
+ vmovdqa %xmm0,(%esp) /* save the five current limbs at 0..64(%esp) */
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ vmovdqa %xmm4,64(%esp)
+ vpslld $2,%xmm1,%xmm6 /* x*4 + x = 5*x for limbs 1..4, saved at 80..128(%esp) */
+ vpslld $2,%xmm2,%xmm5
+ vpaddd %xmm1,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm5,%xmm5
+ vmovdqa %xmm6,80(%esp)
+ vmovdqa %xmm5,96(%esp)
+ vpslld $2,%xmm3,%xmm6
+ vpslld $2,%xmm4,%xmm5
+ vpaddd %xmm3,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm5,%xmm5
+ vmovdqa %xmm6,112(%esp)
+ vmovdqa %xmm5,128(%esp)
+ vpshufd $68,%xmm0,%xmm5 /* $68 copies the low 64-bit lane into both lanes: the low-lane value is the multiplier */
+ vmovdqa %xmm1,%xmm6
+ vpshufd $68,%xmm1,%xmm1
+ vpshufd $68,%xmm2,%xmm2
+ vpshufd $68,%xmm3,%xmm3
+ vpshufd $68,%xmm4,%xmm4
+ vmovdqa %xmm5,(%edx) /* broadcast multiplier limbs 0..4 at 0..64(%edx) */
+ vmovdqa %xmm1,16(%edx)
+ vmovdqa %xmm2,32(%edx)
+ vmovdqa %xmm3,48(%edx)
+ vmovdqa %xmm4,64(%edx)
+ vpmuludq %xmm0,%xmm4,%xmm4 /* limb-by-limb product accumulated into %xmm0..%xmm4; wrap-around terms use the 5*limb copies */
+ vpmuludq %xmm0,%xmm3,%xmm3
+ vpmuludq %xmm0,%xmm2,%xmm2
+ vpmuludq %xmm0,%xmm1,%xmm1
+ vpmuludq %xmm0,%xmm5,%xmm0
+ vpmuludq 48(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm4,%xmm4
+ vpmuludq 32(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm3,%xmm3
+ vpmuludq 16(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vmovdqa 80(%esp),%xmm7
+ vpmuludq (%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm1,%xmm1
+ vmovdqa 32(%esp),%xmm5
+ vpmuludq 64(%edx),%xmm7,%xmm7
+ vpaddq %xmm7,%xmm0,%xmm0
+ vpmuludq 32(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm4,%xmm4
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm3,%xmm3
+ vmovdqa 96(%esp),%xmm6
+ vpmuludq (%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vpmuludq 64(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm1,%xmm1
+ vmovdqa 48(%esp),%xmm5
+ vpmuludq 48(%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm0,%xmm0
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm4,%xmm4
+ vmovdqa 112(%esp),%xmm6
+ vpmuludq (%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm3,%xmm3
+ vpmuludq 64(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm2,%xmm2
+ vpmuludq 48(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm1,%xmm1
+ vmovdqa 64(%esp),%xmm7
+ vpmuludq 32(%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm0,%xmm0
+ vmovdqa 128(%esp),%xmm5
+ vpmuludq (%edx),%xmm7,%xmm7
+ vpaddq %xmm7,%xmm4,%xmm4
+ vpmuludq 64(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm3,%xmm3
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm0,%xmm0
+ vpmuludq 32(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm1,%xmm1
+ vmovdqa 64(%ebx),%xmm7 /* reload the limb mask (%xmm7 was used as a temporary above) */
+ vpmuludq 48(%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vpsrlq $26,%xmm3,%xmm5 /* carry propagation: each limb keeps its masked low bits and passes bits 26 and up to the next limb */
+ vpand %xmm7,%xmm3,%xmm3
+ vpsrlq $26,%xmm0,%xmm6
+ vpand %xmm7,%xmm0,%xmm0
+ vpaddq %xmm5,%xmm4,%xmm4
+ vpaddq %xmm6,%xmm1,%xmm1
+ vpsrlq $26,%xmm4,%xmm5
+ vpand %xmm7,%xmm4,%xmm4
+ vpsrlq $26,%xmm1,%xmm6
+ vpand %xmm7,%xmm1,%xmm1
+ vpaddq %xmm6,%xmm2,%xmm2
+ vpaddd %xmm5,%xmm0,%xmm0 /* carry out of limb 4 re-enters limb 0 multiplied by 5: added once here ... */
+ vpsllq $2,%xmm5,%xmm5
+ vpsrlq $26,%xmm2,%xmm6
+ vpand %xmm7,%xmm2,%xmm2
+ vpaddd %xmm5,%xmm0,%xmm0 /* ... and once more shifted left by 2 */
+ vpaddd %xmm6,%xmm3,%xmm3
+ vpsrlq $26,%xmm3,%xmm6
+ vpsrlq $26,%xmm0,%xmm5
+ vpand %xmm7,%xmm0,%xmm0
+ vpand %xmm7,%xmm3,%xmm3
+ vpaddd %xmm5,%xmm1,%xmm1
+ vpaddd %xmm6,%xmm4,%xmm4
+ decl %ecx
+ jz .L019square_break
+ vpunpcklqdq (%esp),%xmm0,%xmm0 /* second pass input: low lane = pass-1 result, high lane = low lane of the limbs saved at pass-1 entry */
+ vpunpcklqdq 16(%esp),%xmm1,%xmm1
+ vpunpcklqdq 32(%esp),%xmm2,%xmm2
+ vpunpcklqdq 48(%esp),%xmm3,%xmm3
+ vpunpcklqdq 64(%esp),%xmm4,%xmm4
+ jmp .L018square
+.L019square_break:
+ vpsllq $32,%xmm0,%xmm0 /* move the pass-2 results into the high dword of each lane ... */
+ vpsllq $32,%xmm1,%xmm1
+ vpsllq $32,%xmm2,%xmm2
+ vpsllq $32,%xmm3,%xmm3
+ vpsllq $32,%xmm4,%xmm4
+ vpor (%esp),%xmm0,%xmm0 /* ... and merge with the pass-2 inputs saved at 0..64(%esp), giving four 32-bit values per limb */
+ vpor 16(%esp),%xmm1,%xmm1
+ vpor 32(%esp),%xmm2,%xmm2
+ vpor 48(%esp),%xmm3,%xmm3
+ vpor 64(%esp),%xmm4,%xmm4
+ vpshufd $141,%xmm0,%xmm0 /* $141 reorders dwords to source positions 1,3,0,2 (presumably highest power first - confirm) */
+ vpshufd $141,%xmm1,%xmm1
+ vpshufd $141,%xmm2,%xmm2
+ vpshufd $141,%xmm3,%xmm3
+ vpshufd $141,%xmm4,%xmm4
+ vmovdqu %xmm0,(%edi) /* table rows 0..4: limbs 0..4 */
+ vmovdqu %xmm1,16(%edi)
+ vmovdqu %xmm2,32(%edi)
+ vmovdqu %xmm3,48(%edi)
+ vmovdqu %xmm4,64(%edi)
+ vpslld $2,%xmm1,%xmm6 /* table rows 5..8: 5*limb for limbs 1..4 */
+ vpslld $2,%xmm2,%xmm5
+ vpaddd %xmm1,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm5,%xmm5
+ vmovdqu %xmm6,80(%edi)
+ vmovdqu %xmm5,96(%edi)
+ vpslld $2,%xmm3,%xmm6
+ vpslld $2,%xmm4,%xmm5
+ vpaddd %xmm3,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm5,%xmm5
+ vmovdqu %xmm6,112(%edi)
+ vmovdqu %xmm5,128(%edi)
+ movl %ebp,%esp /* release the scratch area */
+ leal -48(%edi),%edi /* restore %edi to the context base */
+ ret
+.size _poly1305_init_avx2,.-_poly1305_init_avx2
+.align 32
+.type _poly1305_blocks_avx2,@function
+.align 16
+_poly1305_blocks_avx2: /* AVX2 block routine. Stack arguments after the four pushes: 20(%esp) context, 24(%esp) input, 28(%esp) length in bytes, 32(%esp) a fourth value used to select the per-block constant (presumably the pad bit - confirm). */
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi /* %edi = context pointer */
+ movl 24(%esp),%esi /* %esi = input pointer */
+ movl 28(%esp),%ecx /* %ecx = length in bytes */
+ movl 20(%edi),%eax /* flag at context+20: nonzero once the accumulator is stored as 26-bit limbs (set to 1 below) */
+ andl $-16,%ecx /* round the length down to whole 16-byte blocks */
+ jz .L020nodata
+ cmpl $64,%ecx
+ jae .L021enter_avx2 /* 64 bytes or more: always take the AVX2 path */
+ testl %eax,%eax
+ jz .Lenter_blocks /* short input and flag clear: branch to .Lenter_blocks, which is defined outside this function */
+.L021enter_avx2:
+ vzeroupper
+ call .L022pic_point /* call/pop pair obtains the current address for position-independent addressing */
+.L022pic_point:
+ popl %ebx
+ leal .Lconst_sse2-.L022pic_point(%ebx),%ebx /* %ebx = address of .Lconst_sse2 */
+ testl %eax,%eax
+ jnz .L023base2_26 /* flag already set: skip table setup and conversion */
+ call _poly1305_init_avx2 /* fills the table at context+48; clobbers %ebp, %ecx, %edx */
+ movl (%edi),%eax /* convert the accumulator at context+0 in place: overlapping dword loads at byte offsets 0,3,6,9,13 ... */
+ movl 3(%edi),%ecx
+ movl 6(%edi),%edx
+ movl 9(%edi),%esi
+ movl 13(%edi),%ebp
+ shrl $2,%ecx /* ... shifted so the fields start at bits 26, 52 and 78 ... */
+ andl $67108863,%eax /* ... and masked with 2^26-1 */
+ shrl $4,%edx
+ andl $67108863,%ecx
+ shrl $6,%esi
+ andl $67108863,%edx
+ movl %eax,(%edi) /* five limbs stored at context+0..16 */
+ movl %ecx,4(%edi)
+ movl %edx,8(%edi)
+ movl %esi,12(%edi)
+ movl %ebp,16(%edi)
+ movl $1,20(%edi) /* mark the context as converted */
+ movl 24(%esp),%esi /* reload input pointer (used as a temporary above) */
+ movl 28(%esp),%ecx /* reload length; NOTE(review): the andl $-16 rounding from entry is not repeated here - callers presumably pass multiples of 16, confirm */
+.L023base2_26:
+ movl 32(%esp),%eax /* fourth argument, read before %esp is moved */
+ movl %esp,%ebp /* %ebp keeps the original stack pointer until .L029done */
+ subl $448,%esp
+ andl $-512,%esp /* scratch frame aligned to 512 bytes */
+ vmovdqu 48(%edi),%xmm0 /* table rows 0..4 written by _poly1305_init_avx2 */
+ leal 288(%esp),%edx /* %edx = middle of the expanded table, so rows sit at -128..+128(%edx) */
+ vmovdqu 64(%edi),%xmm1
+ vmovdqu 80(%edi),%xmm2
+ vmovdqu 96(%edi),%xmm3
+ vmovdqu 112(%edi),%xmm4
+ leal 48(%edi),%edi /* %edi = table base; accumulator limbs are now at -48..-32(%edi) */
+ vpermq $64,%ymm0,%ymm0 /* $64 then $200: every 64-bit lane gets entry 0 in its low dword; high dwords get entries 0,1,2,3 */
+ vpermq $64,%ymm1,%ymm1
+ vpermq $64,%ymm2,%ymm2
+ vpermq $64,%ymm3,%ymm3
+ vpermq $64,%ymm4,%ymm4
+ vpshufd $200,%ymm0,%ymm0
+ vpshufd $200,%ymm1,%ymm1
+ vpshufd $200,%ymm2,%ymm2
+ vpshufd $200,%ymm3,%ymm3
+ vpshufd $200,%ymm4,%ymm4
+ vmovdqa %ymm0,-128(%edx)
+ vmovdqu 80(%edi),%xmm0 /* table rows 5..8 (the 5*limb values) */
+ vmovdqa %ymm1,-96(%edx)
+ vmovdqu 96(%edi),%xmm1
+ vmovdqa %ymm2,-64(%edx)
+ vmovdqu 112(%edi),%xmm2
+ vmovdqa %ymm3,-32(%edx)
+ vmovdqu 128(%edi),%xmm3
+ vmovdqa %ymm4,(%edx)
+ vpermq $64,%ymm0,%ymm0
+ vpermq $64,%ymm1,%ymm1
+ vpermq $64,%ymm2,%ymm2
+ vpermq $64,%ymm3,%ymm3
+ vpshufd $200,%ymm0,%ymm0
+ vpshufd $200,%ymm1,%ymm1
+ vpshufd $200,%ymm2,%ymm2
+ vpshufd $200,%ymm3,%ymm3
+ vmovdqa %ymm0,32(%edx)
+ vmovd -48(%edi),%xmm0 /* accumulator limbs 0..4 into lane 0 of %xmm0..%xmm4 */
+ vmovdqa %ymm1,64(%edx)
+ vmovd -44(%edi),%xmm1
+ vmovdqa %ymm2,96(%edx)
+ vmovd -40(%edi),%xmm2
+ vmovdqa %ymm3,128(%edx)
+ vmovd -36(%edi),%xmm3
+ vmovd -32(%edi),%xmm4
+ vmovdqa 64(%ebx),%ymm7 /* limb mask from the constant table */
+ negl %eax /* negated fourth argument, used as an index in .L025one */
+ testl $63,%ecx
+ jz .L024even /* length is a multiple of 64: go straight to four-block groups */
+ movl %ecx,%edx
+ andl $-64,%ecx /* %ecx = bytes left after the leading partial group */
+ andl $63,%edx /* %edx = bytes in the leading group of one, two or three blocks */
+ vmovdqu (%esi),%xmm5
+ cmpl $32,%edx
+ jb .L025one
+ vmovdqu 16(%esi),%xmm6
+ je .L026two
+ vinserti128 $1,32(%esi),%ymm5,%ymm5 /* three blocks: third block goes into the upper half of %ymm5 */
+ leal 48(%esi),%esi
+ leal 8(%ebx),%ebx /* offset the constant pointer and the table pointer by one 64-bit lane */
+ leal 296(%esp),%edx
+ jmp .L027tail
+.L026two:
+ leal 32(%esi),%esi
+ leal 16(%ebx),%ebx /* offset by two lanes */
+ leal 304(%esp),%edx
+ jmp .L027tail
+.L025one:
+ leal 16(%esi),%esi
+ vpxor %ymm6,%ymm6,%ymm6 /* no second block */
+ leal 32(%ebx,%eax,8),%ebx /* constant pointer = table+32 minus 8 times the fourth argument */
+ leal 312(%esp),%edx /* table pointer offset by three lanes */
+ jmp .L027tail
+.align 32
+.L024even:
+ vmovdqu (%esi),%xmm5 /* load four 16-byte blocks: blocks 0,2 in %ymm5 and blocks 1,3 in %ymm6 */
+ vmovdqu 16(%esi),%xmm6
+ vinserti128 $1,32(%esi),%ymm5,%ymm5
+ vinserti128 $1,48(%esi),%ymm6,%ymm6
+ leal 64(%esi),%esi
+ subl $64,%ecx
+ jz .L027tail /* last group is handled by the tail */
+.L028loop:
+ vmovdqa %ymm2,64(%esp) /* spill accumulator limbs 0..2 while the input is split into limbs */
+ vpsrldq $6,%ymm5,%ymm2
+ vmovdqa %ymm0,(%esp)
+ vpsrldq $6,%ymm6,%ymm0
+ vmovdqa %ymm1,32(%esp)
+ vpunpckhqdq %ymm6,%ymm5,%ymm1 /* high 64 bits of each block */
+ vpunpcklqdq %ymm6,%ymm5,%ymm5 /* low 64 bits of each block */
+ vpunpcklqdq %ymm0,%ymm2,%ymm2 /* bits 48..111 of each block */
+ vpsrlq $30,%ymm2,%ymm0
+ vpsrlq $4,%ymm2,%ymm2
+ vpsrlq $26,%ymm5,%ymm6
+ vpsrlq $40,%ymm1,%ymm1
+ vpand %ymm7,%ymm2,%ymm2
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpor (%ebx),%ymm1,%ymm1 /* OR the per-lane constant (2^24 at the table start) into limb 4 */
+ vpaddq 64(%esp),%ymm2,%ymm2 /* add the accumulator: %ymm5,%ymm6,%ymm2,%ymm0,%ymm1 = limbs 0..4 */
+ vpaddq (%esp),%ymm5,%ymm5
+ vpaddq 32(%esp),%ymm6,%ymm6
+ vpaddq %ymm3,%ymm0,%ymm0
+ vpaddq %ymm4,%ymm1,%ymm1
+ vpmuludq -96(%edx),%ymm2,%ymm3 /* multiply by the table at aligned offsets: low dwords, the same entry in all four lanes */
+ vmovdqa %ymm6,32(%esp)
+ vpmuludq -64(%edx),%ymm2,%ymm4
+ vmovdqa %ymm0,96(%esp)
+ vpmuludq 96(%edx),%ymm2,%ymm0
+ vmovdqa %ymm1,128(%esp)
+ vpmuludq 128(%edx),%ymm2,%ymm1
+ vpmuludq -128(%edx),%ymm2,%ymm2
+ vpmuludq -32(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq (%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpmuludq -128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm0,%ymm0
+ vmovdqa 32(%esp),%ymm7
+ vpmuludq -96(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq -64(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpmuludq -64(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpmuludq -32(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpmuludq 128(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vmovdqa 96(%esp),%ymm6
+ vpmuludq -128(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpmuludq -96(%edx),%ymm7,%ymm7
+ vpaddq %ymm7,%ymm2,%ymm2
+ vpmuludq -128(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpmuludq -96(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vpmuludq 64(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm0,%ymm0
+ vmovdqa 128(%esp),%ymm5
+ vpmuludq 96(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm1,%ymm1
+ vpmuludq 128(%edx),%ymm6,%ymm6
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpmuludq 128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 32(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpmuludq -128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vmovdqa 64(%ebx),%ymm7 /* reload the limb mask (%ymm7 was used as a temporary) */
+ vpmuludq 64(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq 96(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpsrlq $26,%ymm3,%ymm5 /* carry propagation between limbs, 26 bits each */
+ vpand %ymm7,%ymm3,%ymm3
+ vpsrlq $26,%ymm0,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpsrlq $26,%ymm4,%ymm5
+ vpand %ymm7,%ymm4,%ymm4
+ vpsrlq $26,%ymm1,%ymm6
+ vpand %ymm7,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0 /* carry out of limb 4 re-enters limb 0 times 5: once here ... */
+ vpsllq $2,%ymm5,%ymm5
+ vpsrlq $26,%ymm2,%ymm6
+ vpand %ymm7,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0 /* ... and once more shifted left by 2 */
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrlq $26,%ymm3,%ymm6
+ vpsrlq $26,%ymm0,%ymm5
+ vpand %ymm7,%ymm0,%ymm0
+ vpand %ymm7,%ymm3,%ymm3
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm4,%ymm4
+ vmovdqu (%esi),%xmm5 /* fetch the next four blocks */
+ vmovdqu 16(%esi),%xmm6
+ vinserti128 $1,32(%esi),%ymm5,%ymm5
+ vinserti128 $1,48(%esi),%ymm6,%ymm6
+ leal 64(%esi),%esi
+ subl $64,%ecx
+ jnz .L028loop
+.L027tail: /* same computation as the loop body, but with per-lane table entries and a final sum across lanes */
+ vmovdqa %ymm2,64(%esp)
+ vpsrldq $6,%ymm5,%ymm2
+ vmovdqa %ymm0,(%esp)
+ vpsrldq $6,%ymm6,%ymm0
+ vmovdqa %ymm1,32(%esp)
+ vpunpckhqdq %ymm6,%ymm5,%ymm1
+ vpunpcklqdq %ymm6,%ymm5,%ymm5
+ vpunpcklqdq %ymm0,%ymm2,%ymm2
+ vpsrlq $30,%ymm2,%ymm0
+ vpsrlq $4,%ymm2,%ymm2
+ vpsrlq $26,%ymm5,%ymm6
+ vpsrlq $40,%ymm1,%ymm1
+ vpand %ymm7,%ymm2,%ymm2
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpor (%ebx),%ymm1,%ymm1
+ andl $-64,%ebx /* round %ebx back down to the 64-byte aligned .Lconst_sse2 base, undoing the partial-group offset */
+ vpaddq 64(%esp),%ymm2,%ymm2
+ vpaddq (%esp),%ymm5,%ymm5
+ vpaddq 32(%esp),%ymm6,%ymm6
+ vpaddq %ymm3,%ymm0,%ymm0
+ vpaddq %ymm4,%ymm1,%ymm1
+ vpmuludq -92(%edx),%ymm2,%ymm3 /* table read 4 bytes higher than in the loop: each lane multiplies by the high-dword entry of its own slot */
+ vmovdqa %ymm6,32(%esp)
+ vpmuludq -60(%edx),%ymm2,%ymm4
+ vmovdqa %ymm0,96(%esp)
+ vpmuludq 100(%edx),%ymm2,%ymm0
+ vmovdqa %ymm1,128(%esp)
+ vpmuludq 132(%edx),%ymm2,%ymm1
+ vpmuludq -124(%edx),%ymm2,%ymm2
+ vpmuludq -28(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 4(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpmuludq -124(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm0,%ymm0
+ vmovdqa 32(%esp),%ymm7
+ vpmuludq -92(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq -60(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpmuludq -60(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpmuludq -28(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpmuludq 132(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vmovdqa 96(%esp),%ymm6
+ vpmuludq -124(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpmuludq -92(%edx),%ymm7,%ymm7
+ vpaddq %ymm7,%ymm2,%ymm2
+ vpmuludq -124(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpmuludq -92(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vpmuludq 68(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm0,%ymm0
+ vmovdqa 128(%esp),%ymm5
+ vpmuludq 100(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm1,%ymm1
+ vpmuludq 132(%edx),%ymm6,%ymm6
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpmuludq 132(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 36(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpmuludq -124(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vmovdqa 64(%ebx),%ymm7 /* limb mask, read through the re-aligned %ebx */
+ vpmuludq 68(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq 100(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpsrldq $8,%ymm4,%ymm5 /* horizontal sum, step 1: add the odd 64-bit lane of each 128-bit half onto the even one */
+ vpsrldq $8,%ymm3,%ymm6
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpsrldq $8,%ymm0,%ymm5
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrldq $8,%ymm1,%ymm6
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpsrldq $8,%ymm2,%ymm5
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpermq $2,%ymm4,%ymm6 /* step 2: bring 64-bit lane 2 down to lane 0 and add, so lane 0 holds the sum of all four lanes */
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpermq $2,%ymm3,%ymm5
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpermq $2,%ymm0,%ymm6
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpermq $2,%ymm1,%ymm5
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpermq $2,%ymm2,%ymm6
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpsrlq $26,%ymm3,%ymm5 /* carry propagation, same sequence as in the loop */
+ vpand %ymm7,%ymm3,%ymm3
+ vpsrlq $26,%ymm0,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpsrlq $26,%ymm4,%ymm5
+ vpand %ymm7,%ymm4,%ymm4
+ vpsrlq $26,%ymm1,%ymm6
+ vpand %ymm7,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpsllq $2,%ymm5,%ymm5
+ vpsrlq $26,%ymm2,%ymm6
+ vpand %ymm7,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrlq $26,%ymm3,%ymm6
+ vpsrlq $26,%ymm0,%ymm5
+ vpand %ymm7,%ymm0,%ymm0
+ vpand %ymm7,%ymm3,%ymm3
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm4,%ymm4
+ cmpl $0,%ecx
+ je .L029done /* nothing left after this group */
+ vpshufd $252,%xmm0,%xmm0 /* keep dword 0 of each limb; dwords 1..3 take dword 3 (presumably zero after masking - confirm) */
+ leal 288(%esp),%edx /* reset the table pointer to its unshifted position */
+ vpshufd $252,%xmm1,%xmm1
+ vpshufd $252,%xmm2,%xmm2
+ vpshufd $252,%xmm3,%xmm3
+ vpshufd $252,%xmm4,%xmm4
+ jmp .L024even /* continue with the remaining full 64-byte groups */
+.align 16
+.L029done:
+ vmovd %xmm0,-48(%edi) /* write accumulator limbs 0..4 back to context+0..16 */
+ vmovd %xmm1,-44(%edi)
+ vmovd %xmm2,-40(%edi)
+ vmovd %xmm3,-36(%edi)
+ vmovd %xmm4,-32(%edi)
+ vzeroupper
+ movl %ebp,%esp /* drop the aligned scratch frame */
+.L020nodata:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2
.align 64
.Lconst_sse2:
.long 16777216,0,16777216,0,16777216,0,16777216,0
@@ -1392,6 +1947,10 @@ poly1305_init:
jne .L002no_sse2
leal _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
leal _poly1305_emit_sse2-.L001pic_point(%ebx),%edx
+ movl 8(%edi),%ecx
+ testl $32,%ecx
+ jz .L002no_sse2
+ leal _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
.L002no_sse2:
movl 20(%esp),%edi
movl %eax,(%ebp)
@@ -2700,6 +3259,557 @@ _poly1305_emit_sse2:
popl %ebp
ret
.size _poly1305_emit_sse2,.-_poly1305_emit_sse2
+.align 32
+.type _poly1305_init_avx2,@function
+.align 16
+_poly1305_init_avx2: /* Internal helper: expects %edi = context and %ebx = constant table; reads 16 bytes at 24(%edi) and writes a 144-byte table at 48(%edi). Clobbers %ebp, %ecx, %edx, %xmm0-%xmm7. */
+ vmovdqu 24(%edi),%xmm4 /* 128-bit input value (presumably the key r - confirm against poly1305_init) */
+ leal 48(%edi),%edi /* %edi now points at the output table; undone before ret */
+ movl %esp,%ebp /* keep the caller stack pointer in %ebp (not saved: caller must not rely on %ebp) */
+ subl $224,%esp
+ andl $-16,%esp /* 16-byte aligned scratch area for the vmovdqa spills below */
+ vmovdqa 64(%ebx),%xmm7 /* limb mask from the constant table (presumably 2^26-1 per lane - the table entry is not visible here) */
+ vpand %xmm7,%xmm4,%xmm0 /* limb 0: bits 0.. */
+ vpsrlq $26,%xmm4,%xmm1
+ vpsrldq $6,%xmm4,%xmm3 /* byte shift by 6 = 48 bits, so the following 4/30-bit shifts reach bits 52 and 78 */
+ vpand %xmm7,%xmm1,%xmm1 /* limb 1: bits 26.. */
+ vpsrlq $4,%xmm3,%xmm2
+ vpsrlq $30,%xmm3,%xmm3
+ vpand %xmm7,%xmm2,%xmm2 /* limb 2: bits 52.. */
+ vpand %xmm7,%xmm3,%xmm3 /* limb 3: bits 78.. */
+ vpsrldq $13,%xmm4,%xmm4 /* limb 4: bits 104.. (byte shift by 13) */
+ leal 144(%esp),%edx /* %edx = second scratch area holding the broadcast multiplier limbs */
+ movl $2,%ecx /* two multiplication passes */
+.L018square:
+ vmovdqa %xmm0,(%esp) /* save the five current limbs at 0..64(%esp) */
+ vmovdqa %xmm1,16(%esp)
+ vmovdqa %xmm2,32(%esp)
+ vmovdqa %xmm3,48(%esp)
+ vmovdqa %xmm4,64(%esp)
+ vpslld $2,%xmm1,%xmm6 /* x*4 + x = 5*x for limbs 1..4, saved at 80..128(%esp) */
+ vpslld $2,%xmm2,%xmm5
+ vpaddd %xmm1,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm5,%xmm5
+ vmovdqa %xmm6,80(%esp)
+ vmovdqa %xmm5,96(%esp)
+ vpslld $2,%xmm3,%xmm6
+ vpslld $2,%xmm4,%xmm5
+ vpaddd %xmm3,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm5,%xmm5
+ vmovdqa %xmm6,112(%esp)
+ vmovdqa %xmm5,128(%esp)
+ vpshufd $68,%xmm0,%xmm5 /* $68 copies the low 64-bit lane into both lanes: the low-lane value is the multiplier */
+ vmovdqa %xmm1,%xmm6
+ vpshufd $68,%xmm1,%xmm1
+ vpshufd $68,%xmm2,%xmm2
+ vpshufd $68,%xmm3,%xmm3
+ vpshufd $68,%xmm4,%xmm4
+ vmovdqa %xmm5,(%edx) /* broadcast multiplier limbs 0..4 at 0..64(%edx) */
+ vmovdqa %xmm1,16(%edx)
+ vmovdqa %xmm2,32(%edx)
+ vmovdqa %xmm3,48(%edx)
+ vmovdqa %xmm4,64(%edx)
+ vpmuludq %xmm0,%xmm4,%xmm4 /* limb-by-limb product accumulated into %xmm0..%xmm4; wrap-around terms use the 5*limb copies */
+ vpmuludq %xmm0,%xmm3,%xmm3
+ vpmuludq %xmm0,%xmm2,%xmm2
+ vpmuludq %xmm0,%xmm1,%xmm1
+ vpmuludq %xmm0,%xmm5,%xmm0
+ vpmuludq 48(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm4,%xmm4
+ vpmuludq 32(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm3,%xmm3
+ vpmuludq 16(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vmovdqa 80(%esp),%xmm7
+ vpmuludq (%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm1,%xmm1
+ vmovdqa 32(%esp),%xmm5
+ vpmuludq 64(%edx),%xmm7,%xmm7
+ vpaddq %xmm7,%xmm0,%xmm0
+ vpmuludq 32(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm4,%xmm4
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm3,%xmm3
+ vmovdqa 96(%esp),%xmm6
+ vpmuludq (%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vpmuludq 64(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm1,%xmm1
+ vmovdqa 48(%esp),%xmm5
+ vpmuludq 48(%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm0,%xmm0
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm4,%xmm4
+ vmovdqa 112(%esp),%xmm6
+ vpmuludq (%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm3,%xmm3
+ vpmuludq 64(%edx),%xmm6,%xmm7
+ vpaddq %xmm7,%xmm2,%xmm2
+ vpmuludq 48(%edx),%xmm6,%xmm5
+ vpaddq %xmm5,%xmm1,%xmm1
+ vmovdqa 64(%esp),%xmm7
+ vpmuludq 32(%edx),%xmm6,%xmm6
+ vpaddq %xmm6,%xmm0,%xmm0
+ vmovdqa 128(%esp),%xmm5
+ vpmuludq (%edx),%xmm7,%xmm7
+ vpaddq %xmm7,%xmm4,%xmm4
+ vpmuludq 64(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm3,%xmm3
+ vpmuludq 16(%edx),%xmm5,%xmm7
+ vpaddq %xmm7,%xmm0,%xmm0
+ vpmuludq 32(%edx),%xmm5,%xmm6
+ vpaddq %xmm6,%xmm1,%xmm1
+ vmovdqa 64(%ebx),%xmm7 /* reload the limb mask (%xmm7 was used as a temporary above) */
+ vpmuludq 48(%edx),%xmm5,%xmm5
+ vpaddq %xmm5,%xmm2,%xmm2
+ vpsrlq $26,%xmm3,%xmm5 /* carry propagation: each limb keeps its masked low bits and passes bits 26 and up to the next limb */
+ vpand %xmm7,%xmm3,%xmm3
+ vpsrlq $26,%xmm0,%xmm6
+ vpand %xmm7,%xmm0,%xmm0
+ vpaddq %xmm5,%xmm4,%xmm4
+ vpaddq %xmm6,%xmm1,%xmm1
+ vpsrlq $26,%xmm4,%xmm5
+ vpand %xmm7,%xmm4,%xmm4
+ vpsrlq $26,%xmm1,%xmm6
+ vpand %xmm7,%xmm1,%xmm1
+ vpaddq %xmm6,%xmm2,%xmm2
+ vpaddd %xmm5,%xmm0,%xmm0 /* carry out of limb 4 re-enters limb 0 multiplied by 5: added once here ... */
+ vpsllq $2,%xmm5,%xmm5
+ vpsrlq $26,%xmm2,%xmm6
+ vpand %xmm7,%xmm2,%xmm2
+ vpaddd %xmm5,%xmm0,%xmm0 /* ... and once more shifted left by 2 */
+ vpaddd %xmm6,%xmm3,%xmm3
+ vpsrlq $26,%xmm3,%xmm6
+ vpsrlq $26,%xmm0,%xmm5
+ vpand %xmm7,%xmm0,%xmm0
+ vpand %xmm7,%xmm3,%xmm3
+ vpaddd %xmm5,%xmm1,%xmm1
+ vpaddd %xmm6,%xmm4,%xmm4
+ decl %ecx
+ jz .L019square_break
+ vpunpcklqdq (%esp),%xmm0,%xmm0 /* second pass input: low lane = pass-1 result, high lane = low lane of the limbs saved at pass-1 entry */
+ vpunpcklqdq 16(%esp),%xmm1,%xmm1
+ vpunpcklqdq 32(%esp),%xmm2,%xmm2
+ vpunpcklqdq 48(%esp),%xmm3,%xmm3
+ vpunpcklqdq 64(%esp),%xmm4,%xmm4
+ jmp .L018square
+.L019square_break:
+ vpsllq $32,%xmm0,%xmm0 /* move the pass-2 results into the high dword of each lane ... */
+ vpsllq $32,%xmm1,%xmm1
+ vpsllq $32,%xmm2,%xmm2
+ vpsllq $32,%xmm3,%xmm3
+ vpsllq $32,%xmm4,%xmm4
+ vpor (%esp),%xmm0,%xmm0 /* ... and merge with the pass-2 inputs saved at 0..64(%esp), giving four 32-bit values per limb */
+ vpor 16(%esp),%xmm1,%xmm1
+ vpor 32(%esp),%xmm2,%xmm2
+ vpor 48(%esp),%xmm3,%xmm3
+ vpor 64(%esp),%xmm4,%xmm4
+ vpshufd $141,%xmm0,%xmm0 /* $141 reorders dwords to source positions 1,3,0,2 (presumably highest power first - confirm) */
+ vpshufd $141,%xmm1,%xmm1
+ vpshufd $141,%xmm2,%xmm2
+ vpshufd $141,%xmm3,%xmm3
+ vpshufd $141,%xmm4,%xmm4
+ vmovdqu %xmm0,(%edi) /* table rows 0..4: limbs 0..4 */
+ vmovdqu %xmm1,16(%edi)
+ vmovdqu %xmm2,32(%edi)
+ vmovdqu %xmm3,48(%edi)
+ vmovdqu %xmm4,64(%edi)
+ vpslld $2,%xmm1,%xmm6 /* table rows 5..8: 5*limb for limbs 1..4 */
+ vpslld $2,%xmm2,%xmm5
+ vpaddd %xmm1,%xmm6,%xmm6
+ vpaddd %xmm2,%xmm5,%xmm5
+ vmovdqu %xmm6,80(%edi)
+ vmovdqu %xmm5,96(%edi)
+ vpslld $2,%xmm3,%xmm6
+ vpslld $2,%xmm4,%xmm5
+ vpaddd %xmm3,%xmm6,%xmm6
+ vpaddd %xmm4,%xmm5,%xmm5
+ vmovdqu %xmm6,112(%edi)
+ vmovdqu %xmm5,128(%edi)
+ movl %ebp,%esp /* release the scratch area */
+ leal -48(%edi),%edi /* restore %edi to the context base */
+ ret
+.size _poly1305_init_avx2,.-_poly1305_init_avx2
+.align 32
+.type _poly1305_blocks_avx2,@function
+.align 16
+_poly1305_blocks_avx2: /* AVX2 block routine. Stack arguments after the four pushes: 20(%esp) context, 24(%esp) input, 28(%esp) length in bytes, 32(%esp) a fourth value used to select the per-block constant (presumably the pad bit - confirm). */
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi /* %edi = context pointer */
+ movl 24(%esp),%esi /* %esi = input pointer */
+ movl 28(%esp),%ecx /* %ecx = length in bytes */
+ movl 20(%edi),%eax /* flag at context+20: nonzero once the accumulator is stored as 26-bit limbs (set to 1 below) */
+ andl $-16,%ecx /* round the length down to whole 16-byte blocks */
+ jz .L020nodata
+ cmpl $64,%ecx
+ jae .L021enter_avx2 /* 64 bytes or more: always take the AVX2 path */
+ testl %eax,%eax
+ jz .Lenter_blocks /* short input and flag clear: branch to .Lenter_blocks, which is defined outside this function */
+.L021enter_avx2:
+ vzeroupper
+ call .L022pic_point /* call/pop pair obtains the current address for position-independent addressing */
+.L022pic_point:
+ popl %ebx
+ leal .Lconst_sse2-.L022pic_point(%ebx),%ebx /* %ebx = address of .Lconst_sse2 */
+ testl %eax,%eax
+ jnz .L023base2_26 /* flag already set: skip table setup and conversion */
+ call _poly1305_init_avx2 /* fills the table at context+48; clobbers %ebp, %ecx, %edx */
+ movl (%edi),%eax /* convert the accumulator at context+0 in place: overlapping dword loads at byte offsets 0,3,6,9,13 ... */
+ movl 3(%edi),%ecx
+ movl 6(%edi),%edx
+ movl 9(%edi),%esi
+ movl 13(%edi),%ebp
+ shrl $2,%ecx /* ... shifted so the fields start at bits 26, 52 and 78 ... */
+ andl $67108863,%eax /* ... and masked with 2^26-1 */
+ shrl $4,%edx
+ andl $67108863,%ecx
+ shrl $6,%esi
+ andl $67108863,%edx
+ movl %eax,(%edi) /* five limbs stored at context+0..16 */
+ movl %ecx,4(%edi)
+ movl %edx,8(%edi)
+ movl %esi,12(%edi)
+ movl %ebp,16(%edi)
+ movl $1,20(%edi) /* mark the context as converted */
+ movl 24(%esp),%esi /* reload input pointer (used as a temporary above) */
+ movl 28(%esp),%ecx /* reload length; NOTE(review): the andl $-16 rounding from entry is not repeated here - callers presumably pass multiples of 16, confirm */
+.L023base2_26:
+ movl 32(%esp),%eax /* fourth argument, read before %esp is moved */
+ movl %esp,%ebp /* %ebp keeps the original stack pointer until .L029done */
+ subl $448,%esp
+ andl $-512,%esp /* scratch frame aligned to 512 bytes */
+ vmovdqu 48(%edi),%xmm0 /* table rows 0..4 written by _poly1305_init_avx2 */
+ leal 288(%esp),%edx /* %edx = middle of the expanded table, so rows sit at -128..+128(%edx) */
+ vmovdqu 64(%edi),%xmm1
+ vmovdqu 80(%edi),%xmm2
+ vmovdqu 96(%edi),%xmm3
+ vmovdqu 112(%edi),%xmm4
+ leal 48(%edi),%edi /* %edi = table base; accumulator limbs are now at -48..-32(%edi) */
+ vpermq $64,%ymm0,%ymm0 /* $64 then $200: every 64-bit lane gets entry 0 in its low dword; high dwords get entries 0,1,2,3 */
+ vpermq $64,%ymm1,%ymm1
+ vpermq $64,%ymm2,%ymm2
+ vpermq $64,%ymm3,%ymm3
+ vpermq $64,%ymm4,%ymm4
+ vpshufd $200,%ymm0,%ymm0
+ vpshufd $200,%ymm1,%ymm1
+ vpshufd $200,%ymm2,%ymm2
+ vpshufd $200,%ymm3,%ymm3
+ vpshufd $200,%ymm4,%ymm4
+ vmovdqa %ymm0,-128(%edx)
+ vmovdqu 80(%edi),%xmm0 /* table rows 5..8 (the 5*limb values) */
+ vmovdqa %ymm1,-96(%edx)
+ vmovdqu 96(%edi),%xmm1
+ vmovdqa %ymm2,-64(%edx)
+ vmovdqu 112(%edi),%xmm2
+ vmovdqa %ymm3,-32(%edx)
+ vmovdqu 128(%edi),%xmm3
+ vmovdqa %ymm4,(%edx)
+ vpermq $64,%ymm0,%ymm0
+ vpermq $64,%ymm1,%ymm1
+ vpermq $64,%ymm2,%ymm2
+ vpermq $64,%ymm3,%ymm3
+ vpshufd $200,%ymm0,%ymm0
+ vpshufd $200,%ymm1,%ymm1
+ vpshufd $200,%ymm2,%ymm2
+ vpshufd $200,%ymm3,%ymm3
+ vmovdqa %ymm0,32(%edx)
+ vmovd -48(%edi),%xmm0 /* accumulator limbs 0..4 into lane 0 of %xmm0..%xmm4 */
+ vmovdqa %ymm1,64(%edx)
+ vmovd -44(%edi),%xmm1
+ vmovdqa %ymm2,96(%edx)
+ vmovd -40(%edi),%xmm2
+ vmovdqa %ymm3,128(%edx)
+ vmovd -36(%edi),%xmm3
+ vmovd -32(%edi),%xmm4
+ vmovdqa 64(%ebx),%ymm7 /* limb mask from the constant table */
+ negl %eax /* negated fourth argument, used as an index in .L025one */
+ testl $63,%ecx
+ jz .L024even /* length is a multiple of 64: go straight to four-block groups */
+ movl %ecx,%edx
+ andl $-64,%ecx /* %ecx = bytes left after the leading partial group */
+ andl $63,%edx /* %edx = bytes in the leading group of one, two or three blocks */
+ vmovdqu (%esi),%xmm5
+ cmpl $32,%edx
+ jb .L025one
+ vmovdqu 16(%esi),%xmm6
+ je .L026two
+ vinserti128 $1,32(%esi),%ymm5,%ymm5 /* three blocks: third block goes into the upper half of %ymm5 */
+ leal 48(%esi),%esi
+ leal 8(%ebx),%ebx /* offset the constant pointer and the table pointer by one 64-bit lane */
+ leal 296(%esp),%edx
+ jmp .L027tail
+.L026two:
+ leal 32(%esi),%esi
+ leal 16(%ebx),%ebx /* offset by two lanes */
+ leal 304(%esp),%edx
+ jmp .L027tail
+.L025one:
+ leal 16(%esi),%esi
+ vpxor %ymm6,%ymm6,%ymm6 /* no second block */
+ leal 32(%ebx,%eax,8),%ebx /* constant pointer = table+32 minus 8 times the fourth argument */
+ leal 312(%esp),%edx /* table pointer offset by three lanes */
+ jmp .L027tail
+.align 32
+.L024even:
+ vmovdqu (%esi),%xmm5 /* load four 16-byte blocks: blocks 0,2 in %ymm5 and blocks 1,3 in %ymm6 */
+ vmovdqu 16(%esi),%xmm6
+ vinserti128 $1,32(%esi),%ymm5,%ymm5
+ vinserti128 $1,48(%esi),%ymm6,%ymm6
+ leal 64(%esi),%esi
+ subl $64,%ecx
+ jz .L027tail /* last group is handled by the tail */
+.L028loop:
+ vmovdqa %ymm2,64(%esp) /* spill accumulator limbs 0..2 while the input is split into limbs */
+ vpsrldq $6,%ymm5,%ymm2
+ vmovdqa %ymm0,(%esp)
+ vpsrldq $6,%ymm6,%ymm0
+ vmovdqa %ymm1,32(%esp)
+ vpunpckhqdq %ymm6,%ymm5,%ymm1 /* high 64 bits of each block */
+ vpunpcklqdq %ymm6,%ymm5,%ymm5 /* low 64 bits of each block */
+ vpunpcklqdq %ymm0,%ymm2,%ymm2 /* bits 48..111 of each block */
+ vpsrlq $30,%ymm2,%ymm0
+ vpsrlq $4,%ymm2,%ymm2
+ vpsrlq $26,%ymm5,%ymm6
+ vpsrlq $40,%ymm1,%ymm1
+ vpand %ymm7,%ymm2,%ymm2
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpor (%ebx),%ymm1,%ymm1 /* OR the per-lane constant (2^24 at the table start) into limb 4 */
+ vpaddq 64(%esp),%ymm2,%ymm2 /* add the accumulator: %ymm5,%ymm6,%ymm2,%ymm0,%ymm1 = limbs 0..4 */
+ vpaddq (%esp),%ymm5,%ymm5
+ vpaddq 32(%esp),%ymm6,%ymm6
+ vpaddq %ymm3,%ymm0,%ymm0
+ vpaddq %ymm4,%ymm1,%ymm1
+ vpmuludq -96(%edx),%ymm2,%ymm3 /* multiply by the table at aligned offsets: low dwords, the same entry in all four lanes */
+ vmovdqa %ymm6,32(%esp)
+ vpmuludq -64(%edx),%ymm2,%ymm4
+ vmovdqa %ymm0,96(%esp)
+ vpmuludq 96(%edx),%ymm2,%ymm0
+ vmovdqa %ymm1,128(%esp)
+ vpmuludq 128(%edx),%ymm2,%ymm1
+ vpmuludq -128(%edx),%ymm2,%ymm2
+ vpmuludq -32(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq (%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpmuludq -128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm0,%ymm0
+ vmovdqa 32(%esp),%ymm7
+ vpmuludq -96(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq -64(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpmuludq -64(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpmuludq -32(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpmuludq 128(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vmovdqa 96(%esp),%ymm6
+ vpmuludq -128(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpmuludq -96(%edx),%ymm7,%ymm7
+ vpaddq %ymm7,%ymm2,%ymm2
+ vpmuludq -128(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpmuludq -96(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vpmuludq 64(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm0,%ymm0
+ vmovdqa 128(%esp),%ymm5
+ vpmuludq 96(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm1,%ymm1
+ vpmuludq 128(%edx),%ymm6,%ymm6
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpmuludq 128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 32(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpmuludq -128(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vmovdqa 64(%ebx),%ymm7 /* reload the limb mask (%ymm7 was used as a temporary) */
+ vpmuludq 64(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq 96(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpsrlq $26,%ymm3,%ymm5 /* carry propagation between limbs, 26 bits each */
+ vpand %ymm7,%ymm3,%ymm3
+ vpsrlq $26,%ymm0,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpsrlq $26,%ymm4,%ymm5
+ vpand %ymm7,%ymm4,%ymm4
+ vpsrlq $26,%ymm1,%ymm6
+ vpand %ymm7,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0 /* carry out of limb 4 re-enters limb 0 times 5: once here ... */
+ vpsllq $2,%ymm5,%ymm5
+ vpsrlq $26,%ymm2,%ymm6
+ vpand %ymm7,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0 /* ... and once more shifted left by 2 */
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrlq $26,%ymm3,%ymm6
+ vpsrlq $26,%ymm0,%ymm5
+ vpand %ymm7,%ymm0,%ymm0
+ vpand %ymm7,%ymm3,%ymm3
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm4,%ymm4
+ vmovdqu (%esi),%xmm5 /* fetch the next four blocks */
+ vmovdqu 16(%esi),%xmm6
+ vinserti128 $1,32(%esi),%ymm5,%ymm5
+ vinserti128 $1,48(%esi),%ymm6,%ymm6
+ leal 64(%esi),%esi
+ subl $64,%ecx
+ jnz .L028loop
+.L027tail: /* same computation as the loop body, but with per-lane table entries and a final sum across lanes */
+ vmovdqa %ymm2,64(%esp)
+ vpsrldq $6,%ymm5,%ymm2
+ vmovdqa %ymm0,(%esp)
+ vpsrldq $6,%ymm6,%ymm0
+ vmovdqa %ymm1,32(%esp)
+ vpunpckhqdq %ymm6,%ymm5,%ymm1
+ vpunpcklqdq %ymm6,%ymm5,%ymm5
+ vpunpcklqdq %ymm0,%ymm2,%ymm2
+ vpsrlq $30,%ymm2,%ymm0
+ vpsrlq $4,%ymm2,%ymm2
+ vpsrlq $26,%ymm5,%ymm6
+ vpsrlq $40,%ymm1,%ymm1
+ vpand %ymm7,%ymm2,%ymm2
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpor (%ebx),%ymm1,%ymm1
+ andl $-64,%ebx /* round %ebx back down to the 64-byte aligned .Lconst_sse2 base, undoing the partial-group offset */
+ vpaddq 64(%esp),%ymm2,%ymm2
+ vpaddq (%esp),%ymm5,%ymm5
+ vpaddq 32(%esp),%ymm6,%ymm6
+ vpaddq %ymm3,%ymm0,%ymm0
+ vpaddq %ymm4,%ymm1,%ymm1
+ vpmuludq -92(%edx),%ymm2,%ymm3 /* table read 4 bytes higher than in the loop: each lane multiplies by the high-dword entry of its own slot */
+ vmovdqa %ymm6,32(%esp)
+ vpmuludq -60(%edx),%ymm2,%ymm4
+ vmovdqa %ymm0,96(%esp)
+ vpmuludq 100(%edx),%ymm2,%ymm0
+ vmovdqa %ymm1,128(%esp)
+ vpmuludq 132(%edx),%ymm2,%ymm1
+ vpmuludq -124(%edx),%ymm2,%ymm2
+ vpmuludq -28(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 4(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpmuludq -124(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm0,%ymm0
+ vmovdqa 32(%esp),%ymm7
+ vpmuludq -92(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq -60(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpmuludq -60(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpmuludq -28(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpmuludq 132(%edx),%ymm7,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vmovdqa 96(%esp),%ymm6
+ vpmuludq -124(%edx),%ymm7,%ymm5
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpmuludq -92(%edx),%ymm7,%ymm7
+ vpaddq %ymm7,%ymm2,%ymm2
+ vpmuludq -124(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpmuludq -92(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vpmuludq 68(%edx),%ymm6,%ymm5
+ vpaddq %ymm5,%ymm0,%ymm0
+ vmovdqa 128(%esp),%ymm5
+ vpmuludq 100(%edx),%ymm6,%ymm7
+ vpaddq %ymm7,%ymm1,%ymm1
+ vpmuludq 132(%edx),%ymm6,%ymm6
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpmuludq 132(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm3,%ymm3
+ vpmuludq 36(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpmuludq -124(%edx),%ymm5,%ymm7
+ vpaddq %ymm7,%ymm4,%ymm4
+ vmovdqa 64(%ebx),%ymm7 /* limb mask, read through the re-aligned %ebx */
+ vpmuludq 68(%edx),%ymm5,%ymm6
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpmuludq 100(%edx),%ymm5,%ymm5
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpsrldq $8,%ymm4,%ymm5 /* horizontal sum, step 1: add the odd 64-bit lane of each 128-bit half onto the even one */
+ vpsrldq $8,%ymm3,%ymm6
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpsrldq $8,%ymm0,%ymm5
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrldq $8,%ymm1,%ymm6
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpsrldq $8,%ymm2,%ymm5
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpermq $2,%ymm4,%ymm6 /* step 2: bring 64-bit lane 2 down to lane 0 and add, so lane 0 holds the sum of all four lanes */
+ vpaddq %ymm5,%ymm2,%ymm2
+ vpermq $2,%ymm3,%ymm5
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpermq $2,%ymm0,%ymm6
+ vpaddq %ymm5,%ymm3,%ymm3
+ vpermq $2,%ymm1,%ymm5
+ vpaddq %ymm6,%ymm0,%ymm0
+ vpermq $2,%ymm2,%ymm6
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpsrlq $26,%ymm3,%ymm5 /* carry propagation, same sequence as in the loop */
+ vpand %ymm7,%ymm3,%ymm3
+ vpsrlq $26,%ymm0,%ymm6
+ vpand %ymm7,%ymm0,%ymm0
+ vpaddq %ymm5,%ymm4,%ymm4
+ vpaddq %ymm6,%ymm1,%ymm1
+ vpsrlq $26,%ymm4,%ymm5
+ vpand %ymm7,%ymm4,%ymm4
+ vpsrlq $26,%ymm1,%ymm6
+ vpand %ymm7,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpsllq $2,%ymm5,%ymm5
+ vpsrlq $26,%ymm2,%ymm6
+ vpand %ymm7,%ymm2,%ymm2
+ vpaddq %ymm5,%ymm0,%ymm0
+ vpaddq %ymm6,%ymm3,%ymm3
+ vpsrlq $26,%ymm3,%ymm6
+ vpsrlq $26,%ymm0,%ymm5
+ vpand %ymm7,%ymm0,%ymm0
+ vpand %ymm7,%ymm3,%ymm3
+ vpaddq %ymm5,%ymm1,%ymm1
+ vpaddq %ymm6,%ymm4,%ymm4
+ cmpl $0,%ecx
+ je .L029done /* nothing left after this group */
+ vpshufd $252,%xmm0,%xmm0 /* keep dword 0 of each limb; dwords 1..3 take dword 3 (presumably zero after masking - confirm) */
+ leal 288(%esp),%edx /* reset the table pointer to its unshifted position */
+ vpshufd $252,%xmm1,%xmm1
+ vpshufd $252,%xmm2,%xmm2
+ vpshufd $252,%xmm3,%xmm3
+ vpshufd $252,%xmm4,%xmm4
+ jmp .L024even /* continue with the remaining full 64-byte groups */
+.align 16
+.L029done:
+ vmovd %xmm0,-48(%edi) /* write accumulator limbs 0..4 back to context+0..16 */
+ vmovd %xmm1,-44(%edi)
+ vmovd %xmm2,-40(%edi)
+ vmovd %xmm3,-36(%edi)
+ vmovd %xmm4,-32(%edi)
+ vzeroupper
+ movl %ebp,%esp /* drop the aligned scratch frame */
+.L020nodata:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2
.align 64
.Lconst_sse2:
.long 16777216,0,16777216,0,16777216,0,16777216,0
diff --git a/secure/lib/libcrypto/i386/sha1-586.S b/secure/lib/libcrypto/i386/sha1-586.S
index 49e7482b8161..7e90e2d9b1d2 100644
--- a/secure/lib/libcrypto/i386/sha1-586.S
+++ b/secure/lib/libcrypto/i386/sha1-586.S
@@ -25,6 +25,11 @@ sha1_block_data_order:
jz .L001x86
testl $536870912,%ecx
jnz .Lshaext_shortcut
+ andl $268435456,%edx
+ andl $1073741824,%eax
+ orl %edx,%eax
+ cmpl $1342177280,%eax
+ je .Lavx_shortcut
jmp .Lssse3_shortcut
.align 16
.L001x86:
@@ -2782,6 +2787,1176 @@ _sha1_block_data_order_ssse3:
popl %ebp
ret
.size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3
+.type _sha1_block_data_order_avx,@function
+.align 16
+_sha1_block_data_order_avx:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call .L008pic_point
+.L008pic_point:
+ popl %ebp
+ leal .LK_XX_XX-.L008pic_point(%ebp),%ebp
+.Lavx_shortcut:
+ vzeroall
+ vmovdqa (%ebp),%xmm7
+ vmovdqa 16(%ebp),%xmm0
+ vmovdqa 32(%ebp),%xmm1
+ vmovdqa 48(%ebp),%xmm2
+ vmovdqa 64(%ebp),%xmm6
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebp
+ movl 28(%esp),%edx
+ movl %esp,%esi
+ subl $208,%esp
+ andl $-64,%esp
+ vmovdqa %xmm0,112(%esp)
+ vmovdqa %xmm1,128(%esp)
+ vmovdqa %xmm2,144(%esp)
+ shll $6,%edx
+ vmovdqa %xmm7,160(%esp)
+ addl %ebp,%edx
+ vmovdqa %xmm6,176(%esp)
+ addl $64,%ebp
+ movl %edi,192(%esp)
+ movl %ebp,196(%esp)
+ movl %edx,200(%esp)
+ movl %esi,204(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl 16(%edi),%edi
+ movl %ebx,%esi
+ vmovdqu -64(%ebp),%xmm0
+ vmovdqu -48(%ebp),%xmm1
+ vmovdqu -32(%ebp),%xmm2
+ vmovdqu -16(%ebp),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vmovdqa %xmm7,96(%esp)
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm7,%xmm0,%xmm4
+ vpaddd %xmm7,%xmm1,%xmm5
+ vpaddd %xmm7,%xmm2,%xmm6
+ vmovdqa %xmm4,(%esp)
+ movl %ecx,%ebp
+ vmovdqa %xmm5,16(%esp)
+ xorl %edx,%ebp
+ vmovdqa %xmm6,32(%esp)
+ andl %ebp,%esi
+ jmp .L009loop
+.align 16
+.L009loop:
+ shrdl $2,%ebx,%ebx
+ xorl %edx,%esi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ movl %eax,%ebp
+ addl (%esp),%edi
+ vpaddd %xmm3,%xmm7,%xmm7
+ vmovdqa %xmm0,64(%esp)
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrldq $4,%xmm3,%xmm6
+ addl %esi,%edi
+ andl %ebx,%ebp
+ vpxor %xmm0,%xmm4,%xmm4
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ shrdl $7,%eax,%eax
+ xorl %ecx,%ebp
+ vmovdqa %xmm7,48(%esp)
+ movl %edi,%esi
+ addl 4(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ andl %eax,%esi
+ vpsrld $31,%xmm4,%xmm6
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%esi
+ vpslldq $12,%xmm4,%xmm0
+ vpaddd %xmm4,%xmm4,%xmm4
+ movl %edx,%ebp
+ addl 8(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm0,%xmm7
+ vpor %xmm6,%xmm4,%xmm4
+ addl %esi,%ecx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpslld $2,%xmm0,%xmm0
+ shrdl $7,%edx,%edx
+ xorl %eax,%ebp
+ vpxor %xmm7,%xmm4,%xmm4
+ movl %ecx,%esi
+ addl 12(%esp),%ebx
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vpxor %xmm0,%xmm4,%xmm4
+ addl %ebp,%ebx
+ andl %edx,%esi
+ vmovdqa 96(%esp),%xmm0
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ movl %ebx,%ebp
+ addl 16(%esp),%eax
+ vpaddd %xmm4,%xmm0,%xmm0
+ vmovdqa %xmm1,80(%esp)
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrldq $4,%xmm4,%xmm7
+ addl %esi,%eax
+ andl %ecx,%ebp
+ vpxor %xmm1,%xmm5,%xmm5
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm3,%xmm7,%xmm7
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%ebp
+ vmovdqa %xmm0,(%esp)
+ movl %eax,%esi
+ addl 20(%esp),%edi
+ vpxor %xmm7,%xmm5,%xmm5
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ andl %ebx,%esi
+ vpsrld $31,%xmm5,%xmm7
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ vpslldq $12,%xmm5,%xmm1
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %edi,%ebp
+ addl 24(%esp),%edx
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm1,%xmm0
+ vpor %xmm7,%xmm5,%xmm5
+ addl %esi,%edx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm1,%xmm1
+ shrdl $7,%edi,%edi
+ xorl %ebx,%ebp
+ vpxor %xmm0,%xmm5,%xmm5
+ movl %edx,%esi
+ addl 28(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm1,%xmm5,%xmm5
+ addl %ebp,%ecx
+ andl %edi,%esi
+ vmovdqa 112(%esp),%xmm1
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ movl %ecx,%ebp
+ addl 32(%esp),%ebx
+ vpaddd %xmm5,%xmm1,%xmm1
+ vmovdqa %xmm2,96(%esp)
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vpsrldq $4,%xmm5,%xmm0
+ addl %esi,%ebx
+ andl %edx,%ebp
+ vpxor %xmm2,%xmm6,%xmm6
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%ebp
+ vmovdqa %xmm1,16(%esp)
+ movl %ebx,%esi
+ addl 36(%esp),%eax
+ vpxor %xmm0,%xmm6,%xmm6
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ andl %ecx,%esi
+ vpsrld $31,%xmm6,%xmm0
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%esi
+ vpslldq $12,%xmm6,%xmm2
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%ebp
+ addl 40(%esp),%edi
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm1
+ vpor %xmm0,%xmm6,%xmm6
+ addl %esi,%edi
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpslld $2,%xmm2,%xmm2
+ vmovdqa 64(%esp),%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%ebp
+ vpxor %xmm1,%xmm6,%xmm6
+ movl %edi,%esi
+ addl 44(%esp),%edx
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ addl %ebp,%edx
+ andl %eax,%esi
+ vmovdqa 112(%esp),%xmm2
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%esi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ movl %edx,%ebp
+ addl 48(%esp),%ecx
+ vpaddd %xmm6,%xmm2,%xmm2
+ vmovdqa %xmm3,64(%esp)
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpsrldq $4,%xmm6,%xmm1
+ addl %esi,%ecx
+ andl %edi,%ebp
+ vpxor %xmm3,%xmm7,%xmm7
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpxor %xmm5,%xmm1,%xmm1
+ shrdl $7,%edx,%edx
+ xorl %eax,%ebp
+ vmovdqa %xmm2,32(%esp)
+ movl %ecx,%esi
+ addl 52(%esp),%ebx
+ vpxor %xmm1,%xmm7,%xmm7
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ andl %edx,%esi
+ vpsrld $31,%xmm7,%xmm1
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ vpslldq $12,%xmm7,%xmm3
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%ebp
+ addl 56(%esp),%eax
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm2
+ vpor %xmm1,%xmm7,%xmm7
+ addl %esi,%eax
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ vmovdqa 80(%esp),%xmm1
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%ebp
+ vpxor %xmm2,%xmm7,%xmm7
+ movl %eax,%esi
+ addl 60(%esp),%edi
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpxor %xmm3,%xmm7,%xmm7
+ addl %ebp,%edi
+ andl %ebx,%esi
+ vmovdqa 112(%esp),%xmm3
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm6,%xmm7,%xmm2
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ movl %edi,%ebp
+ addl (%esp),%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ vmovdqa %xmm4,80(%esp)
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vmovdqa %xmm3,%xmm4
+ vpaddd %xmm7,%xmm3,%xmm3
+ addl %esi,%edx
+ andl %eax,%ebp
+ vpxor %xmm2,%xmm0,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%ebp
+ vpsrld $30,%xmm0,%xmm2
+ vmovdqa %xmm3,48(%esp)
+ movl %edx,%esi
+ addl 4(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpslld $2,%xmm0,%xmm0
+ addl %ebp,%ecx
+ andl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ addl 8(%esp),%ebx
+ vpor %xmm2,%xmm0,%xmm0
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vmovdqa 96(%esp),%xmm2
+ addl %esi,%ebx
+ andl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 12(%esp),%eax
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm3
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm5,96(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ vmovdqa %xmm4,%xmm5
+ vpaddd %xmm0,%xmm4,%xmm4
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpxor %xmm3,%xmm1,%xmm1
+ addl 20(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm1,%xmm3
+ vmovdqa %xmm4,(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpor %xmm3,%xmm1,%xmm1
+ addl 28(%esp),%ebx
+ xorl %edi,%ebp
+ vmovdqa 64(%esp),%xmm3
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ vmovdqa %xmm6,64(%esp)
+ addl %esi,%eax
+ xorl %edx,%ebp
+ vmovdqa 128(%esp),%xmm6
+ vpaddd %xmm1,%xmm5,%xmm5
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm4,%xmm2,%xmm2
+ addl 36(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm4
+ vmovdqa %xmm5,16(%esp)
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpor %xmm4,%xmm2,%xmm2
+ addl 44(%esp),%ecx
+ xorl %eax,%ebp
+ vmovdqa 80(%esp),%xmm4
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ vmovdqa %xmm7,80(%esp)
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ vmovdqa %xmm6,%xmm7
+ vpaddd %xmm2,%xmm6,%xmm6
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpxor %xmm5,%xmm3,%xmm3
+ addl 52(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm5
+ vmovdqa %xmm6,32(%esp)
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpor %xmm5,%xmm3,%xmm3
+ addl 60(%esp),%edx
+ xorl %ebx,%ebp
+ vmovdqa 96(%esp),%xmm5
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ vpxor %xmm0,%xmm4,%xmm4
+ addl (%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ vmovdqa %xmm0,96(%esp)
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ vmovdqa %xmm7,%xmm0
+ vpaddd %xmm3,%xmm7,%xmm7
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpxor %xmm6,%xmm4,%xmm4
+ addl 4(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm6
+ vmovdqa %xmm7,48(%esp)
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpor %xmm6,%xmm4,%xmm4
+ addl 12(%esp),%edi
+ xorl %ecx,%ebp
+ vmovdqa 64(%esp),%xmm6
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ vpxor %xmm6,%xmm5,%xmm5
+ vmovdqa %xmm1,64(%esp)
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ vmovdqa %xmm0,%xmm1
+ vpaddd %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpxor %xmm7,%xmm5,%xmm5
+ addl 20(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm7
+ vmovdqa %xmm0,(%esp)
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpor %xmm7,%xmm5,%xmm5
+ addl 28(%esp),%eax
+ vmovdqa 80(%esp),%xmm7
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ addl 32(%esp),%edi
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovdqa %xmm2,80(%esp)
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ vmovdqa %xmm1,%xmm2
+ vpaddd %xmm5,%xmm1,%xmm1
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ vpxor %xmm0,%xmm6,%xmm6
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 36(%esp),%edx
+ vpsrld $30,%xmm6,%xmm0
+ vmovdqa %xmm1,16(%esp)
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %edi,%esi
+ vpslld $2,%xmm6,%xmm6
+ xorl %ebx,%ebp
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 40(%esp),%ecx
+ andl %eax,%esi
+ vpor %xmm0,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ vmovdqa 96(%esp),%xmm0
+ movl %edx,%ebp
+ xorl %eax,%esi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 44(%esp),%ebx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm1
+ vpxor %xmm3,%xmm7,%xmm7
+ addl 48(%esp),%eax
+ andl %edx,%esi
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ vpxor %xmm0,%xmm7,%xmm7
+ vmovdqa %xmm3,96(%esp)
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ vmovdqa 144(%esp),%xmm3
+ vpaddd %xmm6,%xmm2,%xmm2
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vpxor %xmm1,%xmm7,%xmm7
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%esp),%edi
+ vpsrld $30,%xmm7,%xmm1
+ vmovdqa %xmm2,32(%esp)
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ vpslld $2,%xmm7,%xmm7
+ xorl %ecx,%ebp
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 56(%esp),%edx
+ andl %ebx,%esi
+ vpor %xmm1,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vmovdqa 64(%esp),%xmm1
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 60(%esp),%ecx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm2
+ vpxor %xmm4,%xmm0,%xmm0
+ addl (%esp),%ebx
+ andl %edi,%esi
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ vmovdqa %xmm4,64(%esp)
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ vmovdqa %xmm3,%xmm4
+ vpaddd %xmm7,%xmm3,%xmm3
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ vpxor %xmm2,%xmm0,%xmm0
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 4(%esp),%eax
+ vpsrld $30,%xmm0,%xmm2
+ vmovdqa %xmm3,48(%esp)
+ andl %edx,%ebp
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ vpslld $2,%xmm0,%xmm0
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%esp),%edi
+ andl %ecx,%esi
+ vpor %xmm2,%xmm0,%xmm0
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vmovdqa 80(%esp),%xmm2
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 12(%esp),%edx
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm3
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%esp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm5,80(%esp)
+ movl %edx,%ebp
+ xorl %eax,%esi
+ vmovdqa %xmm4,%xmm5
+ vpaddd %xmm0,%xmm4,%xmm4
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vpxor %xmm3,%xmm1,%xmm1
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 20(%esp),%ebx
+ vpsrld $30,%xmm1,%xmm3
+ vmovdqa %xmm4,(%esp)
+ andl %edi,%ebp
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ vpslld $2,%xmm1,%xmm1
+ xorl %edi,%ebp
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 24(%esp),%eax
+ andl %edx,%esi
+ vpor %xmm3,%xmm1,%xmm1
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ vmovdqa 96(%esp),%xmm3
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%esp),%edi
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%esp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vpxor %xmm3,%xmm2,%xmm2
+ vmovdqa %xmm6,96(%esp)
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ vmovdqa %xmm5,%xmm6
+ vpaddd %xmm1,%xmm5,%xmm5
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ vpxor %xmm4,%xmm2,%xmm2
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 36(%esp),%ecx
+ vpsrld $30,%xmm2,%xmm4
+ vmovdqa %xmm5,16(%esp)
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ movl %edx,%esi
+ vpslld $2,%xmm2,%xmm2
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 40(%esp),%ebx
+ andl %edi,%esi
+ vpor %xmm4,%xmm2,%xmm2
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ vmovdqa 64(%esp),%xmm4
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 44(%esp),%eax
+ andl %edx,%ebp
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ vmovdqa %xmm7,64(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ vmovdqa %xmm6,%xmm7
+ vpaddd %xmm2,%xmm6,%xmm6
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpxor %xmm5,%xmm3,%xmm3
+ addl 52(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm3,%xmm5
+ vmovdqa %xmm6,32(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpor %xmm5,%xmm3,%xmm3
+ addl 60(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl (%esp),%eax
+ vpaddd %xmm3,%xmm7,%xmm7
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vmovdqa %xmm7,48(%esp)
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 4(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 8(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 12(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ movl 196(%esp),%ebp
+ cmpl 200(%esp),%ebp
+ je .L010done
+ vmovdqa 160(%esp),%xmm7
+ vmovdqa 176(%esp),%xmm6
+ vmovdqu (%ebp),%xmm0
+ vmovdqu 16(%ebp),%xmm1
+ vmovdqu 32(%ebp),%xmm2
+ vmovdqu 48(%ebp),%xmm3
+ addl $64,%ebp
+ vpshufb %xmm6,%xmm0,%xmm0
+ movl %ebp,196(%esp)
+ vmovdqa %xmm7,96(%esp)
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ vpshufb %xmm6,%xmm1,%xmm1
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm7,%xmm0,%xmm4
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vmovdqa %xmm4,(%esp)
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ vpshufb %xmm6,%xmm2,%xmm2
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ vpaddd %xmm7,%xmm1,%xmm5
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vmovdqa %xmm5,16(%esp)
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ vpshufb %xmm6,%xmm3,%xmm3
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ vpaddd %xmm7,%xmm2,%xmm6
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vmovdqa %xmm6,32(%esp)
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,%ebx
+ movl %ecx,8(%ebp)
+ xorl %edx,%ebx
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ movl %esi,%ebp
+ andl %ebx,%esi
+ movl %ebp,%ebx
+ jmp .L009loop
+.align 16
+.L010done:
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vzeroall
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ movl 204(%esp),%esp
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _sha1_block_data_order_avx,.-_sha1_block_data_order_avx
.align 64
.LK_XX_XX:
.long 1518500249,1518500249,1518500249,1518500249
@@ -2820,6 +3995,11 @@ sha1_block_data_order:
jz .L001x86
testl $536870912,%ecx
jnz .Lshaext_shortcut
+ andl $268435456,%edx
+ andl $1073741824,%eax
+ orl %edx,%eax
+ cmpl $1342177280,%eax
+ je .Lavx_shortcut
jmp .Lssse3_shortcut
.align 16
.L001x86:
@@ -5577,6 +6757,1176 @@ _sha1_block_data_order_ssse3:
popl %ebp
ret
.size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3
+.type _sha1_block_data_order_avx,@function
+.align 16
+_sha1_block_data_order_avx:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call .L008pic_point
+.L008pic_point:
+ popl %ebp
+ leal .LK_XX_XX-.L008pic_point(%ebp),%ebp
+.Lavx_shortcut:
+ vzeroall
+ vmovdqa (%ebp),%xmm7
+ vmovdqa 16(%ebp),%xmm0
+ vmovdqa 32(%ebp),%xmm1
+ vmovdqa 48(%ebp),%xmm2
+ vmovdqa 64(%ebp),%xmm6
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebp
+ movl 28(%esp),%edx
+ movl %esp,%esi
+ subl $208,%esp
+ andl $-64,%esp
+ vmovdqa %xmm0,112(%esp)
+ vmovdqa %xmm1,128(%esp)
+ vmovdqa %xmm2,144(%esp)
+ shll $6,%edx
+ vmovdqa %xmm7,160(%esp)
+ addl %ebp,%edx
+ vmovdqa %xmm6,176(%esp)
+ addl $64,%ebp
+ movl %edi,192(%esp)
+ movl %ebp,196(%esp)
+ movl %edx,200(%esp)
+ movl %esi,204(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl 16(%edi),%edi
+ movl %ebx,%esi
+ vmovdqu -64(%ebp),%xmm0
+ vmovdqu -48(%ebp),%xmm1
+ vmovdqu -32(%ebp),%xmm2
+ vmovdqu -16(%ebp),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vmovdqa %xmm7,96(%esp)
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm7,%xmm0,%xmm4
+ vpaddd %xmm7,%xmm1,%xmm5
+ vpaddd %xmm7,%xmm2,%xmm6
+ vmovdqa %xmm4,(%esp)
+ movl %ecx,%ebp
+ vmovdqa %xmm5,16(%esp)
+ xorl %edx,%ebp
+ vmovdqa %xmm6,32(%esp)
+ andl %ebp,%esi
+ jmp .L009loop
+.align 16
+.L009loop:
+ shrdl $2,%ebx,%ebx
+ xorl %edx,%esi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ movl %eax,%ebp
+ addl (%esp),%edi
+ vpaddd %xmm3,%xmm7,%xmm7
+ vmovdqa %xmm0,64(%esp)
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrldq $4,%xmm3,%xmm6
+ addl %esi,%edi
+ andl %ebx,%ebp
+ vpxor %xmm0,%xmm4,%xmm4
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ shrdl $7,%eax,%eax
+ xorl %ecx,%ebp
+ vmovdqa %xmm7,48(%esp)
+ movl %edi,%esi
+ addl 4(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ andl %eax,%esi
+ vpsrld $31,%xmm4,%xmm6
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%esi
+ vpslldq $12,%xmm4,%xmm0
+ vpaddd %xmm4,%xmm4,%xmm4
+ movl %edx,%ebp
+ addl 8(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm0,%xmm7
+ vpor %xmm6,%xmm4,%xmm4
+ addl %esi,%ecx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpslld $2,%xmm0,%xmm0
+ shrdl $7,%edx,%edx
+ xorl %eax,%ebp
+ vpxor %xmm7,%xmm4,%xmm4
+ movl %ecx,%esi
+ addl 12(%esp),%ebx
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vpxor %xmm0,%xmm4,%xmm4
+ addl %ebp,%ebx
+ andl %edx,%esi
+ vmovdqa 96(%esp),%xmm0
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ movl %ebx,%ebp
+ addl 16(%esp),%eax
+ vpaddd %xmm4,%xmm0,%xmm0
+ vmovdqa %xmm1,80(%esp)
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrldq $4,%xmm4,%xmm7
+ addl %esi,%eax
+ andl %ecx,%ebp
+ vpxor %xmm1,%xmm5,%xmm5
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm3,%xmm7,%xmm7
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%ebp
+ vmovdqa %xmm0,(%esp)
+ movl %eax,%esi
+ addl 20(%esp),%edi
+ vpxor %xmm7,%xmm5,%xmm5
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ andl %ebx,%esi
+ vpsrld $31,%xmm5,%xmm7
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ vpslldq $12,%xmm5,%xmm1
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %edi,%ebp
+ addl 24(%esp),%edx
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm1,%xmm0
+ vpor %xmm7,%xmm5,%xmm5
+ addl %esi,%edx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm1,%xmm1
+ shrdl $7,%edi,%edi
+ xorl %ebx,%ebp
+ vpxor %xmm0,%xmm5,%xmm5
+ movl %edx,%esi
+ addl 28(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm1,%xmm5,%xmm5
+ addl %ebp,%ecx
+ andl %edi,%esi
+ vmovdqa 112(%esp),%xmm1
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ movl %ecx,%ebp
+ addl 32(%esp),%ebx
+ vpaddd %xmm5,%xmm1,%xmm1
+ vmovdqa %xmm2,96(%esp)
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vpsrldq $4,%xmm5,%xmm0
+ addl %esi,%ebx
+ andl %edx,%ebp
+ vpxor %xmm2,%xmm6,%xmm6
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%ebp
+ vmovdqa %xmm1,16(%esp)
+ movl %ebx,%esi
+ addl 36(%esp),%eax
+ vpxor %xmm0,%xmm6,%xmm6
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ andl %ecx,%esi
+ vpsrld $31,%xmm6,%xmm0
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%esi
+ vpslldq $12,%xmm6,%xmm2
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%ebp
+ addl 40(%esp),%edi
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm1
+ vpor %xmm0,%xmm6,%xmm6
+ addl %esi,%edi
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpslld $2,%xmm2,%xmm2
+ vmovdqa 64(%esp),%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%ebp
+ vpxor %xmm1,%xmm6,%xmm6
+ movl %edi,%esi
+ addl 44(%esp),%edx
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ addl %ebp,%edx
+ andl %eax,%esi
+ vmovdqa 112(%esp),%xmm2
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%esi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ movl %edx,%ebp
+ addl 48(%esp),%ecx
+ vpaddd %xmm6,%xmm2,%xmm2
+ vmovdqa %xmm3,64(%esp)
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpsrldq $4,%xmm6,%xmm1
+ addl %esi,%ecx
+ andl %edi,%ebp
+ vpxor %xmm3,%xmm7,%xmm7
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpxor %xmm5,%xmm1,%xmm1
+ shrdl $7,%edx,%edx
+ xorl %eax,%ebp
+ vmovdqa %xmm2,32(%esp)
+ movl %ecx,%esi
+ addl 52(%esp),%ebx
+ vpxor %xmm1,%xmm7,%xmm7
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ andl %edx,%esi
+ vpsrld $31,%xmm7,%xmm1
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ vpslldq $12,%xmm7,%xmm3
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%ebp
+ addl 56(%esp),%eax
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm2
+ vpor %xmm1,%xmm7,%xmm7
+ addl %esi,%eax
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ vmovdqa 80(%esp),%xmm1
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%ebp
+ vpxor %xmm2,%xmm7,%xmm7
+ movl %eax,%esi
+ addl 60(%esp),%edi
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpxor %xmm3,%xmm7,%xmm7
+ addl %ebp,%edi
+ andl %ebx,%esi
+ vmovdqa 112(%esp),%xmm3
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm6,%xmm7,%xmm2
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ movl %edi,%ebp
+ addl (%esp),%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ vmovdqa %xmm4,80(%esp)
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vmovdqa %xmm3,%xmm4
+ vpaddd %xmm7,%xmm3,%xmm3
+ addl %esi,%edx
+ andl %eax,%ebp
+ vpxor %xmm2,%xmm0,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%ebp
+ vpsrld $30,%xmm0,%xmm2
+ vmovdqa %xmm3,48(%esp)
+ movl %edx,%esi
+ addl 4(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpslld $2,%xmm0,%xmm0
+ addl %ebp,%ecx
+ andl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ addl 8(%esp),%ebx
+ vpor %xmm2,%xmm0,%xmm0
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vmovdqa 96(%esp),%xmm2
+ addl %esi,%ebx
+ andl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 12(%esp),%eax
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm3
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm5,96(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ vmovdqa %xmm4,%xmm5
+ vpaddd %xmm0,%xmm4,%xmm4
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpxor %xmm3,%xmm1,%xmm1
+ addl 20(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm1,%xmm3
+ vmovdqa %xmm4,(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpor %xmm3,%xmm1,%xmm1
+ addl 28(%esp),%ebx
+ xorl %edi,%ebp
+ vmovdqa 64(%esp),%xmm3
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ vmovdqa %xmm6,64(%esp)
+ addl %esi,%eax
+ xorl %edx,%ebp
+ vmovdqa 128(%esp),%xmm6
+ vpaddd %xmm1,%xmm5,%xmm5
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm4,%xmm2,%xmm2
+ addl 36(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm4
+ vmovdqa %xmm5,16(%esp)
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpor %xmm4,%xmm2,%xmm2
+ addl 44(%esp),%ecx
+ xorl %eax,%ebp
+ vmovdqa 80(%esp),%xmm4
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ vmovdqa %xmm7,80(%esp)
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ vmovdqa %xmm6,%xmm7
+ vpaddd %xmm2,%xmm6,%xmm6
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpxor %xmm5,%xmm3,%xmm3
+ addl 52(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm5
+ vmovdqa %xmm6,32(%esp)
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpor %xmm5,%xmm3,%xmm3
+ addl 60(%esp),%edx
+ xorl %ebx,%ebp
+ vmovdqa 96(%esp),%xmm5
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ vpxor %xmm0,%xmm4,%xmm4
+ addl (%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ vmovdqa %xmm0,96(%esp)
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ vmovdqa %xmm7,%xmm0
+ vpaddd %xmm3,%xmm7,%xmm7
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpxor %xmm6,%xmm4,%xmm4
+ addl 4(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm6
+ vmovdqa %xmm7,48(%esp)
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpor %xmm6,%xmm4,%xmm4
+ addl 12(%esp),%edi
+ xorl %ecx,%ebp
+ vmovdqa 64(%esp),%xmm6
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ vpxor %xmm6,%xmm5,%xmm5
+ vmovdqa %xmm1,64(%esp)
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ vmovdqa %xmm0,%xmm1
+ vpaddd %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpxor %xmm7,%xmm5,%xmm5
+ addl 20(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm7
+ vmovdqa %xmm0,(%esp)
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpor %xmm7,%xmm5,%xmm5
+ addl 28(%esp),%eax
+ vmovdqa 80(%esp),%xmm7
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ addl 32(%esp),%edi
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovdqa %xmm2,80(%esp)
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ vmovdqa %xmm1,%xmm2
+ vpaddd %xmm5,%xmm1,%xmm1
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ vpxor %xmm0,%xmm6,%xmm6
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 36(%esp),%edx
+ vpsrld $30,%xmm6,%xmm0
+ vmovdqa %xmm1,16(%esp)
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %edi,%esi
+ vpslld $2,%xmm6,%xmm6
+ xorl %ebx,%ebp
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 40(%esp),%ecx
+ andl %eax,%esi
+ vpor %xmm0,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ vmovdqa 96(%esp),%xmm0
+ movl %edx,%ebp
+ xorl %eax,%esi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 44(%esp),%ebx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm1
+ vpxor %xmm3,%xmm7,%xmm7
+ addl 48(%esp),%eax
+ andl %edx,%esi
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ vpxor %xmm0,%xmm7,%xmm7
+ vmovdqa %xmm3,96(%esp)
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ vmovdqa 144(%esp),%xmm3
+ vpaddd %xmm6,%xmm2,%xmm2
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vpxor %xmm1,%xmm7,%xmm7
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%esp),%edi
+ vpsrld $30,%xmm7,%xmm1
+ vmovdqa %xmm2,32(%esp)
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ vpslld $2,%xmm7,%xmm7
+ xorl %ecx,%ebp
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 56(%esp),%edx
+ andl %ebx,%esi
+ vpor %xmm1,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vmovdqa 64(%esp),%xmm1
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 60(%esp),%ecx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm2
+ vpxor %xmm4,%xmm0,%xmm0
+ addl (%esp),%ebx
+ andl %edi,%esi
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ vmovdqa %xmm4,64(%esp)
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ vmovdqa %xmm3,%xmm4
+ vpaddd %xmm7,%xmm3,%xmm3
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ vpxor %xmm2,%xmm0,%xmm0
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 4(%esp),%eax
+ vpsrld $30,%xmm0,%xmm2
+ vmovdqa %xmm3,48(%esp)
+ andl %edx,%ebp
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ vpslld $2,%xmm0,%xmm0
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%esp),%edi
+ andl %ecx,%esi
+ vpor %xmm2,%xmm0,%xmm0
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vmovdqa 80(%esp),%xmm2
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 12(%esp),%edx
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm3
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%esp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm5,80(%esp)
+ movl %edx,%ebp
+ xorl %eax,%esi
+ vmovdqa %xmm4,%xmm5
+ vpaddd %xmm0,%xmm4,%xmm4
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vpxor %xmm3,%xmm1,%xmm1
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 20(%esp),%ebx
+ vpsrld $30,%xmm1,%xmm3
+ vmovdqa %xmm4,(%esp)
+ andl %edi,%ebp
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ vpslld $2,%xmm1,%xmm1
+ xorl %edi,%ebp
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 24(%esp),%eax
+ andl %edx,%esi
+ vpor %xmm3,%xmm1,%xmm1
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ vmovdqa 96(%esp),%xmm3
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%esp),%edi
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%esp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vpxor %xmm3,%xmm2,%xmm2
+ vmovdqa %xmm6,96(%esp)
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ vmovdqa %xmm5,%xmm6
+ vpaddd %xmm1,%xmm5,%xmm5
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ vpxor %xmm4,%xmm2,%xmm2
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 36(%esp),%ecx
+ vpsrld $30,%xmm2,%xmm4
+ vmovdqa %xmm5,16(%esp)
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ movl %edx,%esi
+ vpslld $2,%xmm2,%xmm2
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 40(%esp),%ebx
+ andl %edi,%esi
+ vpor %xmm4,%xmm2,%xmm2
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ vmovdqa 64(%esp),%xmm4
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 44(%esp),%eax
+ andl %edx,%ebp
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ vmovdqa %xmm7,64(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ vmovdqa %xmm6,%xmm7
+ vpaddd %xmm2,%xmm6,%xmm6
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpxor %xmm5,%xmm3,%xmm3
+ addl 52(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm3,%xmm5
+ vmovdqa %xmm6,32(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpor %xmm5,%xmm3,%xmm3
+ addl 60(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl (%esp),%eax
+ vpaddd %xmm3,%xmm7,%xmm7
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vmovdqa %xmm7,48(%esp)
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 4(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 8(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 12(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ movl 196(%esp),%ebp
+ cmpl 200(%esp),%ebp
+ je .L010done
+ vmovdqa 160(%esp),%xmm7
+ vmovdqa 176(%esp),%xmm6
+ vmovdqu (%ebp),%xmm0
+ vmovdqu 16(%ebp),%xmm1
+ vmovdqu 32(%ebp),%xmm2
+ vmovdqu 48(%ebp),%xmm3
+ addl $64,%ebp
+ vpshufb %xmm6,%xmm0,%xmm0
+ movl %ebp,196(%esp)
+ vmovdqa %xmm7,96(%esp)
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ vpshufb %xmm6,%xmm1,%xmm1
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm7,%xmm0,%xmm4
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vmovdqa %xmm4,(%esp)
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ vpshufb %xmm6,%xmm2,%xmm2
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ vpaddd %xmm7,%xmm1,%xmm5
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vmovdqa %xmm5,16(%esp)
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ vpshufb %xmm6,%xmm3,%xmm3
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ vpaddd %xmm7,%xmm2,%xmm6
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vmovdqa %xmm6,32(%esp)
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,%ebx
+ movl %ecx,8(%ebp)
+ xorl %edx,%ebx
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ movl %esi,%ebp
+ andl %ebx,%esi
+ movl %ebp,%ebx
+ jmp .L009loop
+.align 16
+.L010done: # tail of _sha1_block_data_order_avx: taken when the pointer at 196(%esp) equals the limit at 200(%esp); this path loads no new input
+ addl 16(%esp),%ebx # first of twelve scalar steps, each adding one stack dword from 16..60(%esp) (presumably precomputed schedule-plus-constant words, confirm)
+ xorl %edi,%esi
+ movl %ecx,%ebp # keep an unrotated copy of ecx for the following step
+ shldl $5,%ecx,%ecx # same register as both operands: rotate ecx left by 5
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx # same register as both operands: rotate edx right by 7
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx # ecx was rotated left by 5 above, so the net effect is a right rotate by 2
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax # last of the twelve stack dwords
+ xorl %edx,%ebp
+ movl %ebx,%esi # esi keeps ebx as it was before the in-place rotate below
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vzeroall # clear every vector register before returning
+ movl 192(%esp),%ebp # ebp = pointer kept at 192(%esp); five dwords behind it are updated below
+ addl (%ebp),%eax
+ movl 204(%esp),%esp # switch to the stack pointer saved at 204(%esp) (presumably the value from before the frame was set up, confirm in prologue); only ebp-relative accesses follow
+ addl 4(%ebp),%esi # second word uses the unrotated copy held in esi, not ebx
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp) # write the five accumulated words back through ebp
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ popl %edi # restore the four registers popped here (presumably pushed by the prologue in the opposite order)
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _sha1_block_data_order_avx,.-_sha1_block_data_order_avx
.align 64
.LK_XX_XX:
.long 1518500249,1518500249,1518500249,1518500249
diff --git a/secure/lib/libcrypto/i386/sha256-586.S b/secure/lib/libcrypto/i386/sha256-586.S
index 5d8476c1e1bb..7b4205352bdf 100644
--- a/secure/lib/libcrypto/i386/sha256-586.S
+++ b/secure/lib/libcrypto/i386/sha256-586.S
@@ -42,12 +42,13 @@ sha256_block_data_order:
orl %ebx,%ecx
andl $1342177280,%ecx
cmpl $1342177280,%ecx
+ je .L005AVX
testl $512,%ebx
- jnz .L005SSSE3
+ jnz .L006SSSE3
.L003no_xmm:
subl %edi,%eax
cmpl $256,%eax
- jae .L006unrolled
+ jae .L007unrolled
jmp .L002loop
.align 16
.L002loop:
@@ -119,7 +120,7 @@ sha256_block_data_order:
movl %ecx,28(%esp)
movl %edi,32(%esp)
.align 16
-.L00700_15:
+.L00800_15:
movl %edx,%ecx
movl 24(%esp),%esi
rorl $14,%ecx
@@ -157,11 +158,11 @@ sha256_block_data_order:
addl $4,%ebp
addl %ebx,%eax
cmpl $3248222580,%esi
- jne .L00700_15
+ jne .L00800_15
movl 156(%esp),%ecx
- jmp .L00816_63
+ jmp .L00916_63
.align 16
-.L00816_63:
+.L00916_63:
movl %ecx,%ebx
movl 104(%esp),%esi
rorl $11,%ecx
@@ -216,7 +217,7 @@ sha256_block_data_order:
addl $4,%ebp
addl %ebx,%eax
cmpl $3329325298,%esi
- jne .L00816_63
+ jne .L00916_63
movl 356(%esp),%esi
movl 8(%esp),%ebx
movl 16(%esp),%ecx
@@ -260,7 +261,7 @@ sha256_block_data_order:
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0
.align 16
-.L006unrolled:
+.L007unrolled:
leal -96(%esp),%esp
movl (%esi),%eax
movl 4(%esi),%ebp
@@ -277,9 +278,9 @@ sha256_block_data_order:
movl %ebx,20(%esp)
movl %ecx,24(%esp)
movl %esi,28(%esp)
- jmp .L009grand_loop
+ jmp .L010grand_loop
.align 16
-.L009grand_loop:
+.L010grand_loop:
movl (%edi),%ebx
movl 4(%edi),%ecx
bswap %ebx
@@ -3159,7 +3160,7 @@ sha256_block_data_order:
movl %ebx,24(%esp)
movl %ecx,28(%esp)
cmpl 104(%esp),%edi
- jb .L009grand_loop
+ jb .L010grand_loop
movl 108(%esp),%esp
popl %edi
popl %esi
@@ -3178,9 +3179,9 @@ sha256_block_data_order:
pshufd $27,%xmm2,%xmm2
.byte 102,15,58,15,202,8
punpcklqdq %xmm0,%xmm2
- jmp .L010loop_shaext
+ jmp .L011loop_shaext
.align 16
-.L010loop_shaext:
+.L011loop_shaext:
movdqu (%edi),%xmm3
movdqu 16(%edi),%xmm4
movdqu 32(%edi),%xmm5
@@ -3350,7 +3351,7 @@ sha256_block_data_order:
.byte 15,56,203,202
paddd 16(%esp),%xmm2
paddd (%esp),%xmm1
- jnz .L010loop_shaext
+ jnz .L011loop_shaext
pshufd $177,%xmm2,%xmm2
pshufd $27,%xmm1,%xmm7
pshufd $177,%xmm1,%xmm1
@@ -3365,7 +3366,7 @@ sha256_block_data_order:
popl %ebp
ret
.align 32
-.L005SSSE3:
+.L006SSSE3:
leal -96(%esp),%esp
movl (%esi),%eax
movl 4(%esi),%ebx
@@ -3384,9 +3385,9 @@ sha256_block_data_order:
movl %ecx,24(%esp)
movl %esi,28(%esp)
movdqa 256(%ebp),%xmm7
- jmp .L011grand_ssse3
+ jmp .L012grand_ssse3
.align 16
-.L011grand_ssse3:
+.L012grand_ssse3:
movdqu (%edi),%xmm0
movdqu 16(%edi),%xmm1
movdqu 32(%edi),%xmm2
@@ -3409,9 +3410,9 @@ sha256_block_data_order:
paddd %xmm3,%xmm7
movdqa %xmm6,64(%esp)
movdqa %xmm7,80(%esp)
- jmp .L012ssse3_00_47
+ jmp .L013ssse3_00_47
.align 16
-.L012ssse3_00_47:
+.L013ssse3_00_47:
addl $64,%ebp
movl %edx,%ecx
movdqa %xmm1,%xmm4
@@ -4054,7 +4055,7 @@ sha256_block_data_order:
addl %ecx,%eax
movdqa %xmm6,80(%esp)
cmpl $66051,64(%ebp)
- jne .L012ssse3_00_47
+ jne .L013ssse3_00_47
movl %edx,%ecx
rorl $14,%edx
movl 20(%esp),%esi
@@ -4568,13 +4569,2218 @@ sha256_block_data_order:
movdqa 64(%ebp),%xmm7
subl $192,%ebp
cmpl 104(%esp),%edi
- jb .L011grand_ssse3
+ jb .L012grand_ssse3
movl 108(%esp),%esp
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
+.align 32
+.L005AVX: # AVX path of sha256_block_data_order, entered when the masked value in ecx equals 1342177280
+ andl $264,%edx # 264 = 0x108: isolate bits 3 and 8 of edx (presumably CPU feature bits, likely BMI1 and BMI2; confirm where edx is loaded)
+ cmpl $264,%edx
+ je .L014AVX_BMI # both bits set: hand over to the .L014AVX_BMI variant
+ leal -96(%esp),%esp # reserve 96 bytes of stack for this path
+ vzeroall # start with every vector register cleared
+ movl (%esi),%eax # load eight state dwords from (%esi); the first stays in eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp) # spill words 1..3 to 4/8/12(%esp)
+ xorl %ecx,%ebx # ebx = word1 ^ word2, kept in a register for the rounds
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx # word 4 stays in edx
+ movl 20(%esi),%edi
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi # esi is overwritten by the last word it pointed at
+ movl %edi,20(%esp) # spill words 5..7 to 20/24/28(%esp)
+ movl 100(%esp),%edi # edi = pointer from 100(%esp); used as the load base in .L015grand_avx
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ vmovdqa 256(%ebp),%xmm7 # 16-byte constant at ebp+256, used as the vpshufb control in .L015grand_avx
+ jmp .L015grand_avx
+.align 32
+.L015grand_avx: # stage one 64-byte input chunk: load, shuffle, add table words, park results on the stack
+ vmovdqu (%edi),%xmm0 # unaligned loads of 64 bytes from the pointer in edi
+ vmovdqu 16(%edi),%xmm1
+ vmovdqu 32(%edi),%xmm2
+ vmovdqu 48(%edi),%xmm3
+ addl $64,%edi # advance past the chunk just loaded
+ vpshufb %xmm7,%xmm0,%xmm0 # permute bytes with the control in xmm7 (presumably a per-dword byte swap, confirm against the table at ebp+256)
+ movl %edi,100(%esp) # save the advanced pointer back to its stack slot
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd (%ebp),%xmm0,%xmm4 # xmm4..xmm7 = shuffled input + 64 bytes of table at (%ebp) (presumably round constants)
+ vpshufb %xmm7,%xmm3,%xmm3 # last use of the shuffle control before xmm7 is overwritten below
+ vpaddd 16(%ebp),%xmm1,%xmm5
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ vpaddd 48(%ebp),%xmm3,%xmm7 # overwrites the shuffle control held in xmm7
+ vmovdqa %xmm4,32(%esp) # park the four sums at 32..95(%esp); the scalar rounds read them with addl N(%esp)
+ vmovdqa %xmm5,48(%esp)
+ vmovdqa %xmm6,64(%esp)
+ vmovdqa %xmm7,80(%esp)
+ jmp .L016avx_00_47
+.align 16
+.L016avx_00_47:
+ addl $64,%ebp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ vpshufd $250,%xmm3,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ vpaddd %xmm4,%xmm0,%xmm0
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm0,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm0,%xmm0
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ vpaddd (%ebp),%xmm0,%xmm6
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,32(%esp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ vpshufd $250,%xmm0,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ vpaddd %xmm4,%xmm1,%xmm1
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm1,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm1,%xmm1
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ vpaddd 16(%ebp),%xmm1,%xmm6
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,48(%esp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ vpshufd $250,%xmm1,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ vpaddd %xmm4,%xmm2,%xmm2
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm2,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm2,%xmm2
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,64(%esp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ vpshufd $250,%xmm2,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ vpaddd %xmm4,%xmm3,%xmm3
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm3,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm3,%xmm3
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ vpaddd 48(%ebp),%xmm3,%xmm6
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne .L016avx_00_47
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl 96(%esp),%esi /* end of the 64 rounds for this block: fetch the pointer kept at 96(%esp); the eight state dwords are read and written through it below */
+ xorl %edi,%ebx /* the final round xor-ed %edi (loaded from 8(%esp)) into %ebx; xor again to get the plain working word back */
+ movl 12(%esp),%ecx
+ addl (%esi),%eax /* feed-forward: add stored state dwords 0..3 into the working words */
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi) /* write the updated dwords 0..3 back */
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp) /* refresh the stack copies used by the round code */
+ xorl %edi,%ebx /* leave %ebx = dword1 ^ dword2, the form the round code's and/xor sequence expects */
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ addl 16(%esi),%edx /* same feed-forward for dwords 4..7 (%edx holds dword 4 live) */
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi /* reload the value kept at 100(%esp); NOTE(review): presumably the input pointer, as in the AVX_BMI path - the loop head is outside this view */
+ vmovdqa 64(%ebp),%xmm7 /* load 16 bytes from 64(%ebp); the first dword here is the 66051 (0x00010203) value the cmpl before the final rounds tested for */
+ subl $192,%ebp /* rewind %ebp by 192 bytes (3 x 64) */
+ cmpl 104(%esp),%edi /* compare with the limit kept at 104(%esp); presumably end of input */
+ jb .L015grand_avx /* unsigned compare: another block while %edi is below the limit */
+ movl 108(%esp),%esp /* switch back to the stack pointer saved at 108(%esp) */
+ vzeroall /* clear all vector registers before returning */
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 32
+.L014AVX_BMI: /* AVX + BMI path of sha256_block_data_order: one-time setup before the per-block loop */
+ leal -96(%esp),%esp /* grow the frame by 96 bytes: 0..31(%esp) hold spilled state dwords, 32..95(%esp) hold 16 pre-added schedule dwords */
+ vzeroall /* start from all-zero vector registers */
+ movl (%esi),%eax /* state dword 0 stays live in %eax */
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp) /* spill dword 1 */
+ xorl %ecx,%ebx /* %ebx = dword1 ^ dword2, consumed by the first round's and/xor sequence */
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx /* state dword 4 stays live in %edx */
+ movl 20(%esi),%edi
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi /* last read through %esi; a pointer is fetched again from 96(%esp) after the rounds */
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi /* %edi = input pointer kept in the frame (used by the vmovdqu loads at the loop top) */
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ vmovdqa 256(%ebp),%xmm7 /* 16-byte vpshufb mask stored 256 bytes past the table base in %ebp */
+ jmp .L017grand_avx_bmi
+.align 32
+.L017grand_avx_bmi: /* top of the per-64-byte-block loop; re-entered by the jb after the final rounds */
+ vmovdqu (%edi),%xmm0 /* unaligned loads of the 64 input bytes, 16 per register */
+ vmovdqu 16(%edi),%xmm1
+ vmovdqu 32(%edi),%xmm2
+ vmovdqu 48(%edi),%xmm3
+ addl $64,%edi /* advance the input pointer past this block */
+ vpshufb %xmm7,%xmm0,%xmm0 /* permute bytes with the mask in %xmm7; presumably a per-dword byte swap - mask bytes are not visible here */
+ movl %edi,100(%esp) /* save the advanced pointer; %edi is scratch during the rounds */
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd (%ebp),%xmm0,%xmm4 /* add the first four table dwords at (%ebp) to message dwords 0..3 */
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 16(%ebp),%xmm1,%xmm5
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ vpaddd 48(%ebp),%xmm3,%xmm7 /* overwrites the mask in %xmm7; it is reloaded from 64(%ebp) before the next block */
+ vmovdqa %xmm4,32(%esp) /* park the 16 sums at 32..95(%esp), where the scalar rounds add them in; aligned stores, so %esp is assumed 16-byte aligned here */
+ vmovdqa %xmm5,48(%esp)
+ vmovdqa %xmm6,64(%esp)
+ vmovdqa %xmm7,80(%esp)
+ jmp .L018avx_bmi_00_47
+.align 16
+.L018avx_bmi_00_47: /* schedule loop: each pass runs 16 scalar rounds (addends at 32..92(%esp)) interleaved with vector code that produces the next 16 pre-added schedule dwords */
+ addl $64,%ebp /* step the table pointer to the 16 dwords for the next batch */
+ vpalignr $4,%xmm0,%xmm1,%xmm4 /* %xmm4 = dwords 1..4 of the 16-dword window held in %xmm0..%xmm3 */
+ rorxl $6,%edx,%ecx /* first round of the pass; working words: a=%eax b=4(%esp) d=12(%esp) e=%edx f=20(%esp) g=24(%esp) h=28(%esp) */
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp) /* spill e before %edx is reused as the accumulator */
+ vpalignr $4,%xmm2,%xmm3,%xmm7 /* %xmm7 = dwords 9..12 of the window */
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi /* %esi = ~e & g */
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx /* %ecx = ror(e,6) ^ ror(e,11) ^ ror(e,25), the SHA-256 Sigma1 rotation set */
+ andl 20(%esp),%edx /* %edx = e & f */
+ movl %eax,(%esp) /* spill a */
+ vpaddd %xmm7,%xmm0,%xmm0 /* dwords 0..3 += dwords 9..12 */
+ orl %esi,%edx /* %edx = (e & f) | (~e & g) */
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx /* add without touching flags: %edx = choose term + Sigma1 term */
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 4(%esp),%edi /* %edi = b */
+ xorl %esi,%ecx /* %ecx = ror(a,2) ^ ror(a,13) ^ ror(a,22), the SHA-256 Sigma0 rotation set */
+ xorl %edi,%eax /* %eax = a ^ b, left in place for the following round */
+ vpxor %xmm6,%xmm7,%xmm4 /* (x >> 3) ^ (x >> 7) per dword */
+ addl 28(%esp),%edx /* + h */
+ andl %eax,%ebx /* %ebx held the xor carried from the previous round or setup; now (a ^ b) & that value */
+ addl 32(%esp),%edx /* + pre-added (table + message) dword for this round */
+ vpshufd $250,%xmm3,%xmm7 /* lanes = dwords 2,2,3,3 of %xmm3 */
+ xorl %edi,%ebx /* ^ b completes the three-input term built from a, b and the carried xor */
+ addl %edx,%ecx
+ addl 12(%esp),%edx /* %edx = d + round sum: the next round's e */
+ vpsrld $11,%xmm6,%xmm6 /* total right shift of 18 on the %xmm6 copy */
+ leal (%ebx,%ecx,1),%ebx /* %ebx = next round's a; %eax and %ebx swap roles every round */
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 36(%esp),%edx
+ vpaddd %xmm4,%xmm0,%xmm0
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm0,%xmm0
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm0,%xmm7
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 40(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm0,%xmm0
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 44(%esp),%edx
+ vpaddd (%ebp),%xmm0,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,32(%esp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ vpaddd %xmm7,%xmm1,%xmm1
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 48(%esp),%edx
+ vpshufd $250,%xmm0,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 52(%esp),%edx
+ vpaddd %xmm4,%xmm1,%xmm1
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm1,%xmm1
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm1,%xmm7
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 56(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm1,%xmm1
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 60(%esp),%edx
+ vpaddd 16(%ebp),%xmm1,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,48(%esp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ vpaddd %xmm7,%xmm2,%xmm2
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 64(%esp),%edx
+ vpshufd $250,%xmm1,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 68(%esp),%edx
+ vpaddd %xmm4,%xmm2,%xmm2
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm2,%xmm2
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm2,%xmm7
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 72(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm2,%xmm2
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 76(%esp),%edx
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,64(%esp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ vpaddd %xmm7,%xmm3,%xmm3
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 80(%esp),%edx
+ vpshufd $250,%xmm2,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 84(%esp),%edx
+ vpaddd %xmm4,%xmm3,%xmm3
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm3,%xmm3
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm3,%xmm7
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 88(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm3,%xmm3
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 92(%esp),%edx
+ vpaddd 48(%ebp),%xmm3,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne .L018avx_bmi_00_47
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ movl 96(%esp),%esi /* end of the 64 rounds for this block: fetch the pointer kept at 96(%esp); the eight state dwords are read and written through it below */
+ xorl %edi,%ebx /* the final round xor-ed %edi (loaded from 8(%esp)) into %ebx; xor again to get the plain working word back */
+ movl 12(%esp),%ecx
+ addl (%esi),%eax /* feed-forward: add stored state dwords 0..3 into the working words */
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi) /* write the updated dwords 0..3 back */
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp) /* refresh the stack copies for the next block */
+ xorl %edi,%ebx /* %ebx = dword1 ^ dword2, the same form .L014AVX_BMI sets up before the first block */
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ addl 16(%esi),%edx /* same feed-forward for dwords 4..7 (%edx holds dword 4 live) */
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi /* restore the input pointer saved at the top of .L017grand_avx_bmi */
+ vmovdqa 64(%ebp),%xmm7 /* reload the vpshufb mask clobbered at the loop top; its first dword is the 66051 (0x00010203) sentinel that ended the schedule loop */
+ subl $192,%ebp /* rewind the table pointer: 192 = 3 x 64, undoing the addl $64 steps of the schedule loop */
+ cmpl 104(%esp),%edi /* compare with the limit kept at 104(%esp); presumably end of input */
+ jb .L017grand_avx_bmi /* unsigned compare: another block while %edi is below the limit */
+ movl 108(%esp),%esp /* switch back to the stack pointer saved at 108(%esp) */
+ vzeroall /* clear all vector registers before returning */
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
.size sha256_block_data_order,.-.L_sha256_block_data_order_begin
.comm OPENSSL_ia32cap_P,16,4
#else
@@ -4619,12 +6825,13 @@ sha256_block_data_order:
orl %ebx,%ecx
andl $1342177280,%ecx
cmpl $1342177280,%ecx
+ je .L005AVX
testl $512,%ebx
- jnz .L005SSSE3
+ jnz .L006SSSE3
.L003no_xmm:
subl %edi,%eax
cmpl $256,%eax
- jae .L006unrolled
+ jae .L007unrolled
jmp .L002loop
.align 16
.L002loop:
@@ -4696,7 +6903,7 @@ sha256_block_data_order:
movl %ecx,28(%esp)
movl %edi,32(%esp)
.align 16
-.L00700_15:
+.L00800_15:
movl %edx,%ecx
movl 24(%esp),%esi
rorl $14,%ecx
@@ -4734,11 +6941,11 @@ sha256_block_data_order:
addl $4,%ebp
addl %ebx,%eax
cmpl $3248222580,%esi
- jne .L00700_15
+ jne .L00800_15
movl 156(%esp),%ecx
- jmp .L00816_63
+ jmp .L00916_63
.align 16
-.L00816_63:
+.L00916_63:
movl %ecx,%ebx
movl 104(%esp),%esi
rorl $11,%ecx
@@ -4793,7 +7000,7 @@ sha256_block_data_order:
addl $4,%ebp
addl %ebx,%eax
cmpl $3329325298,%esi
- jne .L00816_63
+ jne .L00916_63
movl 356(%esp),%esi
movl 8(%esp),%ebx
movl 16(%esp),%ecx
@@ -4837,7 +7044,7 @@ sha256_block_data_order:
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0
.align 16
-.L006unrolled:
+.L007unrolled:
leal -96(%esp),%esp
movl (%esi),%eax
movl 4(%esi),%ebp
@@ -4854,9 +7061,9 @@ sha256_block_data_order:
movl %ebx,20(%esp)
movl %ecx,24(%esp)
movl %esi,28(%esp)
- jmp .L009grand_loop
+ jmp .L010grand_loop
.align 16
-.L009grand_loop:
+.L010grand_loop:
movl (%edi),%ebx
movl 4(%edi),%ecx
bswap %ebx
@@ -7736,7 +9943,7 @@ sha256_block_data_order:
movl %ebx,24(%esp)
movl %ecx,28(%esp)
cmpl 104(%esp),%edi
- jb .L009grand_loop
+ jb .L010grand_loop
movl 108(%esp),%esp
popl %edi
popl %esi
@@ -7755,9 +9962,9 @@ sha256_block_data_order:
pshufd $27,%xmm2,%xmm2
.byte 102,15,58,15,202,8
punpcklqdq %xmm0,%xmm2
- jmp .L010loop_shaext
+ jmp .L011loop_shaext
.align 16
-.L010loop_shaext:
+.L011loop_shaext:
movdqu (%edi),%xmm3
movdqu 16(%edi),%xmm4
movdqu 32(%edi),%xmm5
@@ -7927,7 +10134,7 @@ sha256_block_data_order:
.byte 15,56,203,202
paddd 16(%esp),%xmm2
paddd (%esp),%xmm1
- jnz .L010loop_shaext
+ jnz .L011loop_shaext
pshufd $177,%xmm2,%xmm2
pshufd $27,%xmm1,%xmm7
pshufd $177,%xmm1,%xmm1
@@ -7942,7 +10149,7 @@ sha256_block_data_order:
popl %ebp
ret
.align 32
-.L005SSSE3:
+.L006SSSE3:
leal -96(%esp),%esp
movl (%esi),%eax
movl 4(%esi),%ebx
@@ -7961,9 +10168,9 @@ sha256_block_data_order:
movl %ecx,24(%esp)
movl %esi,28(%esp)
movdqa 256(%ebp),%xmm7
- jmp .L011grand_ssse3
+ jmp .L012grand_ssse3
.align 16
-.L011grand_ssse3:
+.L012grand_ssse3:
movdqu (%edi),%xmm0
movdqu 16(%edi),%xmm1
movdqu 32(%edi),%xmm2
@@ -7986,9 +10193,9 @@ sha256_block_data_order:
paddd %xmm3,%xmm7
movdqa %xmm6,64(%esp)
movdqa %xmm7,80(%esp)
- jmp .L012ssse3_00_47
+ jmp .L013ssse3_00_47
.align 16
-.L012ssse3_00_47:
+.L013ssse3_00_47:
addl $64,%ebp
movl %edx,%ecx
movdqa %xmm1,%xmm4
@@ -8631,7 +10838,7 @@ sha256_block_data_order:
addl %ecx,%eax
movdqa %xmm6,80(%esp)
cmpl $66051,64(%ebp)
- jne .L012ssse3_00_47
+ jne .L013ssse3_00_47
movl %edx,%ecx
rorl $14,%edx
movl 20(%esp),%esi
@@ -9145,8 +11352,2213 @@ sha256_block_data_order:
movdqa 64(%ebp),%xmm7
subl $192,%ebp
cmpl 104(%esp),%edi
- jb .L011grand_ssse3
+ jb .L012grand_ssse3
+ movl 108(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 32
+.L005AVX: /* AVX path entry: choose the rorx/andn variant if possible, else build the frame and load the state */
+ andl $264,%edx /* 264 = 0x108: keep bits 3 and 8 of edx. NOTE(review): edx is loaded before this excerpt, presumably a CPU capability word - confirm */
+ cmpl $264,%edx
+ je .L014AVX_BMI /* both bits set: take the variant that uses rorx and andn */
+ leal -96(%esp),%esp /* 96-byte frame: 0..31 = working words a..h, 32..95 = sixteen pre-added K+W words */
+ vzeroall
+ movl (%esi),%eax /* esi points at eight 32-bit state words; eax = a */
+ movl 4(%esi),%ebx /* ebx = b */
+ movl 8(%esi),%ecx /* ecx = c */
+ movl 12(%esi),%edi /* edi = d */
+ movl %ebx,4(%esp)
+ xorl %ecx,%ebx /* ebx = b^c, kept in a register for the majority function */
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx /* edx = e, kept in a register */
+ movl 20(%esi),%edi /* f */
+ movl 24(%esi),%ecx /* g */
+ movl 28(%esi),%esi /* h (state pointer no longer needed in esi) */
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi /* edi = input pointer saved in the frame before this excerpt */
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ vmovdqa 256(%ebp),%xmm7 /* xmm7 = vpshufb control stored 256 bytes past ebp. NOTE(review): ebp presumably points at the round-constant table - confirm */
+ jmp .L015grand_avx
+.align 32
+.L015grand_avx: /* once per 64-byte input block: load, shuffle, pre-add the first 16 constants */
+ vmovdqu (%edi),%xmm0 /* unaligned loads: xmm0..xmm3 = the 16 message words */
+ vmovdqu 16(%edi),%xmm1
+ vmovdqu 32(%edi),%xmm2
+ vmovdqu 48(%edi),%xmm3
+ addl $64,%edi /* advance the input pointer to the next block */
+ vpshufb %xmm7,%xmm0,%xmm0 /* byte shuffle with the control in xmm7; presumably a per-word byte swap - confirm against the table */
+ movl %edi,100(%esp) /* save the advanced input pointer */
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd (%ebp),%xmm0,%xmm4 /* xmm4..xmm7 = message words + the 16 dwords at ebp */
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 16(%ebp),%xmm1,%xmm5
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ vpaddd 48(%ebp),%xmm3,%xmm7 /* overwrites the shuffle control; it is reloaded after the block is processed */
+ vmovdqa %xmm4,32(%esp) /* park the 16 sums at 32..95(%esp) for the scalar rounds */
+ vmovdqa %xmm5,48(%esp)
+ vmovdqa %xmm6,64(%esp)
+ vmovdqa %xmm7,80(%esp)
+ jmp .L016avx_00_47
+.align 16
+.L016avx_00_47: /* 16 scalar rounds interleaved with the vector schedule of the next 16 words; frame slots 0..28 rotate roles every round */
+ addl $64,%ebp /* step to the next 16 constants */
+ vpalignr $4,%xmm0,%xmm1,%xmm4 /* xmm4 = words 1..4 of the current 16-word window */
+ movl %edx,%ecx /* round, K+W at 32(%esp): ecx = e */
+ shrdl $14,%edx,%edx /* shrd with identical operands is a rotate right */
+ movl 20(%esp),%esi /* esi = f */
+ vpalignr $4,%xmm2,%xmm3,%xmm7 /* xmm7 = words 9..12 */
+ xorl %ecx,%edx
+ movl 24(%esp),%edi /* edi = g */
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi /* esi = (f^g)&e */
+ movl %ecx,16(%esp) /* spill e */
+ vpaddd %xmm7,%xmm0,%xmm0 /* words 0..3 += words 9..12 */
+ xorl %ecx,%edx
+ xorl %esi,%edi /* edi = Ch(e,f,g) = ((f^g)&e)^g */
+ shrdl $6,%edx,%edx /* edx = Sigma1(e): net rotations 6, 11 and 25 */
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx /* ecx = a */
+ addl %edi,%edx
+ movl 4(%esp),%edi /* edi = b */
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp) /* spill a */
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax /* eax = a^b, which is next round b^c */
+ addl 28(%esp),%edx /* + h */
+ vpshufd $250,%xmm3,%xmm7 /* xmm7 = {w14,w14,w15,w15} */
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx /* ebx = (a^b)&(b^c) */
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 32(%esp),%edx /* + pre-added K+W; edx = T1 */
+ xorl %edi,%ebx /* ebx = Maj(a,b,c) */
+ shrdl $2,%ecx,%ecx /* ecx = Sigma0(a): net rotations 2, 13 and 22 */
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx /* edx = d + T1 = new e */
+ addl %ecx,%ebx /* ebx = new a; eax and ebx swap roles each round */
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx /* round, K+W at 36(%esp) */
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ vpxor %xmm5,%xmm4,%xmm4 /* xmm4 = sigma0 of words 1..4: rotr7 ^ rotr18 ^ shr3 */
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5 /* 64-bit shift of a duplicated word gives a 32-bit rotate in the low half */
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ vpaddd %xmm4,%xmm0,%xmm0 /* += sigma0 term */
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6 /* sigma1 (rotr17 ^ rotr19 ^ shr10) of w14 and w15 in dwords 0 and 2 */
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7 /* move the two sigma1 results into lanes 0 and 1, zero lanes 2 and 3 */
+ movl %edx,%ecx /* round, K+W at 40(%esp) */
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ vpaddd %xmm7,%xmm0,%xmm0 /* lanes 0 and 1 of the new words are now complete */
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm0,%xmm7 /* duplicate the two fresh words to compute sigma1 for lanes 2 and 3 */
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7 /* place the sigma1 results in lanes 2 and 3 */
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm0,%xmm0 /* xmm0 = four new schedule words */
+ movl %edx,%ecx /* round, K+W at 44(%esp) */
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ vpaddd (%ebp),%xmm0,%xmm6 /* pre-add the constants for these four words */
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,32(%esp) /* replace the four consumed K+W words at 32..47(%esp) */
+ vpalignr $4,%xmm1,%xmm2,%xmm4 /* same schedule pattern, now producing the next xmm1 */
+ movl %edx,%ecx /* round, K+W at 48(%esp) */
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ vpshufd $250,%xmm0,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx /* round, K+W at 52(%esp) */
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ vpaddd %xmm4,%xmm1,%xmm1
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx /* round, K+W at 56(%esp) */
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm1,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm1,%xmm1
+ movl %edx,%ecx /* round, K+W at 60(%esp) */
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ vpaddd 16(%ebp),%xmm1,%xmm6
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,48(%esp) /* refresh K+W words at 48..63(%esp) */
+ vpalignr $4,%xmm2,%xmm3,%xmm4 /* same schedule pattern, now producing the next xmm2 */
+ movl %edx,%ecx /* round, K+W at 64(%esp) */
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ vpshufd $250,%xmm1,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx /* round, K+W at 68(%esp) */
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ vpaddd %xmm4,%xmm2,%xmm2
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx /* round, K+W at 72(%esp) */
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm2,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm2,%xmm2
+ movl %edx,%ecx /* round, K+W at 76(%esp) */
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,64(%esp) /* refresh K+W words at 64..79(%esp) */
+ vpalignr $4,%xmm3,%xmm0,%xmm4 /* same schedule pattern, now producing the next xmm3 */
+ movl %edx,%ecx /* round, K+W at 80(%esp) */
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ vpshufd $250,%xmm2,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx /* round, K+W at 84(%esp) */
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ vpaddd %xmm4,%xmm3,%xmm3
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx /* round, K+W at 88(%esp) */
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm3,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm3,%xmm3
+ movl %edx,%ecx /* round, K+W at 92(%esp) */
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ vpaddd 48(%ebp),%xmm3,%xmm6
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,80(%esp) /* refresh K+W words at 80..95(%esp) */
+ cmpl $66051,64(%ebp) /* 66051 = 0x00010203: stop once the dword 64 bytes ahead is this marker, which is where the shuffle control is later reloaded from */
+ jne .L016avx_00_47
+ movl %edx,%ecx /* last 16 rounds: same scalar round as in the loop, no schedule work; this round uses K+W at 32(%esp) */
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi /* edi = ((f^g)&e)^g */
+ shrdl $6,%edx,%edx /* net rotations of e by 6, 11 and 25, xored together */
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx /* ebx = ((a^b)&(b^c))^b */
+ shrdl $2,%ecx,%ecx /* net rotations of a by 2, 13 and 22, xored together */
+ addl %edx,%ebx
+ addl 12(%esp),%edx /* new e */
+ addl %ecx,%ebx /* new a */
+ movl %edx,%ecx /* round, K+W at 36(%esp) */
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx /* round, K+W at 40(%esp) */
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx /* round, K+W at 44(%esp) */
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx /* round, K+W at 48(%esp) */
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx /* round, K+W at 52(%esp) */
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx /* round, K+W at 56(%esp) */
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx /* round, K+W at 60(%esp) */
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx /* round, K+W at 64(%esp) */
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx /* round, K+W at 68(%esp) */
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx /* round, K+W at 72(%esp) */
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx /* round, K+W at 76(%esp) */
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx /* round, K+W at 80(%esp) */
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx /* round, K+W at 84(%esp) */
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx /* round, K+W at 88(%esp) */
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx /* round, K+W at 92(%esp): last round of the block */
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx /* edx = final e */
+ addl %ecx,%eax /* eax = final a */
+ movl 96(%esp),%esi /* fold the block result into the state: esi = state pointer saved in the frame before this excerpt */
+ xorl %edi,%ebx /* ebx held b^c and edi holds c, so this recovers b */
+ movl 12(%esp),%ecx /* ecx = d */
+ addl (%esi),%eax /* a..d += previous state words 0..3 */
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi) /* write the new words back to the state */
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp) /* and reseed the frame copies for the next block */
+ xorl %edi,%ebx /* ebx = b^c again, as the rounds expect on entry */
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi /* edi = f */
+ movl 24(%esp),%ecx /* ecx = g */
+ addl 16(%esi),%edx /* e..g += previous state words 4..6 */
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi /* edi = h */
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi /* edi = input pointer, already advanced past this block */
+ vmovdqa 64(%ebp),%xmm7 /* reload the vpshufb control that the K+W pre-add overwrote */
+ subl $192,%ebp /* undo the three 64-byte steps taken in the 00_47 loop */
+ cmpl 104(%esp),%edi /* compare with the limit saved at 104(%esp); presumably the end of input - confirm */
+ jb .L015grand_avx /* unsigned compare: more blocks remain */
+ movl 108(%esp),%esp /* restore the stack pointer value saved at 108(%esp) */
+ vzeroall /* clear all vector registers before returning */
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 32
+.L014AVX_BMI: /* entry of the AVX variant whose rounds use rorx and andn; same frame and state setup as the plain AVX path */
+ leal -96(%esp),%esp /* 96-byte frame: 0..31 = working words a..h, 32..95 = sixteen pre-added K+W words */
+ vzeroall
+ movl (%esi),%eax /* esi points at eight 32-bit state words; eax = a */
+ movl 4(%esi),%ebx /* ebx = b */
+ movl 8(%esi),%ecx /* ecx = c */
+ movl 12(%esi),%edi /* edi = d */
+ movl %ebx,4(%esp)
+ xorl %ecx,%ebx /* ebx = b^c, kept in a register across rounds */
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx /* edx = e, kept in a register */
+ movl 20(%esi),%edi /* f */
+ movl 24(%esi),%ecx /* g */
+ movl 28(%esi),%esi /* h (state pointer no longer needed in esi) */
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi /* edi = input pointer saved in the frame before this excerpt */
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ vmovdqa 256(%ebp),%xmm7 /* xmm7 = vpshufb control stored 256 bytes past ebp. NOTE(review): ebp presumably points at the round-constant table - confirm */
+ jmp .L017grand_avx_bmi
+.align 32
+.L017grand_avx_bmi: /* once per 64-byte input block: load, shuffle, pre-add the first 16 constants */
+ vmovdqu (%edi),%xmm0 /* unaligned loads: xmm0..xmm3 = the 16 message words */
+ vmovdqu 16(%edi),%xmm1
+ vmovdqu 32(%edi),%xmm2
+ vmovdqu 48(%edi),%xmm3
+ addl $64,%edi /* advance the input pointer to the next block */
+ vpshufb %xmm7,%xmm0,%xmm0 /* byte shuffle with the control in xmm7; presumably a per-word byte swap - confirm against the table */
+ movl %edi,100(%esp) /* save the advanced input pointer */
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd (%ebp),%xmm0,%xmm4 /* xmm4..xmm7 = message words + the 16 dwords at ebp */
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 16(%ebp),%xmm1,%xmm5
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ vpaddd 48(%ebp),%xmm3,%xmm7 /* overwrites the shuffle control held in xmm7 */
+ vmovdqa %xmm4,32(%esp) /* park the 16 sums at 32..95(%esp) for the scalar rounds */
+ vmovdqa %xmm5,48(%esp)
+ vmovdqa %xmm6,64(%esp)
+ vmovdqa %xmm7,80(%esp)
+ jmp .L018avx_bmi_00_47
+.align 16
+.L018avx_bmi_00_47:
+ addl $64,%ebp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ vpaddd %xmm7,%xmm0,%xmm0
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 32(%esp),%edx
+ vpshufd $250,%xmm3,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 36(%esp),%edx
+ vpaddd %xmm4,%xmm0,%xmm0
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm0,%xmm0
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm0,%xmm7
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 40(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm0,%xmm0
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 44(%esp),%edx
+ vpaddd (%ebp),%xmm0,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,32(%esp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ vpaddd %xmm7,%xmm1,%xmm1
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 48(%esp),%edx
+ vpshufd $250,%xmm0,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 52(%esp),%edx
+ vpaddd %xmm4,%xmm1,%xmm1
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm1,%xmm1
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm1,%xmm7
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 56(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm1,%xmm1
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 60(%esp),%edx
+ vpaddd 16(%ebp),%xmm1,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,48(%esp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ vpaddd %xmm7,%xmm2,%xmm2
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 64(%esp),%edx
+ vpshufd $250,%xmm1,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 68(%esp),%edx
+ vpaddd %xmm4,%xmm2,%xmm2
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm2,%xmm2
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm2,%xmm7
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 72(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm2,%xmm2
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 76(%esp),%edx
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,64(%esp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ vpaddd %xmm7,%xmm3,%xmm3
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 80(%esp),%edx
+ vpshufd $250,%xmm2,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 84(%esp),%edx
+ vpaddd %xmm4,%xmm3,%xmm3
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm3,%xmm3
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm3,%xmm7
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 88(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm3,%xmm3
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 92(%esp),%edx
+ vpaddd 48(%ebp),%xmm3,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne .L018avx_bmi_00_47
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebx
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp)
+ xorl %edi,%ebx
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi
+ vmovdqa 64(%ebp),%xmm7
+ subl $192,%ebp
+ cmpl 104(%esp),%edi
+ jb .L017grand_avx_bmi
movl 108(%esp),%esp
+ vzeroall
popl %edi
popl %esi
popl %ebx