Diffstat (limited to 'secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S')
-rw-r--r--  secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S  4376
1 file changed, 4376 insertions, 0 deletions
diff --git a/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S b/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S
index e42a02ebe647..cb9e150db553 100644
--- a/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S
+++ b/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S
@@ -8,6 +8,25 @@
.align 16
aesni_cbc_sha256_enc:
.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%r11
+ movl $1,%eax
+ cmpq $0,%rdi
+ je .Lprobe
+ movl 0(%r11),%eax
+ movq 4(%r11),%r10
+ btq $61,%r10
+ jc aesni_cbc_sha256_enc_shaext
+ movq %r10,%r11
+ shrq $32,%r11
+
+ testl $2048,%r10d
+ jnz aesni_cbc_sha256_enc_xop
+ andl $296,%r11d
+ cmpl $296,%r11d
+ je aesni_cbc_sha256_enc_avx2
+ andl $268435456,%r10d
+ jnz aesni_cbc_sha256_enc_avx
+ ud2
xorl %eax,%eax
cmpq $0,%rdi
je .Lprobe
@@ -59,3 +78,4360 @@ K256:
.long 0,0,0,0, 0,0,0,0
.byte 65,69,83,78,73,45,67,66,67,43,83,72,65,50,53,54,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
+.type aesni_cbc_sha256_enc_xop,@function
+.align 64
+aesni_cbc_sha256_enc_xop:
+.cfi_startproc
+.Lxop_shortcut:
+ movq 8(%rsp),%r10
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ subq $128,%rsp
+ andq $-64,%rsp
+
+ shlq $6,%rdx
+ subq %rdi,%rsi
+ subq %rdi,%r10
+ addq %rdi,%rdx
+
+
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+
+ movq %r8,64+32(%rsp)
+ movq %r9,64+40(%rsp)
+ movq %r10,64+48(%rsp)
+ movq %rax,120(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
+.Lprologue_xop:
+ vzeroall
+
+ movq %rdi,%r12
+ leaq 128(%rcx),%rdi
+ leaq K256+544(%rip),%r13
+ movl 240-128(%rdi),%r14d
+ movq %r9,%r15
+ movq %r10,%rsi
+ vmovdqu (%r8),%xmm8
+ subq $9,%r14
+
+ movl 0(%r15),%eax
+ movl 4(%r15),%ebx
+ movl 8(%r15),%ecx
+ movl 12(%r15),%edx
+ movl 16(%r15),%r8d
+ movl 20(%r15),%r9d
+ movl 24(%r15),%r10d
+ movl 28(%r15),%r11d
+
+ vmovdqa 0(%r13,%r14,8),%xmm14
+ vmovdqa 16(%r13,%r14,8),%xmm13
+ vmovdqa 32(%r13,%r14,8),%xmm12
+ vmovdqu 0-128(%rdi),%xmm10
+ jmp .Lloop_xop
+.align 16
+.Lloop_xop:
+ vmovdqa K256+512(%rip),%xmm7
+ vmovdqu 0(%rsi,%r12,1),%xmm0
+ vmovdqu 16(%rsi,%r12,1),%xmm1
+ vmovdqu 32(%rsi,%r12,1),%xmm2
+ vmovdqu 48(%rsi,%r12,1),%xmm3
+ vpshufb %xmm7,%xmm0,%xmm0
+ leaq K256(%rip),%rbp
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd 0(%rbp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 32(%rbp),%xmm1,%xmm5
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ vpaddd 96(%rbp),%xmm3,%xmm7
+ vmovdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ vmovdqa %xmm5,16(%rsp)
+ movl %ebx,%esi
+ vmovdqa %xmm6,32(%rsp)
+ xorl %ecx,%esi
+ vmovdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp .Lxop_00_47
+
+.align 16
+.Lxop_00_47:
+ subq $-32*4,%rbp
+ vmovdqu (%r12),%xmm9
+ movq %r12,64+0(%rsp)
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ rorl $14,%r13d
+ movl %r14d,%eax
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ movl %r9d,%r12d
+ xorl %r8d,%r13d
+.byte 143,232,120,194,236,14
+ rorl $9,%r14d
+ xorl %r10d,%r12d
+ vpsrld $3,%xmm4,%xmm4
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ vpaddd %xmm7,%xmm0,%xmm0
+ andl %r8d,%r12d
+ vpxor %xmm10,%xmm9,%xmm9
+ vmovdqu 16-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+.byte 143,232,120,194,245,11
+ rorl $11,%r14d
+ xorl %r10d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+.byte 143,232,120,194,251,13
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ebx,%esi
+ addl %r11d,%edx
+ vpsrld $10,%xmm3,%xmm6
+ rorl $2,%r14d
+ addl %esi,%r11d
+ vpaddd %xmm4,%xmm0,%xmm0
+ movl %edx,%r13d
+ addl %r11d,%r14d
+.byte 143,232,120,194,239,2
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vpxor %xmm8,%xmm9,%xmm9
+ xorl %edx,%r13d
+ vpsrldq $8,%xmm7,%xmm7
+ addl 4(%rsp),%r10d
+ movl %r11d,%esi
+ rorl $11,%r14d
+ xorl %r9d,%r12d
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %eax,%esi
+ rorl $6,%r13d
+ addl %r12d,%r10d
+ andl %esi,%r15d
+.byte 143,232,120,194,248,13
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ vpsrld $10,%xmm0,%xmm6
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+.byte 143,232,120,194,239,2
+ rorl $2,%r14d
+ addl %r15d,%r10d
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm5,%xmm7,%xmm7
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r12d
+ vpslldq $8,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ vpaddd %xmm7,%xmm0,%xmm0
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ rorl $11,%r14d
+ xorl %r8d,%r12d
+ vpaddd 0(%rbp),%xmm0,%xmm6
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r9d
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ rorl $2,%r14d
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%esi
+ rorl $11,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ rorl $6,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ rorl $2,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,0(%rsp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ movl %ebx,%r12d
+ xorl %eax,%r13d
+.byte 143,232,120,194,236,14
+ rorl $9,%r14d
+ xorl %ecx,%r12d
+ vpsrld $3,%xmm4,%xmm4
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ vpaddd %xmm7,%xmm1,%xmm1
+ andl %eax,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+.byte 143,232,120,194,245,11
+ rorl $11,%r14d
+ xorl %ecx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+.byte 143,232,120,194,248,13
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r9d,%esi
+ addl %edx,%r11d
+ vpsrld $10,%xmm0,%xmm6
+ rorl $2,%r14d
+ addl %esi,%edx
+ vpaddd %xmm4,%xmm1,%xmm1
+ movl %r11d,%r13d
+ addl %edx,%r14d
+.byte 143,232,120,194,239,2
+ rorl $14,%r13d
+ movl %r14d,%edx
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ vpsrldq $8,%xmm7,%xmm7
+ addl 20(%rsp),%ecx
+ movl %edx,%esi
+ rorl $11,%r14d
+ xorl %ebx,%r12d
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %r8d,%esi
+ rorl $6,%r13d
+ addl %r12d,%ecx
+ andl %esi,%r15d
+.byte 143,232,120,194,249,13
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ vpsrld $10,%xmm1,%xmm6
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+.byte 143,232,120,194,239,2
+ rorl $2,%r14d
+ addl %r15d,%ecx
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm5,%xmm7,%xmm7
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r12d
+ vpslldq $8,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ vpaddd %xmm7,%xmm1,%xmm1
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ rorl $11,%r14d
+ xorl %eax,%r12d
+ vpaddd 32(%rbp),%xmm1,%xmm6
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%ebx
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ rorl $2,%r14d
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%esi
+ rorl $11,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ rorl $6,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ rorl $2,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,16(%rsp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ rorl $14,%r13d
+ movl %r14d,%eax
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ movl %r9d,%r12d
+ xorl %r8d,%r13d
+.byte 143,232,120,194,236,14
+ rorl $9,%r14d
+ xorl %r10d,%r12d
+ vpsrld $3,%xmm4,%xmm4
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ vpaddd %xmm7,%xmm2,%xmm2
+ andl %r8d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+.byte 143,232,120,194,245,11
+ rorl $11,%r14d
+ xorl %r10d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+.byte 143,232,120,194,249,13
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ebx,%esi
+ addl %r11d,%edx
+ vpsrld $10,%xmm1,%xmm6
+ rorl $2,%r14d
+ addl %esi,%r11d
+ vpaddd %xmm4,%xmm2,%xmm2
+ movl %edx,%r13d
+ addl %r11d,%r14d
+.byte 143,232,120,194,239,2
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %edx,%r13d
+ vpsrldq $8,%xmm7,%xmm7
+ addl 36(%rsp),%r10d
+ movl %r11d,%esi
+ rorl $11,%r14d
+ xorl %r9d,%r12d
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %eax,%esi
+ rorl $6,%r13d
+ addl %r12d,%r10d
+ andl %esi,%r15d
+.byte 143,232,120,194,250,13
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ vpsrld $10,%xmm2,%xmm6
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+.byte 143,232,120,194,239,2
+ rorl $2,%r14d
+ addl %r15d,%r10d
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm5,%xmm7,%xmm7
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r12d
+ vpslldq $8,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ vpaddd %xmm7,%xmm2,%xmm2
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ rorl $11,%r14d
+ xorl %r8d,%r12d
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r9d
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ rorl $2,%r14d
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%esi
+ rorl $11,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ rorl $6,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ rorl $2,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,32(%rsp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ movl %ebx,%r12d
+ xorl %eax,%r13d
+.byte 143,232,120,194,236,14
+ rorl $9,%r14d
+ xorl %ecx,%r12d
+ vpsrld $3,%xmm4,%xmm4
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ vpaddd %xmm7,%xmm3,%xmm3
+ andl %eax,%r12d
+ vpand %xmm12,%xmm11,%xmm8
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+.byte 143,232,120,194,245,11
+ rorl $11,%r14d
+ xorl %ecx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+.byte 143,232,120,194,250,13
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r9d,%esi
+ addl %edx,%r11d
+ vpsrld $10,%xmm2,%xmm6
+ rorl $2,%r14d
+ addl %esi,%edx
+ vpaddd %xmm4,%xmm3,%xmm3
+ movl %r11d,%r13d
+ addl %edx,%r14d
+.byte 143,232,120,194,239,2
+ rorl $14,%r13d
+ movl %r14d,%edx
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ vpsrldq $8,%xmm7,%xmm7
+ addl 52(%rsp),%ecx
+ movl %edx,%esi
+ rorl $11,%r14d
+ xorl %ebx,%r12d
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %r8d,%esi
+ rorl $6,%r13d
+ addl %r12d,%ecx
+ andl %esi,%r15d
+.byte 143,232,120,194,251,13
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ vpsrld $10,%xmm3,%xmm6
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+.byte 143,232,120,194,239,2
+ rorl $2,%r14d
+ addl %r15d,%ecx
+ vpxor %xmm6,%xmm7,%xmm7
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm5,%xmm7,%xmm7
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r12d
+ vpslldq $8,%xmm7,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ vpand %xmm13,%xmm11,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ vpaddd %xmm7,%xmm3,%xmm3
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ rorl $11,%r14d
+ xorl %eax,%r12d
+ vpaddd 96(%rbp),%xmm3,%xmm6
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%ebx
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ rorl $2,%r14d
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpor %xmm11,%xmm8,%xmm8
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vmovdqu 0-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%esi
+ rorl $11,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ rorl $6,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ rorl $2,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,48(%rsp)
+ movq 64+0(%rsp),%r12
+ vpand %xmm14,%xmm11,%xmm11
+ movq 64+8(%rsp),%r15
+ vpor %xmm11,%xmm8,%xmm8
+ vmovdqu %xmm8,(%r15,%r12,1)
+ leaq 16(%r12),%r12
+ cmpb $0,131(%rbp)
+ jne .Lxop_00_47
+ vmovdqu (%r12),%xmm9
+ movq %r12,64+0(%rsp)
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpxor %xmm10,%xmm9,%xmm9
+ vmovdqu 16-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ rorl $11,%r14d
+ xorl %r10d,%r12d
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%esi
+ addl %r11d,%edx
+ rorl $2,%r14d
+ addl %esi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vpxor %xmm8,%xmm9,%xmm9
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%esi
+ rorl $11,%r14d
+ xorl %r9d,%r12d
+ xorl %eax,%esi
+ rorl $6,%r13d
+ addl %r12d,%r10d
+ andl %esi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+ rorl $2,%r14d
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ rorl $11,%r14d
+ xorl %r8d,%r12d
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r9d
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ rorl $2,%r14d
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%esi
+ rorl $11,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ rorl $6,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ rorl $2,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ rorl $11,%r14d
+ xorl %ecx,%r12d
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%esi
+ addl %edx,%r11d
+ rorl $2,%r14d
+ addl %esi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%esi
+ rorl $11,%r14d
+ xorl %ebx,%r12d
+ xorl %r8d,%esi
+ rorl $6,%r13d
+ addl %r12d,%ecx
+ andl %esi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+ rorl $2,%r14d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ rorl $11,%r14d
+ xorl %eax,%r12d
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%ebx
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ rorl $2,%r14d
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%esi
+ rorl $11,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ rorl $6,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ rorl $2,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ rorl $11,%r14d
+ xorl %r10d,%r12d
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%esi
+ addl %r11d,%edx
+ rorl $2,%r14d
+ addl %esi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%esi
+ rorl $11,%r14d
+ xorl %r9d,%r12d
+ xorl %eax,%esi
+ rorl $6,%r13d
+ addl %r12d,%r10d
+ andl %esi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+ rorl $2,%r14d
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ rorl $11,%r14d
+ xorl %r8d,%r12d
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%r9d
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ rorl $2,%r14d
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%esi
+ rorl $11,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ rorl $6,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ rorl $2,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpand %xmm12,%xmm11,%xmm8
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ rorl $11,%r14d
+ xorl %ecx,%r12d
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%esi
+ addl %edx,%r11d
+ rorl $2,%r14d
+ addl %esi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%esi
+ rorl $11,%r14d
+ xorl %ebx,%r12d
+ xorl %r8d,%esi
+ rorl $6,%r13d
+ addl %r12d,%ecx
+ andl %esi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+ rorl $2,%r14d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ vpand %xmm13,%xmm11,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ rorl $11,%r14d
+ xorl %eax,%r12d
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ addl %r12d,%ebx
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ rorl $2,%r14d
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpor %xmm11,%xmm8,%xmm8
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vmovdqu 0-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%esi
+ rorl $11,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ rorl $6,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ rorl $2,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%r12
+ movq 64+8(%rsp),%r13
+ movq 64+40(%rsp),%r15
+ movq 64+48(%rsp),%rsi
+
+ vpand %xmm14,%xmm11,%xmm11
+ movl %r14d,%eax
+ vpor %xmm11,%xmm8,%xmm8
+ vmovdqu %xmm8,(%r12,%r13,1)
+ leaq 16(%r12),%r12
+
+ addl 0(%r15),%eax
+ addl 4(%r15),%ebx
+ addl 8(%r15),%ecx
+ addl 12(%r15),%edx
+ addl 16(%r15),%r8d
+ addl 20(%r15),%r9d
+ addl 24(%r15),%r10d
+ addl 28(%r15),%r11d
+
+ cmpq 64+16(%rsp),%r12
+
+ movl %eax,0(%r15)
+ movl %ebx,4(%r15)
+ movl %ecx,8(%r15)
+ movl %edx,12(%r15)
+ movl %r8d,16(%r15)
+ movl %r9d,20(%r15)
+ movl %r10d,24(%r15)
+ movl %r11d,28(%r15)
+
+ jb .Lloop_xop
+
+ movq 64+32(%rsp),%r8
+ movq 120(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ vmovdqu %xmm8,(%r8)
+ vzeroall
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_xop:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_cbc_sha256_enc_xop,.-aesni_cbc_sha256_enc_xop
+.type aesni_cbc_sha256_enc_avx,@function
+.align 64
+aesni_cbc_sha256_enc_avx:
+.cfi_startproc
+.Lavx_shortcut:
+ movq 8(%rsp),%r10
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ subq $128,%rsp
+ andq $-64,%rsp
+
+ shlq $6,%rdx
+ subq %rdi,%rsi
+ subq %rdi,%r10
+ addq %rdi,%rdx
+
+
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+
+ movq %r8,64+32(%rsp)
+ movq %r9,64+40(%rsp)
+ movq %r10,64+48(%rsp)
+ movq %rax,120(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
+.Lprologue_avx:
+ vzeroall
+
+ movq %rdi,%r12
+ leaq 128(%rcx),%rdi
+ leaq K256+544(%rip),%r13
+ movl 240-128(%rdi),%r14d
+ movq %r9,%r15
+ movq %r10,%rsi
+ vmovdqu (%r8),%xmm8
+ subq $9,%r14
+
+ movl 0(%r15),%eax
+ movl 4(%r15),%ebx
+ movl 8(%r15),%ecx
+ movl 12(%r15),%edx
+ movl 16(%r15),%r8d
+ movl 20(%r15),%r9d
+ movl 24(%r15),%r10d
+ movl 28(%r15),%r11d
+
+ vmovdqa 0(%r13,%r14,8),%xmm14
+ vmovdqa 16(%r13,%r14,8),%xmm13
+ vmovdqa 32(%r13,%r14,8),%xmm12
+ vmovdqu 0-128(%rdi),%xmm10
+ jmp .Lloop_avx
+.align 16
+.Lloop_avx:
+ vmovdqa K256+512(%rip),%xmm7
+ vmovdqu 0(%rsi,%r12,1),%xmm0
+ vmovdqu 16(%rsi,%r12,1),%xmm1
+ vmovdqu 32(%rsi,%r12,1),%xmm2
+ vmovdqu 48(%rsi,%r12,1),%xmm3
+ vpshufb %xmm7,%xmm0,%xmm0
+ leaq K256(%rip),%rbp
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd 0(%rbp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 32(%rbp),%xmm1,%xmm5
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ vpaddd 96(%rbp),%xmm3,%xmm7
+ vmovdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ vmovdqa %xmm5,16(%rsp)
+ movl %ebx,%esi
+ vmovdqa %xmm6,32(%rsp)
+ xorl %ecx,%esi
+ vmovdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp .Lavx_00_47
+
+.align 16
+.Lavx_00_47:
+ subq $-32*4,%rbp
+ vmovdqu (%r12),%xmm9
+ movq %r12,64+0(%rsp)
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ xorl %r8d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm0,%xmm0
+ vpxor %xmm10,%xmm9,%xmm9
+ vmovdqu 16-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%r12d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%esi
+ vpshufd $250,%xmm3,%xmm7
+ addl %r11d,%edx
+ shrdl $2,%r14d,%r14d
+ addl %esi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ vpslld $11,%xmm5,%xmm5
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vpxor %xmm8,%xmm9,%xmm9
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 4(%rsp),%r10d
+ movl %r11d,%esi
+ shrdl $11,%r14d,%r14d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %r9d,%r12d
+ xorl %eax,%esi
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ addl %r12d,%r10d
+ andl %esi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm0,%xmm0
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ shrdl $9,%r14d,%r14d
+ vpshufd $132,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpsrldq $8,%xmm6,%xmm6
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ vpaddd %xmm6,%xmm0,%xmm0
+ movl %r10d,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%r12d
+ vpshufd $80,%xmm0,%xmm7
+ xorl %r11d,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r9d
+ vpsrld $10,%xmm7,%xmm6
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpsrlq $17,%xmm7,%xmm7
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpsrlq $2,%xmm7,%xmm7
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %ebx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r12d
+ vpshufd $232,%xmm6,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpslldq $8,%xmm6,%xmm6
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%esi
+ vpaddd %xmm6,%xmm0,%xmm0
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ vpaddd 0(%rbp),%xmm0,%xmm6
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,0(%rsp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ xorl %eax,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm1,%xmm1
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%r12d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%esi
+ vpshufd $250,%xmm0,%xmm7
+ addl %edx,%r11d
+ shrdl $2,%r14d,%r14d
+ addl %esi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ vpslld $11,%xmm5,%xmm5
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 20(%rsp),%ecx
+ movl %edx,%esi
+ shrdl $11,%r14d,%r14d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ebx,%r12d
+ xorl %r8d,%esi
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ addl %r12d,%ecx
+ andl %esi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm1,%xmm1
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ shrdl $9,%r14d,%r14d
+ vpshufd $132,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpsrldq $8,%xmm6,%xmm6
+ andl %r10d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ vpaddd %xmm6,%xmm1,%xmm1
+ movl %ecx,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%r12d
+ vpshufd $80,%xmm1,%xmm7
+ xorl %edx,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%ebx
+ vpsrld $10,%xmm7,%xmm6
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpsrlq $17,%xmm7,%xmm7
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpsrlq $2,%xmm7,%xmm7
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r9d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r12d
+ vpshufd $232,%xmm6,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpslldq $8,%xmm6,%xmm6
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%esi
+ vpaddd %xmm6,%xmm1,%xmm1
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ vpaddd 32(%rbp),%xmm1,%xmm6
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,16(%rsp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ xorl %r8d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm2,%xmm2
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%r12d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%esi
+ vpshufd $250,%xmm1,%xmm7
+ addl %r11d,%edx
+ shrdl $2,%r14d,%r14d
+ addl %esi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ vpslld $11,%xmm5,%xmm5
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 36(%rsp),%r10d
+ movl %r11d,%esi
+ shrdl $11,%r14d,%r14d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %r9d,%r12d
+ xorl %eax,%esi
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ addl %r12d,%r10d
+ andl %esi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm2,%xmm2
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ shrdl $9,%r14d,%r14d
+ vpshufd $132,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpsrldq $8,%xmm6,%xmm6
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ vpaddd %xmm6,%xmm2,%xmm2
+ movl %r10d,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%r12d
+ vpshufd $80,%xmm2,%xmm7
+ xorl %r11d,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r9d
+ vpsrld $10,%xmm7,%xmm6
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpsrlq $17,%xmm7,%xmm7
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpsrlq $2,%xmm7,%xmm7
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %ebx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r12d
+ vpshufd $232,%xmm6,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpslldq $8,%xmm6,%xmm6
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%esi
+ vpaddd %xmm6,%xmm2,%xmm2
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,32(%rsp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ xorl %eax,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm3,%xmm3
+ vpand %xmm12,%xmm11,%xmm8
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%r12d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%esi
+ vpshufd $250,%xmm2,%xmm7
+ addl %edx,%r11d
+ shrdl $2,%r14d,%r14d
+ addl %esi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ vpslld $11,%xmm5,%xmm5
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 52(%rsp),%ecx
+ movl %edx,%esi
+ shrdl $11,%r14d,%r14d
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ebx,%r12d
+ xorl %r8d,%esi
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ addl %r12d,%ecx
+ andl %esi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm3,%xmm3
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ shrdl $9,%r14d,%r14d
+ vpshufd $132,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpsrldq $8,%xmm6,%xmm6
+ andl %r10d,%r12d
+ vpand %xmm13,%xmm11,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ vpaddd %xmm6,%xmm3,%xmm3
+ movl %ecx,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%r12d
+ vpshufd $80,%xmm3,%xmm7
+ xorl %edx,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%ebx
+ vpsrld $10,%xmm7,%xmm6
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpsrlq $17,%xmm7,%xmm7
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpsrlq $2,%xmm7,%xmm7
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r9d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r12d
+ vpshufd $232,%xmm6,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpslldq $8,%xmm6,%xmm6
+ vpor %xmm11,%xmm8,%xmm8
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vmovdqu 0-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%esi
+ vpaddd %xmm6,%xmm3,%xmm3
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ vpaddd 96(%rbp),%xmm3,%xmm6
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,48(%rsp)
+ movq 64+0(%rsp),%r12
+ vpand %xmm14,%xmm11,%xmm11
+ movq 64+8(%rsp),%r15
+ vpor %xmm11,%xmm8,%xmm8
+ vmovdqu %xmm8,(%r15,%r12,1)
+ leaq 16(%r12),%r12
+ cmpb $0,131(%rbp)
+ jne .Lavx_00_47
+ vmovdqu (%r12),%xmm9
+ movq %r12,64+0(%rsp)
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ xorl %r8d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpxor %xmm10,%xmm9,%xmm9
+ vmovdqu 16-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%r12d
+ xorl %ebx,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%esi
+ addl %r11d,%edx
+ shrdl $2,%r14d,%r14d
+ addl %esi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vpxor %xmm8,%xmm9,%xmm9
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r12d
+ xorl %eax,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r10d
+ andl %esi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%r12d
+ xorl %r11d,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r9d
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ shrdl $2,%r14d,%r14d
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ xorl %ebx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ xorl %eax,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%r12d
+ xorl %r9d,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%esi
+ addl %edx,%r11d
+ shrdl $2,%r14d,%r14d
+ addl %esi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r12d
+ xorl %r8d,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%ecx
+ andl %esi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%r12d
+ xorl %edx,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%ebx
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ shrdl $2,%r14d,%r14d
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ xorl %r9d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ xorl %r8d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%r12d
+ xorl %ebx,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r11d
+ andl %r15d,%esi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%esi
+ addl %r11d,%edx
+ shrdl $2,%r14d,%r14d
+ addl %esi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ xorl %edx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r12d
+ xorl %eax,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r10d
+ andl %esi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ addl %r10d,%ecx
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ xorl %ecx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%r12d
+ xorl %r11d,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r9d
+ andl %r15d,%esi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%esi
+ addl %r9d,%ebx
+ shrdl $2,%r14d,%r14d
+ addl %esi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ xorl %ebx,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r12d
+ xorl %r10d,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%r8d
+ andl %esi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ addl %r8d,%eax
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ xorl %eax,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpand %xmm12,%xmm11,%xmm8
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%r12d
+ xorl %r9d,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%edx
+ andl %r15d,%esi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%esi
+ addl %edx,%r11d
+ shrdl $2,%r14d,%r14d
+ addl %esi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ xorl %r11d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r12d
+ xorl %r8d,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%ecx
+ andl %esi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ addl %ecx,%r10d
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ xorl %r10d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ vpand %xmm13,%xmm11,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%r12d
+ xorl %edx,%r15d
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%ebx
+ andl %r15d,%esi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%esi
+ addl %ebx,%r9d
+ shrdl $2,%r14d,%r14d
+ addl %esi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ xorl %r9d,%r13d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpor %xmm11,%xmm8,%xmm8
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vmovdqu 0-128(%rdi),%xmm10
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%esi
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r12d
+ xorl %ecx,%esi
+ shrdl $6,%r13d,%r13d
+ addl %r12d,%eax
+ andl %esi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ addl %eax,%r8d
+ shrdl $2,%r14d,%r14d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%r12
+ movq 64+8(%rsp),%r13
+ movq 64+40(%rsp),%r15
+ movq 64+48(%rsp),%rsi
+
+ vpand %xmm14,%xmm11,%xmm11
+ movl %r14d,%eax
+ vpor %xmm11,%xmm8,%xmm8
+ vmovdqu %xmm8,(%r12,%r13,1)
+ leaq 16(%r12),%r12
+
+ addl 0(%r15),%eax
+ addl 4(%r15),%ebx
+ addl 8(%r15),%ecx
+ addl 12(%r15),%edx
+ addl 16(%r15),%r8d
+ addl 20(%r15),%r9d
+ addl 24(%r15),%r10d
+ addl 28(%r15),%r11d
+
+ cmpq 64+16(%rsp),%r12
+
+ movl %eax,0(%r15)
+ movl %ebx,4(%r15)
+ movl %ecx,8(%r15)
+ movl %edx,12(%r15)
+ movl %r8d,16(%r15)
+ movl %r9d,20(%r15)
+ movl %r10d,24(%r15)
+ movl %r11d,28(%r15)
+ jb .Lloop_avx
+
+ movq 64+32(%rsp),%r8
+ movq 120(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ vmovdqu %xmm8,(%r8)
+ vzeroall
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_cbc_sha256_enc_avx,.-aesni_cbc_sha256_enc_avx
+.type aesni_cbc_sha256_enc_avx2,@function
+.align 64
+aesni_cbc_sha256_enc_avx2:
+.cfi_startproc
+.Lavx2_shortcut:
+ movq 8(%rsp),%r10
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ subq $576,%rsp
+ andq $-1024,%rsp
+ addq $448,%rsp
+
+ shlq $6,%rdx
+ subq %rdi,%rsi
+ subq %rdi,%r10
+ addq %rdi,%rdx
+
+
+
+ movq %rdx,64+16(%rsp)
+
+ movq %r8,64+32(%rsp)
+ movq %r9,64+40(%rsp)
+ movq %r10,64+48(%rsp)
+ movq %rax,120(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
+.Lprologue_avx2:
+ vzeroall
+
+ movq %rdi,%r13
+ vpinsrq $1,%rsi,%xmm15,%xmm15
+ leaq 128(%rcx),%rdi
+ leaq K256+544(%rip),%r12
+ movl 240-128(%rdi),%r14d
+ movq %r9,%r15
+ movq %r10,%rsi
+ vmovdqu (%r8),%xmm8
+ leaq -9(%r14),%r14
+
+ vmovdqa 0(%r12,%r14,8),%xmm14
+ vmovdqa 16(%r12,%r14,8),%xmm13
+ vmovdqa 32(%r12,%r14,8),%xmm12
+
+ subq $-64,%r13
+ movl 0(%r15),%eax
+ leaq (%rsi,%r13,1),%r12
+ movl 4(%r15),%ebx
+ cmpq %rdx,%r13
+ movl 8(%r15),%ecx
+ cmoveq %rsp,%r12
+ movl 12(%r15),%edx
+ movl 16(%r15),%r8d
+ movl 20(%r15),%r9d
+ movl 24(%r15),%r10d
+ movl 28(%r15),%r11d
+ vmovdqu 0-128(%rdi),%xmm10
+ jmp .Loop_avx2
+.align 16
+.Loop_avx2:
+ vmovdqa K256+512(%rip),%ymm7
+ vmovdqu -64+0(%rsi,%r13,1),%xmm0
+ vmovdqu -64+16(%rsi,%r13,1),%xmm1
+ vmovdqu -64+32(%rsi,%r13,1),%xmm2
+ vmovdqu -64+48(%rsi,%r13,1),%xmm3
+
+ vinserti128 $1,(%r12),%ymm0,%ymm0
+ vinserti128 $1,16(%r12),%ymm1,%ymm1
+ vpshufb %ymm7,%ymm0,%ymm0
+ vinserti128 $1,32(%r12),%ymm2,%ymm2
+ vpshufb %ymm7,%ymm1,%ymm1
+ vinserti128 $1,48(%r12),%ymm3,%ymm3
+
+ leaq K256(%rip),%rbp
+ vpshufb %ymm7,%ymm2,%ymm2
+ leaq -64(%r13),%r13
+ vpaddd 0(%rbp),%ymm0,%ymm4
+ vpshufb %ymm7,%ymm3,%ymm3
+ vpaddd 32(%rbp),%ymm1,%ymm5
+ vpaddd 64(%rbp),%ymm2,%ymm6
+ vpaddd 96(%rbp),%ymm3,%ymm7
+ vmovdqa %ymm4,0(%rsp)
+ xorl %r14d,%r14d
+ vmovdqa %ymm5,32(%rsp)
+
+ movq 120(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ leaq -64(%rsp),%rsp
+
+
+
+ movq %rsi,-8(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ movl %ebx,%esi
+ vmovdqa %ymm6,0(%rsp)
+ xorl %ecx,%esi
+ vmovdqa %ymm7,32(%rsp)
+ movl %r9d,%r12d
+ subq $-32*4,%rbp
+ jmp .Lavx2_00_47
+
+.align 16
+.Lavx2_00_47:
+ vmovdqu (%r13),%xmm9
+ vpinsrq $0,%r13,%xmm15,%xmm15
+ leaq -64(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08
+
+ pushq 64-8(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08
+ leaq 8(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ vpalignr $4,%ymm0,%ymm1,%ymm4
+ addl 0+128(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ vpalignr $4,%ymm2,%ymm3,%ymm7
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ vpsrld $7,%ymm4,%ymm6
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ vpaddd %ymm7,%ymm0,%ymm0
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%esi
+ vpxor %xmm10,%xmm9,%xmm9
+ vmovdqu 16-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ebx,%esi
+ vpshufd $250,%ymm3,%ymm7
+ xorl %r13d,%r14d
+ leal (%r11,%rsi,1),%r11d
+ movl %r8d,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 4+128(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%edx,%esi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ vpslld $11,%ymm5,%ymm5
+ andnl %r9d,%edx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%edx,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%esi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%esi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ vpsrlq $17,%ymm7,%ymm7
+ andl %esi,%r15d
+ vpxor %xmm8,%xmm9,%xmm9
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ vpaddd %ymm4,%ymm0,%ymm0
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 8+128(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ vpshufd $132,%ymm6,%ymm6
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ vpsrldq $8,%ymm6,%ymm6
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ vpaddd %ymm6,%ymm0,%ymm0
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ vpshufd $80,%ymm0,%ymm7
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r11d,%esi
+ vpsrld $10,%ymm7,%ymm6
+ xorl %r13d,%r14d
+ leal (%r9,%rsi,1),%r9d
+ movl %ecx,%r12d
+ vpsrlq $17,%ymm7,%ymm7
+ addl 12+128(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ vpxor %ymm7,%ymm6,%ymm6
+ rorxl $11,%ebx,%esi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ vpsrlq $2,%ymm7,%ymm7
+ andnl %edx,%ebx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%ebx,%r14d
+ vpxor %ymm7,%ymm6,%ymm6
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%esi
+ vpshufd $232,%ymm6,%ymm6
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%esi
+ vpslldq $8,%ymm6,%ymm6
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ vpaddd %ymm6,%ymm0,%ymm0
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ vpaddd 0(%rbp),%ymm0,%ymm6
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ vmovdqa %ymm6,0(%rsp)
+ vpalignr $4,%ymm1,%ymm2,%ymm4
+ addl 32+128(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ vpalignr $4,%ymm3,%ymm0,%ymm7
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ vpsrld $7,%ymm4,%ymm6
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ vpaddd %ymm7,%ymm1,%ymm1
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r9d,%esi
+ vpshufd $250,%ymm0,%ymm7
+ xorl %r13d,%r14d
+ leal (%rdx,%rsi,1),%edx
+ movl %eax,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 36+128(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%r11d,%esi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ vpslld $11,%ymm5,%ymm5
+ andnl %ebx,%r11d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r11d,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%esi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%esi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ vpsrlq $17,%ymm7,%ymm7
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ vpaddd %ymm4,%ymm1,%ymm1
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 40+128(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ vpshufd $132,%ymm6,%ymm6
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ vpsrldq $8,%ymm6,%ymm6
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ vpaddd %ymm6,%ymm1,%ymm1
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ vpshufd $80,%ymm1,%ymm7
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %edx,%esi
+ vpsrld $10,%ymm7,%ymm6
+ xorl %r13d,%r14d
+ leal (%rbx,%rsi,1),%ebx
+ movl %r10d,%r12d
+ vpsrlq $17,%ymm7,%ymm7
+ addl 44+128(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ vpxor %ymm7,%ymm6,%ymm6
+ rorxl $11,%r9d,%esi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ vpsrlq $2,%ymm7,%ymm7
+ andnl %r11d,%r9d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r9d,%r14d
+ vpxor %ymm7,%ymm6,%ymm6
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%esi
+ vpshufd $232,%ymm6,%ymm6
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%esi
+ vpslldq $8,%ymm6,%ymm6
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ vpaddd %ymm6,%ymm1,%ymm1
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ vpaddd 32(%rbp),%ymm1,%ymm6
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ vmovdqa %ymm6,32(%rsp)
+ leaq -64(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08
+
+ pushq 64-8(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08
+ leaq 8(%rsp),%rsp
+.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+ vpalignr $4,%ymm2,%ymm3,%ymm4
+ addl 0+128(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ vpalignr $4,%ymm0,%ymm1,%ymm7
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ vpsrld $7,%ymm4,%ymm6
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ vpaddd %ymm7,%ymm2,%ymm2
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ebx,%esi
+ vpshufd $250,%ymm1,%ymm7
+ xorl %r13d,%r14d
+ leal (%r11,%rsi,1),%r11d
+ movl %r8d,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 4+128(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%edx,%esi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ vpslld $11,%ymm5,%ymm5
+ andnl %r9d,%edx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%edx,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%esi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%esi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ vpsrlq $17,%ymm7,%ymm7
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ vpaddd %ymm4,%ymm2,%ymm2
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 8+128(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ vpshufd $132,%ymm6,%ymm6
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ vpsrldq $8,%ymm6,%ymm6
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ vpaddd %ymm6,%ymm2,%ymm2
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ vpshufd $80,%ymm2,%ymm7
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r11d,%esi
+ vpsrld $10,%ymm7,%ymm6
+ xorl %r13d,%r14d
+ leal (%r9,%rsi,1),%r9d
+ movl %ecx,%r12d
+ vpsrlq $17,%ymm7,%ymm7
+ addl 12+128(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ vpxor %ymm7,%ymm6,%ymm6
+ rorxl $11,%ebx,%esi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ vpsrlq $2,%ymm7,%ymm7
+ andnl %edx,%ebx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%ebx,%r14d
+ vpxor %ymm7,%ymm6,%ymm6
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%esi
+ vpshufd $232,%ymm6,%ymm6
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%esi
+ vpslldq $8,%ymm6,%ymm6
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ vpaddd %ymm6,%ymm2,%ymm2
+ andl %esi,%r15d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ vpaddd 64(%rbp),%ymm2,%ymm6
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ vmovdqa %ymm6,0(%rsp)
+ vpalignr $4,%ymm3,%ymm0,%ymm4
+ addl 32+128(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ vpalignr $4,%ymm1,%ymm2,%ymm7
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ vpsrld $7,%ymm4,%ymm6
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ vpaddd %ymm7,%ymm3,%ymm3
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ vpsrld $3,%ymm4,%ymm7
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ vpslld $14,%ymm4,%ymm5
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ vpxor %ymm6,%ymm7,%ymm4
+ andl %r15d,%esi
+ vpand %xmm12,%xmm11,%xmm8
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r9d,%esi
+ vpshufd $250,%ymm2,%ymm7
+ xorl %r13d,%r14d
+ leal (%rdx,%rsi,1),%edx
+ movl %eax,%r12d
+ vpsrld $11,%ymm6,%ymm6
+ addl 36+128(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $11,%r11d,%esi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ vpslld $11,%ymm5,%ymm5
+ andnl %ebx,%r11d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r11d,%r14d
+ vpxor %ymm6,%ymm4,%ymm4
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%esi
+ vpsrld $10,%ymm7,%ymm6
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%esi
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ vpsrlq $17,%ymm7,%ymm7
+ andl %esi,%r15d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ vpaddd %ymm4,%ymm3,%ymm3
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ vpxor %ymm7,%ymm6,%ymm6
+ addl 40+128(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ vpsrlq $2,%ymm7,%ymm7
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ vpxor %ymm7,%ymm6,%ymm6
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ vpshufd $132,%ymm6,%ymm6
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ vpsrldq $8,%ymm6,%ymm6
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ vpaddd %ymm6,%ymm3,%ymm3
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ vpshufd $80,%ymm3,%ymm7
+ andl %r15d,%esi
+ vpand %xmm13,%xmm11,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %edx,%esi
+ vpsrld $10,%ymm7,%ymm6
+ xorl %r13d,%r14d
+ leal (%rbx,%rsi,1),%ebx
+ movl %r10d,%r12d
+ vpsrlq $17,%ymm7,%ymm7
+ addl 44+128(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ vpxor %ymm7,%ymm6,%ymm6
+ rorxl $11,%r9d,%esi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ vpsrlq $2,%ymm7,%ymm7
+ andnl %r11d,%r9d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r9d,%r14d
+ vpxor %ymm7,%ymm6,%ymm6
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%esi
+ vpshufd $232,%ymm6,%ymm6
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%esi
+ vpslldq $8,%ymm6,%ymm6
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ vpaddd %ymm6,%ymm3,%ymm3
+ andl %esi,%r15d
+ vpor %xmm11,%xmm8,%xmm8
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vmovdqu 0-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ vpaddd 96(%rbp),%ymm3,%ymm6
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ vmovdqa %ymm6,32(%rsp)
+ vmovq %xmm15,%r13
+ vpextrq $1,%xmm15,%r15
+ vpand %xmm14,%xmm11,%xmm11
+ vpor %xmm11,%xmm8,%xmm8
+ vmovdqu %xmm8,(%r15,%r13,1)
+ leaq 16(%r13),%r13
+ leaq 128(%rbp),%rbp
+ cmpb $0,3(%rbp)
+ jne .Lavx2_00_47
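+/* Tail of the AVX2 pass: the remaining SHA-256 rounds consume W+K values
+   already stored on the stack (no further message-schedule work), while the
+   interleaved AES-NI CBC encryption carries on. */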
+ vmovdqu (%r13),%xmm9
+ vpinsrq $0,%r13,%xmm15,%xmm15
+ addl 0+64(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%esi
+ vpxor %xmm10,%xmm9,%xmm9
+ vmovdqu 16-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ebx,%esi
+ xorl %r13d,%r14d
+ leal (%r11,%rsi,1),%r11d
+ movl %r8d,%r12d
+ addl 4+64(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%esi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%esi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%esi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %esi,%r15d
+ vpxor %xmm8,%xmm9,%xmm9
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8+64(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r11d,%esi
+ xorl %r13d,%r14d
+ leal (%r9,%rsi,1),%r9d
+ movl %ecx,%r12d
+ addl 12+64(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%esi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%esi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%esi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32+64(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r9d,%esi
+ xorl %r13d,%r14d
+ leal (%rdx,%rsi,1),%edx
+ movl %eax,%r12d
+ addl 36+64(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%esi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%esi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%esi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40+64(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %edx,%esi
+ xorl %r13d,%r14d
+ leal (%rbx,%rsi,1),%ebx
+ movl %r10d,%r12d
+ addl 44+64(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%esi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%esi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%esi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ addl 0(%rsp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ebx,%esi
+ xorl %r13d,%r14d
+ leal (%r11,%rsi,1),%r11d
+ movl %r8d,%r12d
+ addl 4(%rsp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%esi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%esi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%esi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8(%rsp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r11d,%esi
+ xorl %r13d,%r14d
+ leal (%r9,%rsi,1),%r9d
+ movl %ecx,%r12d
+ addl 12(%rsp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%esi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%esi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%esi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %esi,%r15d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32(%rsp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%esi
+ vpand %xmm12,%xmm11,%xmm8
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r9d,%esi
+ xorl %r13d,%r14d
+ leal (%rdx,%rsi,1),%edx
+ movl %eax,%r12d
+ addl 36(%rsp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%esi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%esi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%esi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %esi,%r15d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40(%rsp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%esi
+ vpand %xmm13,%xmm11,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %edx,%esi
+ xorl %r13d,%r14d
+ leal (%rbx,%rsi,1),%ebx
+ movl %r10d,%r12d
+ addl 44(%rsp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%esi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%esi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%esi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %esi,%r15d
+ vpor %xmm11,%xmm8,%xmm8
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vmovdqu 0-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ vpextrq $1,%xmm15,%r12
+ vmovq %xmm15,%r13
+ movq 552(%rsp),%r15
+ addl %r14d,%eax
+ leaq 448(%rsp),%rbp
+
+ vpand %xmm14,%xmm11,%xmm11
+ vpor %xmm11,%xmm8,%xmm8
+ vmovdqu %xmm8,(%r12,%r13,1)
+ leaq 16(%r13),%r13
+
+ addl 0(%r15),%eax
+ addl 4(%r15),%ebx
+ addl 8(%r15),%ecx
+ addl 12(%r15),%edx
+ addl 16(%r15),%r8d
+ addl 20(%r15),%r9d
+ addl 24(%r15),%r10d
+ addl 28(%r15),%r11d
+
+ movl %eax,0(%r15)
+ movl %ebx,4(%r15)
+ movl %ecx,8(%r15)
+ movl %edx,12(%r15)
+ movl %r8d,16(%r15)
+ movl %r9d,20(%r15)
+ movl %r10d,24(%r15)
+ movl %r11d,28(%r15)
+
+ cmpq 80(%rbp),%r13
+ je .Ldone_avx2
+
+ xorl %r14d,%r14d
+ movl %ebx,%esi
+ movl %r9d,%r12d
+ xorl %ecx,%esi
+ jmp .Lower_avx2
+.align 16
+.Lower_avx2:
+ vmovdqu (%r13),%xmm9
+ vpinsrq $0,%r13,%xmm15,%xmm15
+ addl 0+16(%rbp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%esi
+ vpxor %xmm10,%xmm9,%xmm9
+ vmovdqu 16-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ebx,%esi
+ xorl %r13d,%r14d
+ leal (%r11,%rsi,1),%r11d
+ movl %r8d,%r12d
+ addl 4+16(%rbp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%esi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%esi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%esi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %esi,%r15d
+ vpxor %xmm8,%xmm9,%xmm9
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8+16(%rbp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 32-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r11d,%esi
+ xorl %r13d,%r14d
+ leal (%r9,%rsi,1),%r9d
+ movl %ecx,%r12d
+ addl 12+16(%rbp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%esi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%esi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%esi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 48-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32+16(%rbp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 64-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r9d,%esi
+ xorl %r13d,%r14d
+ leal (%rdx,%rsi,1),%edx
+ movl %eax,%r12d
+ addl 36+16(%rbp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%esi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%esi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%esi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 80-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40+16(%rbp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 96-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %edx,%esi
+ xorl %r13d,%r14d
+ leal (%rbx,%rsi,1),%ebx
+ movl %r10d,%r12d
+ addl 44+16(%rbp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%esi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%esi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%esi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 112-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ leaq -64(%rbp),%rbp
+ addl 0+16(%rbp),%r11d
+ andl %r8d,%r12d
+ rorxl $25,%r8d,%r13d
+ rorxl $11,%r8d,%r15d
+ leal (%rax,%r14,1),%eax
+ leal (%r11,%r12,1),%r11d
+ andnl %r10d,%r8d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r8d,%r14d
+ leal (%r11,%r12,1),%r11d
+ xorl %r14d,%r13d
+ movl %eax,%r15d
+ rorxl $22,%eax,%r12d
+ leal (%r11,%r13,1),%r11d
+ xorl %ebx,%r15d
+ rorxl $13,%eax,%r14d
+ rorxl $2,%eax,%r13d
+ leal (%rdx,%r11,1),%edx
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 128-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ebx,%esi
+ xorl %r13d,%r14d
+ leal (%r11,%rsi,1),%r11d
+ movl %r8d,%r12d
+ addl 4+16(%rbp),%r10d
+ andl %edx,%r12d
+ rorxl $25,%edx,%r13d
+ rorxl $11,%edx,%esi
+ leal (%r11,%r14,1),%r11d
+ leal (%r10,%r12,1),%r10d
+ andnl %r9d,%edx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%edx,%r14d
+ leal (%r10,%r12,1),%r10d
+ xorl %r14d,%r13d
+ movl %r11d,%esi
+ rorxl $22,%r11d,%r12d
+ leal (%r10,%r13,1),%r10d
+ xorl %eax,%esi
+ rorxl $13,%r11d,%r14d
+ rorxl $2,%r11d,%r13d
+ leal (%rcx,%r10,1),%ecx
+ andl %esi,%r15d
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 144-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %eax,%r15d
+ xorl %r13d,%r14d
+ leal (%r10,%r15,1),%r10d
+ movl %edx,%r12d
+ addl 8+16(%rbp),%r9d
+ andl %ecx,%r12d
+ rorxl $25,%ecx,%r13d
+ rorxl $11,%ecx,%r15d
+ leal (%r10,%r14,1),%r10d
+ leal (%r9,%r12,1),%r9d
+ andnl %r8d,%ecx,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%ecx,%r14d
+ leal (%r9,%r12,1),%r9d
+ xorl %r14d,%r13d
+ movl %r10d,%r15d
+ rorxl $22,%r10d,%r12d
+ leal (%r9,%r13,1),%r9d
+ xorl %r11d,%r15d
+ rorxl $13,%r10d,%r14d
+ rorxl $2,%r10d,%r13d
+ leal (%rbx,%r9,1),%ebx
+ andl %r15d,%esi
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 160-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r11d,%esi
+ xorl %r13d,%r14d
+ leal (%r9,%rsi,1),%r9d
+ movl %ecx,%r12d
+ addl 12+16(%rbp),%r8d
+ andl %ebx,%r12d
+ rorxl $25,%ebx,%r13d
+ rorxl $11,%ebx,%esi
+ leal (%r9,%r14,1),%r9d
+ leal (%r8,%r12,1),%r8d
+ andnl %edx,%ebx,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%ebx,%r14d
+ leal (%r8,%r12,1),%r8d
+ xorl %r14d,%r13d
+ movl %r9d,%esi
+ rorxl $22,%r9d,%r12d
+ leal (%r8,%r13,1),%r8d
+ xorl %r10d,%esi
+ rorxl $13,%r9d,%r14d
+ rorxl $2,%r9d,%r13d
+ leal (%rax,%r8,1),%eax
+ andl %esi,%r15d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 176-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r10d,%r15d
+ xorl %r13d,%r14d
+ leal (%r8,%r15,1),%r8d
+ movl %ebx,%r12d
+ addl 32+16(%rbp),%edx
+ andl %eax,%r12d
+ rorxl $25,%eax,%r13d
+ rorxl $11,%eax,%r15d
+ leal (%r8,%r14,1),%r8d
+ leal (%rdx,%r12,1),%edx
+ andnl %ecx,%eax,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%eax,%r14d
+ leal (%rdx,%r12,1),%edx
+ xorl %r14d,%r13d
+ movl %r8d,%r15d
+ rorxl $22,%r8d,%r12d
+ leal (%rdx,%r13,1),%edx
+ xorl %r9d,%r15d
+ rorxl $13,%r8d,%r14d
+ rorxl $2,%r8d,%r13d
+ leal (%r11,%rdx,1),%r11d
+ andl %r15d,%esi
+ vpand %xmm12,%xmm11,%xmm8
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 192-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r9d,%esi
+ xorl %r13d,%r14d
+ leal (%rdx,%rsi,1),%edx
+ movl %eax,%r12d
+ addl 36+16(%rbp),%ecx
+ andl %r11d,%r12d
+ rorxl $25,%r11d,%r13d
+ rorxl $11,%r11d,%esi
+ leal (%rdx,%r14,1),%edx
+ leal (%rcx,%r12,1),%ecx
+ andnl %ebx,%r11d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r11d,%r14d
+ leal (%rcx,%r12,1),%ecx
+ xorl %r14d,%r13d
+ movl %edx,%esi
+ rorxl $22,%edx,%r12d
+ leal (%rcx,%r13,1),%ecx
+ xorl %r8d,%esi
+ rorxl $13,%edx,%r14d
+ rorxl $2,%edx,%r13d
+ leal (%r10,%rcx,1),%r10d
+ andl %esi,%r15d
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 208-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %r8d,%r15d
+ xorl %r13d,%r14d
+ leal (%rcx,%r15,1),%ecx
+ movl %r11d,%r12d
+ addl 40+16(%rbp),%ebx
+ andl %r10d,%r12d
+ rorxl $25,%r10d,%r13d
+ rorxl $11,%r10d,%r15d
+ leal (%rcx,%r14,1),%ecx
+ leal (%rbx,%r12,1),%ebx
+ andnl %eax,%r10d,%r12d
+ xorl %r15d,%r13d
+ rorxl $6,%r10d,%r14d
+ leal (%rbx,%r12,1),%ebx
+ xorl %r14d,%r13d
+ movl %ecx,%r15d
+ rorxl $22,%ecx,%r12d
+ leal (%rbx,%r13,1),%ebx
+ xorl %edx,%r15d
+ rorxl $13,%ecx,%r14d
+ rorxl $2,%ecx,%r13d
+ leal (%r9,%rbx,1),%r9d
+ andl %r15d,%esi
+ vpand %xmm13,%xmm11,%xmm11
+ vaesenc %xmm10,%xmm9,%xmm9
+ vmovdqu 224-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %edx,%esi
+ xorl %r13d,%r14d
+ leal (%rbx,%rsi,1),%ebx
+ movl %r10d,%r12d
+ addl 44+16(%rbp),%eax
+ andl %r9d,%r12d
+ rorxl $25,%r9d,%r13d
+ rorxl $11,%r9d,%esi
+ leal (%rbx,%r14,1),%ebx
+ leal (%rax,%r12,1),%eax
+ andnl %r11d,%r9d,%r12d
+ xorl %esi,%r13d
+ rorxl $6,%r9d,%r14d
+ leal (%rax,%r12,1),%eax
+ xorl %r14d,%r13d
+ movl %ebx,%esi
+ rorxl $22,%ebx,%r12d
+ leal (%rax,%r13,1),%eax
+ xorl %ecx,%esi
+ rorxl $13,%ebx,%r14d
+ rorxl $2,%ebx,%r13d
+ leal (%r8,%rax,1),%r8d
+ andl %esi,%r15d
+ vpor %xmm11,%xmm8,%xmm8
+ vaesenclast %xmm10,%xmm9,%xmm11
+ vmovdqu 0-128(%rdi),%xmm10
+ xorl %r12d,%r14d
+ xorl %ecx,%r15d
+ xorl %r13d,%r14d
+ leal (%rax,%r15,1),%eax
+ movl %r9d,%r12d
+ vmovq %xmm15,%r13
+ vpextrq $1,%xmm15,%r15
+ vpand %xmm14,%xmm11,%xmm11
+ vpor %xmm11,%xmm8,%xmm8
+ leaq -64(%rbp),%rbp
+ vmovdqu %xmm8,(%r15,%r13,1)
+ leaq 16(%r13),%r13
+ cmpq %rsp,%rbp
+ jae .Lower_avx2
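+/* One pass over the queued blocks is complete: fold the working registers
+   back into the SHA-256 state below and, if input remains, loop back to
+   .Loop_avx2 for the next pair of 64-byte blocks. */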
+
+ movq 552(%rsp),%r15
+ leaq 64(%r13),%r13
+ movq 560(%rsp),%rsi
+ addl %r14d,%eax
+ leaq 448(%rsp),%rsp
+
+ addl 0(%r15),%eax
+ addl 4(%r15),%ebx
+ addl 8(%r15),%ecx
+ addl 12(%r15),%edx
+ addl 16(%r15),%r8d
+ addl 20(%r15),%r9d
+ addl 24(%r15),%r10d
+ leaq (%rsi,%r13,1),%r12
+ addl 28(%r15),%r11d
+
+ cmpq 64+16(%rsp),%r13
+
+ movl %eax,0(%r15)
+ cmoveq %rsp,%r12
+ movl %ebx,4(%r15)
+ movl %ecx,8(%r15)
+ movl %edx,12(%r15)
+ movl %r8d,16(%r15)
+ movl %r9d,20(%r15)
+ movl %r10d,24(%r15)
+ movl %r11d,28(%r15)
+
+ jbe .Loop_avx2
+ leaq (%rsp),%rbp
+
+
+.cfi_escape 0x0f,0x06,0x76,0xf8,0x00,0x06,0x23,0x08
+
+.Ldone_avx2:
+ movq 64+32(%rbp),%r8
+ movq 64+56(%rbp),%rsi
+.cfi_def_cfa %rsi,8
+ vmovdqu %xmm8,(%r8)
+ vzeroall
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_cbc_sha256_enc_avx2,.-aesni_cbc_sha256_enc_avx2
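+/*
+ * SHA-NI variant.  The routine below performs the SHA-256 rounds with the
+ * sha256rnds2/sha256msg1/sha256msg2 extensions (emitted as .byte sequences
+ * for the benefit of older assemblers) and interleaves them with AES-NI
+ * CBC encryption rounds.
+ */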
+.type aesni_cbc_sha256_enc_shaext,@function
+.align 32
+aesni_cbc_sha256_enc_shaext:
+.cfi_startproc
+ movq 8(%rsp),%r10
+ leaq K256+128(%rip),%rax
+ movdqu (%r9),%xmm1
+ movdqu 16(%r9),%xmm2
+ movdqa 512-128(%rax),%xmm3
+
+ movl 240(%rcx),%r11d
+ subq %rdi,%rsi
+ movups (%rcx),%xmm15
+ movups (%r8),%xmm6
+ movups 16(%rcx),%xmm4
+ leaq 112(%rcx),%rcx
+
+ pshufd $0x1b,%xmm1,%xmm0
+ pshufd $0xb1,%xmm1,%xmm1
+ pshufd $0x1b,%xmm2,%xmm2
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,202,8
+ punpcklqdq %xmm0,%xmm2
+
+ jmp .Loop_shaext
+
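+/* Each .Loop_shaext iteration hashes one 64-byte message block and,
+   interleaved with it, runs four 16-byte AES-CBC encryptions, storing the
+   ciphertext as it goes. */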
+.align 16
+.Loop_shaext:
+ movdqu (%r10),%xmm10
+ movdqu 16(%r10),%xmm11
+ movdqu 32(%r10),%xmm12
+.byte 102,68,15,56,0,211
+ movdqu 48(%r10),%xmm13
+
+ movdqa 0-128(%rax),%xmm0
+ paddd %xmm10,%xmm0
+.byte 102,68,15,56,0,219
+ movdqa %xmm2,%xmm9
+ movdqa %xmm1,%xmm8
+ movups 0(%rdi),%xmm14
+ xorps %xmm15,%xmm14
+ xorps %xmm14,%xmm6
+ movups -80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movups -64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,202
+
+ movdqa 32-128(%rax),%xmm0
+ paddd %xmm11,%xmm0
+.byte 102,68,15,56,0,227
+ leaq 64(%r10),%r10
+ movups -48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movups -32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,202
+
+ movdqa 64-128(%rax),%xmm0
+ paddd %xmm12,%xmm0
+.byte 102,68,15,56,0,235
+.byte 69,15,56,204,211
+ movups -16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm13,%xmm3
+.byte 102,65,15,58,15,220,4
+ paddd %xmm3,%xmm10
+ movups 0(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,202
+
+ movdqa 96-128(%rax),%xmm0
+ paddd %xmm13,%xmm0
+.byte 69,15,56,205,213
+.byte 69,15,56,204,220
+ movups 16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movups 32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movdqa %xmm10,%xmm3
+.byte 102,65,15,58,15,221,4
+ paddd %xmm3,%xmm11
+.byte 15,56,203,202
+ movdqa 128-128(%rax),%xmm0
+ paddd %xmm10,%xmm0
+.byte 69,15,56,205,218
+.byte 69,15,56,204,229
+ movups 48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm11,%xmm3
+.byte 102,65,15,58,15,218,4
+ paddd %xmm3,%xmm12
+ cmpl $11,%r11d
+ jb .Laesenclast1
+ movups 64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ je .Laesenclast1
+ movups 96(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 112(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.Laesenclast1:
+ aesenclast %xmm5,%xmm6
+ movups 16-112(%rcx),%xmm4
+ nop
+.byte 15,56,203,202
+ movups 16(%rdi),%xmm14
+ xorps %xmm15,%xmm14
+ movups %xmm6,0(%rsi,%rdi,1)
+ xorps %xmm14,%xmm6
+ movups -80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ movdqa 160-128(%rax),%xmm0
+ paddd %xmm11,%xmm0
+.byte 69,15,56,205,227
+.byte 69,15,56,204,234
+ movups -64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm12,%xmm3
+.byte 102,65,15,58,15,219,4
+ paddd %xmm3,%xmm13
+ movups -48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 192-128(%rax),%xmm0
+ paddd %xmm12,%xmm0
+.byte 69,15,56,205,236
+.byte 69,15,56,204,211
+ movups -32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm13,%xmm3
+.byte 102,65,15,58,15,220,4
+ paddd %xmm3,%xmm10
+ movups -16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 224-128(%rax),%xmm0
+ paddd %xmm13,%xmm0
+.byte 69,15,56,205,213
+.byte 69,15,56,204,220
+ movups 0(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,65,15,58,15,221,4
+ paddd %xmm3,%xmm11
+ movups 16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 256-128(%rax),%xmm0
+ paddd %xmm10,%xmm0
+.byte 69,15,56,205,218
+.byte 69,15,56,204,229
+ movups 32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm11,%xmm3
+.byte 102,65,15,58,15,218,4
+ paddd %xmm3,%xmm12
+ movups 48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ cmpl $11,%r11d
+ jb .Laesenclast2
+ movups 64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ je .Laesenclast2
+ movups 96(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 112(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.Laesenclast2:
+ aesenclast %xmm5,%xmm6
+ movups 16-112(%rcx),%xmm4
+ nop
+.byte 15,56,203,202
+ movups 32(%rdi),%xmm14
+ xorps %xmm15,%xmm14
+ movups %xmm6,16(%rsi,%rdi,1)
+ xorps %xmm14,%xmm6
+ movups -80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ movdqa 288-128(%rax),%xmm0
+ paddd %xmm11,%xmm0
+.byte 69,15,56,205,227
+.byte 69,15,56,204,234
+ movups -64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm12,%xmm3
+.byte 102,65,15,58,15,219,4
+ paddd %xmm3,%xmm13
+ movups -48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 320-128(%rax),%xmm0
+ paddd %xmm12,%xmm0
+.byte 69,15,56,205,236
+.byte 69,15,56,204,211
+ movups -32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm13,%xmm3
+.byte 102,65,15,58,15,220,4
+ paddd %xmm3,%xmm10
+ movups -16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 352-128(%rax),%xmm0
+ paddd %xmm13,%xmm0
+.byte 69,15,56,205,213
+.byte 69,15,56,204,220
+ movups 0(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,65,15,58,15,221,4
+ paddd %xmm3,%xmm11
+ movups 16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 384-128(%rax),%xmm0
+ paddd %xmm10,%xmm0
+.byte 69,15,56,205,218
+.byte 69,15,56,204,229
+ movups 32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm11,%xmm3
+.byte 102,65,15,58,15,218,4
+ paddd %xmm3,%xmm12
+ movups 48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+ movdqa 416-128(%rax),%xmm0
+ paddd %xmm11,%xmm0
+.byte 69,15,56,205,227
+.byte 69,15,56,204,234
+ cmpl $11,%r11d
+ jb .Laesenclast3
+ movups 64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ je .Laesenclast3
+ movups 96(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 112(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.Laesenclast3:
+ aesenclast %xmm5,%xmm6
+ movups 16-112(%rcx),%xmm4
+ nop
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm12,%xmm3
+.byte 102,65,15,58,15,219,4
+ paddd %xmm3,%xmm13
+ movups 48(%rdi),%xmm14
+ xorps %xmm15,%xmm14
+ movups %xmm6,32(%rsi,%rdi,1)
+ xorps %xmm14,%xmm6
+ movups -80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ movups -64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,202
+
+ movdqa 448-128(%rax),%xmm0
+ paddd %xmm12,%xmm0
+.byte 69,15,56,205,236
+ movdqa %xmm7,%xmm3
+ movups -48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movups -32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,202
+
+ movdqa 480-128(%rax),%xmm0
+ paddd %xmm13,%xmm0
+ movups -16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ movups 0(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movups 16(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.byte 15,56,203,202
+
+ movups 32(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 48(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ cmpl $11,%r11d
+ jb .Laesenclast4
+ movups 64(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 80(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+ je .Laesenclast4
+ movups 96(%rcx),%xmm4
+ aesenc %xmm5,%xmm6
+ movups 112(%rcx),%xmm5
+ aesenc %xmm4,%xmm6
+.Laesenclast4:
+ aesenclast %xmm5,%xmm6
+ movups 16-112(%rcx),%xmm4
+ nop
+
+ paddd %xmm9,%xmm2
+ paddd %xmm8,%xmm1
+
+ decq %rdx
+ movups %xmm6,48(%rsi,%rdi,1)
+ leaq 64(%rdi),%rdi
+ jnz .Loop_shaext
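+/* All blocks done: shuffle the digest words back into canonical order,
+   then store the final CBC chaining value (IV) and the updated SHA-256
+   state. */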
+
+ pshufd $0xb1,%xmm2,%xmm2
+ pshufd $0x1b,%xmm1,%xmm3
+ pshufd $0xb1,%xmm1,%xmm1
+ punpckhqdq %xmm2,%xmm1
+.byte 102,15,58,15,211,8
+
+ movups %xmm6,(%r8)
+ movdqu %xmm1,(%r9)
+ movdqu %xmm2,16(%r9)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_cbc_sha256_enc_shaext,.-aesni_cbc_sha256_enc_shaext