aboutsummaryrefslogtreecommitdiff
path: root/secure/lib/libcrypto/i386/sha1-586.S
diff options
context:
space:
mode:
Diffstat (limited to 'secure/lib/libcrypto/i386/sha1-586.S')
-rw-r--r--secure/lib/libcrypto/i386/sha1-586.S2350
1 files changed, 2350 insertions, 0 deletions
diff --git a/secure/lib/libcrypto/i386/sha1-586.S b/secure/lib/libcrypto/i386/sha1-586.S
index 49e7482b8161..7e90e2d9b1d2 100644
--- a/secure/lib/libcrypto/i386/sha1-586.S
+++ b/secure/lib/libcrypto/i386/sha1-586.S
@@ -25,6 +25,11 @@ sha1_block_data_order:
jz .L001x86
testl $536870912,%ecx
jnz .Lshaext_shortcut
+ andl $268435456,%edx
+ andl $1073741824,%eax
+ orl %edx,%eax
+ cmpl $1342177280,%eax
+ je .Lavx_shortcut
jmp .Lssse3_shortcut
.align 16
.L001x86:
@@ -2782,6 +2787,1176 @@ _sha1_block_data_order_ssse3:
popl %ebp
ret
.size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3
+.type _sha1_block_data_order_avx,@function
+.align 16
+_sha1_block_data_order_avx:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call .L008pic_point
+.L008pic_point:
+ popl %ebp
+ leal .LK_XX_XX-.L008pic_point(%ebp),%ebp
+.Lavx_shortcut:
+ vzeroall
+ vmovdqa (%ebp),%xmm7
+ vmovdqa 16(%ebp),%xmm0
+ vmovdqa 32(%ebp),%xmm1
+ vmovdqa 48(%ebp),%xmm2
+ vmovdqa 64(%ebp),%xmm6
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebp
+ movl 28(%esp),%edx
+ movl %esp,%esi
+ subl $208,%esp
+ andl $-64,%esp
+ vmovdqa %xmm0,112(%esp)
+ vmovdqa %xmm1,128(%esp)
+ vmovdqa %xmm2,144(%esp)
+ shll $6,%edx
+ vmovdqa %xmm7,160(%esp)
+ addl %ebp,%edx
+ vmovdqa %xmm6,176(%esp)
+ addl $64,%ebp
+ movl %edi,192(%esp)
+ movl %ebp,196(%esp)
+ movl %edx,200(%esp)
+ movl %esi,204(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl 16(%edi),%edi
+ movl %ebx,%esi
+ vmovdqu -64(%ebp),%xmm0
+ vmovdqu -48(%ebp),%xmm1
+ vmovdqu -32(%ebp),%xmm2
+ vmovdqu -16(%ebp),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vmovdqa %xmm7,96(%esp)
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm7,%xmm0,%xmm4
+ vpaddd %xmm7,%xmm1,%xmm5
+ vpaddd %xmm7,%xmm2,%xmm6
+ vmovdqa %xmm4,(%esp)
+ movl %ecx,%ebp
+ vmovdqa %xmm5,16(%esp)
+ xorl %edx,%ebp
+ vmovdqa %xmm6,32(%esp)
+ andl %ebp,%esi
+ jmp .L009loop
+.align 16
+.L009loop:
+ shrdl $2,%ebx,%ebx
+ xorl %edx,%esi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ movl %eax,%ebp
+ addl (%esp),%edi
+ vpaddd %xmm3,%xmm7,%xmm7
+ vmovdqa %xmm0,64(%esp)
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrldq $4,%xmm3,%xmm6
+ addl %esi,%edi
+ andl %ebx,%ebp
+ vpxor %xmm0,%xmm4,%xmm4
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ shrdl $7,%eax,%eax
+ xorl %ecx,%ebp
+ vmovdqa %xmm7,48(%esp)
+ movl %edi,%esi
+ addl 4(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ andl %eax,%esi
+ vpsrld $31,%xmm4,%xmm6
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%esi
+ vpslldq $12,%xmm4,%xmm0
+ vpaddd %xmm4,%xmm4,%xmm4
+ movl %edx,%ebp
+ addl 8(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm0,%xmm7
+ vpor %xmm6,%xmm4,%xmm4
+ addl %esi,%ecx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpslld $2,%xmm0,%xmm0
+ shrdl $7,%edx,%edx
+ xorl %eax,%ebp
+ vpxor %xmm7,%xmm4,%xmm4
+ movl %ecx,%esi
+ addl 12(%esp),%ebx
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vpxor %xmm0,%xmm4,%xmm4
+ addl %ebp,%ebx
+ andl %edx,%esi
+ vmovdqa 96(%esp),%xmm0
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ movl %ebx,%ebp
+ addl 16(%esp),%eax
+ vpaddd %xmm4,%xmm0,%xmm0
+ vmovdqa %xmm1,80(%esp)
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrldq $4,%xmm4,%xmm7
+ addl %esi,%eax
+ andl %ecx,%ebp
+ vpxor %xmm1,%xmm5,%xmm5
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm3,%xmm7,%xmm7
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%ebp
+ vmovdqa %xmm0,(%esp)
+ movl %eax,%esi
+ addl 20(%esp),%edi
+ vpxor %xmm7,%xmm5,%xmm5
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ andl %ebx,%esi
+ vpsrld $31,%xmm5,%xmm7
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ vpslldq $12,%xmm5,%xmm1
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %edi,%ebp
+ addl 24(%esp),%edx
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm1,%xmm0
+ vpor %xmm7,%xmm5,%xmm5
+ addl %esi,%edx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm1,%xmm1
+ shrdl $7,%edi,%edi
+ xorl %ebx,%ebp
+ vpxor %xmm0,%xmm5,%xmm5
+ movl %edx,%esi
+ addl 28(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm1,%xmm5,%xmm5
+ addl %ebp,%ecx
+ andl %edi,%esi
+ vmovdqa 112(%esp),%xmm1
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ movl %ecx,%ebp
+ addl 32(%esp),%ebx
+ vpaddd %xmm5,%xmm1,%xmm1
+ vmovdqa %xmm2,96(%esp)
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vpsrldq $4,%xmm5,%xmm0
+ addl %esi,%ebx
+ andl %edx,%ebp
+ vpxor %xmm2,%xmm6,%xmm6
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%ebp
+ vmovdqa %xmm1,16(%esp)
+ movl %ebx,%esi
+ addl 36(%esp),%eax
+ vpxor %xmm0,%xmm6,%xmm6
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ andl %ecx,%esi
+ vpsrld $31,%xmm6,%xmm0
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%esi
+ vpslldq $12,%xmm6,%xmm2
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%ebp
+ addl 40(%esp),%edi
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm1
+ vpor %xmm0,%xmm6,%xmm6
+ addl %esi,%edi
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpslld $2,%xmm2,%xmm2
+ vmovdqa 64(%esp),%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%ebp
+ vpxor %xmm1,%xmm6,%xmm6
+ movl %edi,%esi
+ addl 44(%esp),%edx
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ addl %ebp,%edx
+ andl %eax,%esi
+ vmovdqa 112(%esp),%xmm2
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%esi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ movl %edx,%ebp
+ addl 48(%esp),%ecx
+ vpaddd %xmm6,%xmm2,%xmm2
+ vmovdqa %xmm3,64(%esp)
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpsrldq $4,%xmm6,%xmm1
+ addl %esi,%ecx
+ andl %edi,%ebp
+ vpxor %xmm3,%xmm7,%xmm7
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpxor %xmm5,%xmm1,%xmm1
+ shrdl $7,%edx,%edx
+ xorl %eax,%ebp
+ vmovdqa %xmm2,32(%esp)
+ movl %ecx,%esi
+ addl 52(%esp),%ebx
+ vpxor %xmm1,%xmm7,%xmm7
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ andl %edx,%esi
+ vpsrld $31,%xmm7,%xmm1
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ vpslldq $12,%xmm7,%xmm3
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%ebp
+ addl 56(%esp),%eax
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm2
+ vpor %xmm1,%xmm7,%xmm7
+ addl %esi,%eax
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ vmovdqa 80(%esp),%xmm1
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%ebp
+ vpxor %xmm2,%xmm7,%xmm7
+ movl %eax,%esi
+ addl 60(%esp),%edi
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpxor %xmm3,%xmm7,%xmm7
+ addl %ebp,%edi
+ andl %ebx,%esi
+ vmovdqa 112(%esp),%xmm3
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm6,%xmm7,%xmm2
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ movl %edi,%ebp
+ addl (%esp),%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ vmovdqa %xmm4,80(%esp)
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vmovdqa %xmm3,%xmm4
+ vpaddd %xmm7,%xmm3,%xmm3
+ addl %esi,%edx
+ andl %eax,%ebp
+ vpxor %xmm2,%xmm0,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%ebp
+ vpsrld $30,%xmm0,%xmm2
+ vmovdqa %xmm3,48(%esp)
+ movl %edx,%esi
+ addl 4(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpslld $2,%xmm0,%xmm0
+ addl %ebp,%ecx
+ andl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ addl 8(%esp),%ebx
+ vpor %xmm2,%xmm0,%xmm0
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vmovdqa 96(%esp),%xmm2
+ addl %esi,%ebx
+ andl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 12(%esp),%eax
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm3
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm5,96(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ vmovdqa %xmm4,%xmm5
+ vpaddd %xmm0,%xmm4,%xmm4
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpxor %xmm3,%xmm1,%xmm1
+ addl 20(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm1,%xmm3
+ vmovdqa %xmm4,(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpor %xmm3,%xmm1,%xmm1
+ addl 28(%esp),%ebx
+ xorl %edi,%ebp
+ vmovdqa 64(%esp),%xmm3
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ vmovdqa %xmm6,64(%esp)
+ addl %esi,%eax
+ xorl %edx,%ebp
+ vmovdqa 128(%esp),%xmm6
+ vpaddd %xmm1,%xmm5,%xmm5
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm4,%xmm2,%xmm2
+ addl 36(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm4
+ vmovdqa %xmm5,16(%esp)
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpor %xmm4,%xmm2,%xmm2
+ addl 44(%esp),%ecx
+ xorl %eax,%ebp
+ vmovdqa 80(%esp),%xmm4
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ vmovdqa %xmm7,80(%esp)
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ vmovdqa %xmm6,%xmm7
+ vpaddd %xmm2,%xmm6,%xmm6
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpxor %xmm5,%xmm3,%xmm3
+ addl 52(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm5
+ vmovdqa %xmm6,32(%esp)
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpor %xmm5,%xmm3,%xmm3
+ addl 60(%esp),%edx
+ xorl %ebx,%ebp
+ vmovdqa 96(%esp),%xmm5
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ vpxor %xmm0,%xmm4,%xmm4
+ addl (%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ vmovdqa %xmm0,96(%esp)
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ vmovdqa %xmm7,%xmm0
+ vpaddd %xmm3,%xmm7,%xmm7
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpxor %xmm6,%xmm4,%xmm4
+ addl 4(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm6
+ vmovdqa %xmm7,48(%esp)
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpor %xmm6,%xmm4,%xmm4
+ addl 12(%esp),%edi
+ xorl %ecx,%ebp
+ vmovdqa 64(%esp),%xmm6
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ vpxor %xmm6,%xmm5,%xmm5
+ vmovdqa %xmm1,64(%esp)
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ vmovdqa %xmm0,%xmm1
+ vpaddd %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpxor %xmm7,%xmm5,%xmm5
+ addl 20(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm7
+ vmovdqa %xmm0,(%esp)
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpor %xmm7,%xmm5,%xmm5
+ addl 28(%esp),%eax
+ vmovdqa 80(%esp),%xmm7
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ addl 32(%esp),%edi
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovdqa %xmm2,80(%esp)
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ vmovdqa %xmm1,%xmm2
+ vpaddd %xmm5,%xmm1,%xmm1
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ vpxor %xmm0,%xmm6,%xmm6
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 36(%esp),%edx
+ vpsrld $30,%xmm6,%xmm0
+ vmovdqa %xmm1,16(%esp)
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %edi,%esi
+ vpslld $2,%xmm6,%xmm6
+ xorl %ebx,%ebp
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 40(%esp),%ecx
+ andl %eax,%esi
+ vpor %xmm0,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ vmovdqa 96(%esp),%xmm0
+ movl %edx,%ebp
+ xorl %eax,%esi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 44(%esp),%ebx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm1
+ vpxor %xmm3,%xmm7,%xmm7
+ addl 48(%esp),%eax
+ andl %edx,%esi
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ vpxor %xmm0,%xmm7,%xmm7
+ vmovdqa %xmm3,96(%esp)
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ vmovdqa 144(%esp),%xmm3
+ vpaddd %xmm6,%xmm2,%xmm2
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vpxor %xmm1,%xmm7,%xmm7
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%esp),%edi
+ vpsrld $30,%xmm7,%xmm1
+ vmovdqa %xmm2,32(%esp)
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ vpslld $2,%xmm7,%xmm7
+ xorl %ecx,%ebp
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 56(%esp),%edx
+ andl %ebx,%esi
+ vpor %xmm1,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vmovdqa 64(%esp),%xmm1
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 60(%esp),%ecx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm2
+ vpxor %xmm4,%xmm0,%xmm0
+ addl (%esp),%ebx
+ andl %edi,%esi
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ vmovdqa %xmm4,64(%esp)
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ vmovdqa %xmm3,%xmm4
+ vpaddd %xmm7,%xmm3,%xmm3
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ vpxor %xmm2,%xmm0,%xmm0
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 4(%esp),%eax
+ vpsrld $30,%xmm0,%xmm2
+ vmovdqa %xmm3,48(%esp)
+ andl %edx,%ebp
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ vpslld $2,%xmm0,%xmm0
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%esp),%edi
+ andl %ecx,%esi
+ vpor %xmm2,%xmm0,%xmm0
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vmovdqa 80(%esp),%xmm2
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 12(%esp),%edx
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm3
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%esp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm5,80(%esp)
+ movl %edx,%ebp
+ xorl %eax,%esi
+ vmovdqa %xmm4,%xmm5
+ vpaddd %xmm0,%xmm4,%xmm4
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vpxor %xmm3,%xmm1,%xmm1
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 20(%esp),%ebx
+ vpsrld $30,%xmm1,%xmm3
+ vmovdqa %xmm4,(%esp)
+ andl %edi,%ebp
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ vpslld $2,%xmm1,%xmm1
+ xorl %edi,%ebp
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 24(%esp),%eax
+ andl %edx,%esi
+ vpor %xmm3,%xmm1,%xmm1
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ vmovdqa 96(%esp),%xmm3
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%esp),%edi
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%esp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vpxor %xmm3,%xmm2,%xmm2
+ vmovdqa %xmm6,96(%esp)
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ vmovdqa %xmm5,%xmm6
+ vpaddd %xmm1,%xmm5,%xmm5
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ vpxor %xmm4,%xmm2,%xmm2
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 36(%esp),%ecx
+ vpsrld $30,%xmm2,%xmm4
+ vmovdqa %xmm5,16(%esp)
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ movl %edx,%esi
+ vpslld $2,%xmm2,%xmm2
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 40(%esp),%ebx
+ andl %edi,%esi
+ vpor %xmm4,%xmm2,%xmm2
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ vmovdqa 64(%esp),%xmm4
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 44(%esp),%eax
+ andl %edx,%ebp
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ vmovdqa %xmm7,64(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ vmovdqa %xmm6,%xmm7
+ vpaddd %xmm2,%xmm6,%xmm6
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpxor %xmm5,%xmm3,%xmm3
+ addl 52(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm3,%xmm5
+ vmovdqa %xmm6,32(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpor %xmm5,%xmm3,%xmm3
+ addl 60(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl (%esp),%eax
+ vpaddd %xmm3,%xmm7,%xmm7
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vmovdqa %xmm7,48(%esp)
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 4(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 8(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 12(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ movl 196(%esp),%ebp
+ cmpl 200(%esp),%ebp
+ je .L010done
+ vmovdqa 160(%esp),%xmm7
+ vmovdqa 176(%esp),%xmm6
+ vmovdqu (%ebp),%xmm0
+ vmovdqu 16(%ebp),%xmm1
+ vmovdqu 32(%ebp),%xmm2
+ vmovdqu 48(%ebp),%xmm3
+ addl $64,%ebp
+ vpshufb %xmm6,%xmm0,%xmm0
+ movl %ebp,196(%esp)
+ vmovdqa %xmm7,96(%esp)
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ vpshufb %xmm6,%xmm1,%xmm1
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm7,%xmm0,%xmm4
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vmovdqa %xmm4,(%esp)
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ vpshufb %xmm6,%xmm2,%xmm2
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ vpaddd %xmm7,%xmm1,%xmm5
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vmovdqa %xmm5,16(%esp)
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ vpshufb %xmm6,%xmm3,%xmm3
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ vpaddd %xmm7,%xmm2,%xmm6
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vmovdqa %xmm6,32(%esp)
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,%ebx
+ movl %ecx,8(%ebp)
+ xorl %edx,%ebx
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ movl %esi,%ebp
+ andl %ebx,%esi
+ movl %ebp,%ebx
+ jmp .L009loop
+.align 16
+.L010done:
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vzeroall
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ movl 204(%esp),%esp
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _sha1_block_data_order_avx,.-_sha1_block_data_order_avx
.align 64
.LK_XX_XX:
.long 1518500249,1518500249,1518500249,1518500249
@@ -2820,6 +3995,11 @@ sha1_block_data_order:
jz .L001x86
testl $536870912,%ecx
jnz .Lshaext_shortcut
+ andl $268435456,%edx
+ andl $1073741824,%eax
+ orl %edx,%eax
+ cmpl $1342177280,%eax
+ je .Lavx_shortcut
jmp .Lssse3_shortcut
.align 16
.L001x86:
@@ -5577,6 +6757,1176 @@ _sha1_block_data_order_ssse3:
popl %ebp
ret
.size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3
+.type _sha1_block_data_order_avx,@function
+.align 16
+_sha1_block_data_order_avx:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call .L008pic_point
+.L008pic_point:
+ popl %ebp
+ leal .LK_XX_XX-.L008pic_point(%ebp),%ebp
+.Lavx_shortcut:
+ vzeroall
+ vmovdqa (%ebp),%xmm7
+ vmovdqa 16(%ebp),%xmm0
+ vmovdqa 32(%ebp),%xmm1
+ vmovdqa 48(%ebp),%xmm2
+ vmovdqa 64(%ebp),%xmm6
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebp
+ movl 28(%esp),%edx
+ movl %esp,%esi
+ subl $208,%esp
+ andl $-64,%esp
+ vmovdqa %xmm0,112(%esp)
+ vmovdqa %xmm1,128(%esp)
+ vmovdqa %xmm2,144(%esp)
+ shll $6,%edx
+ vmovdqa %xmm7,160(%esp)
+ addl %ebp,%edx
+ vmovdqa %xmm6,176(%esp)
+ addl $64,%ebp
+ movl %edi,192(%esp)
+ movl %ebp,196(%esp)
+ movl %edx,200(%esp)
+ movl %esi,204(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl 16(%edi),%edi
+ movl %ebx,%esi
+ vmovdqu -64(%ebp),%xmm0
+ vmovdqu -48(%ebp),%xmm1
+ vmovdqu -32(%ebp),%xmm2
+ vmovdqu -16(%ebp),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vmovdqa %xmm7,96(%esp)
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm7,%xmm0,%xmm4
+ vpaddd %xmm7,%xmm1,%xmm5
+ vpaddd %xmm7,%xmm2,%xmm6
+ vmovdqa %xmm4,(%esp)
+ movl %ecx,%ebp
+ vmovdqa %xmm5,16(%esp)
+ xorl %edx,%ebp
+ vmovdqa %xmm6,32(%esp)
+ andl %ebp,%esi
+ jmp .L009loop
+.align 16
+.L009loop:
+ shrdl $2,%ebx,%ebx
+ xorl %edx,%esi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ movl %eax,%ebp
+ addl (%esp),%edi
+ vpaddd %xmm3,%xmm7,%xmm7
+ vmovdqa %xmm0,64(%esp)
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrldq $4,%xmm3,%xmm6
+ addl %esi,%edi
+ andl %ebx,%ebp
+ vpxor %xmm0,%xmm4,%xmm4
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ shrdl $7,%eax,%eax
+ xorl %ecx,%ebp
+ vmovdqa %xmm7,48(%esp)
+ movl %edi,%esi
+ addl 4(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ andl %eax,%esi
+ vpsrld $31,%xmm4,%xmm6
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%esi
+ vpslldq $12,%xmm4,%xmm0
+ vpaddd %xmm4,%xmm4,%xmm4
+ movl %edx,%ebp
+ addl 8(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm0,%xmm7
+ vpor %xmm6,%xmm4,%xmm4
+ addl %esi,%ecx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpslld $2,%xmm0,%xmm0
+ shrdl $7,%edx,%edx
+ xorl %eax,%ebp
+ vpxor %xmm7,%xmm4,%xmm4
+ movl %ecx,%esi
+ addl 12(%esp),%ebx
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vpxor %xmm0,%xmm4,%xmm4
+ addl %ebp,%ebx
+ andl %edx,%esi
+ vmovdqa 96(%esp),%xmm0
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ movl %ebx,%ebp
+ addl 16(%esp),%eax
+ vpaddd %xmm4,%xmm0,%xmm0
+ vmovdqa %xmm1,80(%esp)
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrldq $4,%xmm4,%xmm7
+ addl %esi,%eax
+ andl %ecx,%ebp
+ vpxor %xmm1,%xmm5,%xmm5
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm3,%xmm7,%xmm7
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%ebp
+ vmovdqa %xmm0,(%esp)
+ movl %eax,%esi
+ addl 20(%esp),%edi
+ vpxor %xmm7,%xmm5,%xmm5
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ andl %ebx,%esi
+ vpsrld $31,%xmm5,%xmm7
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ vpslldq $12,%xmm5,%xmm1
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %edi,%ebp
+ addl 24(%esp),%edx
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm1,%xmm0
+ vpor %xmm7,%xmm5,%xmm5
+ addl %esi,%edx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm1,%xmm1
+ shrdl $7,%edi,%edi
+ xorl %ebx,%ebp
+ vpxor %xmm0,%xmm5,%xmm5
+ movl %edx,%esi
+ addl 28(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm1,%xmm5,%xmm5
+ addl %ebp,%ecx
+ andl %edi,%esi
+ vmovdqa 112(%esp),%xmm1
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ movl %ecx,%ebp
+ addl 32(%esp),%ebx
+ vpaddd %xmm5,%xmm1,%xmm1
+ vmovdqa %xmm2,96(%esp)
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vpsrldq $4,%xmm5,%xmm0
+ addl %esi,%ebx
+ andl %edx,%ebp
+ vpxor %xmm2,%xmm6,%xmm6
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%ebp
+ vmovdqa %xmm1,16(%esp)
+ movl %ebx,%esi
+ addl 36(%esp),%eax
+ vpxor %xmm0,%xmm6,%xmm6
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ andl %ecx,%esi
+ vpsrld $31,%xmm6,%xmm0
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%esi
+ vpslldq $12,%xmm6,%xmm2
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%ebp
+ addl 40(%esp),%edi
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm1
+ vpor %xmm0,%xmm6,%xmm6
+ addl %esi,%edi
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpslld $2,%xmm2,%xmm2
+ vmovdqa 64(%esp),%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%ebp
+ vpxor %xmm1,%xmm6,%xmm6
+ movl %edi,%esi
+ addl 44(%esp),%edx
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ addl %ebp,%edx
+ andl %eax,%esi
+ vmovdqa 112(%esp),%xmm2
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%esi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ movl %edx,%ebp
+ addl 48(%esp),%ecx
+ vpaddd %xmm6,%xmm2,%xmm2
+ vmovdqa %xmm3,64(%esp)
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpsrldq $4,%xmm6,%xmm1
+ addl %esi,%ecx
+ andl %edi,%ebp
+ vpxor %xmm3,%xmm7,%xmm7
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpxor %xmm5,%xmm1,%xmm1
+ shrdl $7,%edx,%edx
+ xorl %eax,%ebp
+ vmovdqa %xmm2,32(%esp)
+ movl %ecx,%esi
+ addl 52(%esp),%ebx
+ vpxor %xmm1,%xmm7,%xmm7
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ andl %edx,%esi
+ vpsrld $31,%xmm7,%xmm1
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ vpslldq $12,%xmm7,%xmm3
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%ebp
+ addl 56(%esp),%eax
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm2
+ vpor %xmm1,%xmm7,%xmm7
+ addl %esi,%eax
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ vmovdqa 80(%esp),%xmm1
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%ebp
+ vpxor %xmm2,%xmm7,%xmm7
+ movl %eax,%esi
+ addl 60(%esp),%edi
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpxor %xmm3,%xmm7,%xmm7
+ addl %ebp,%edi
+ andl %ebx,%esi
+ vmovdqa 112(%esp),%xmm3
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm6,%xmm7,%xmm2
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ movl %edi,%ebp
+ addl (%esp),%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ vmovdqa %xmm4,80(%esp)
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vmovdqa %xmm3,%xmm4
+ vpaddd %xmm7,%xmm3,%xmm3
+ addl %esi,%edx
+ andl %eax,%ebp
+ vpxor %xmm2,%xmm0,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%ebp
+ vpsrld $30,%xmm0,%xmm2
+ vmovdqa %xmm3,48(%esp)
+ movl %edx,%esi
+ addl 4(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpslld $2,%xmm0,%xmm0
+ addl %ebp,%ecx
+ andl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ addl 8(%esp),%ebx
+ vpor %xmm2,%xmm0,%xmm0
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vmovdqa 96(%esp),%xmm2
+ addl %esi,%ebx
+ andl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 12(%esp),%eax
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm3
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm5,96(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ vmovdqa %xmm4,%xmm5
+ vpaddd %xmm0,%xmm4,%xmm4
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpxor %xmm3,%xmm1,%xmm1
+ addl 20(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm1,%xmm3
+ vmovdqa %xmm4,(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpor %xmm3,%xmm1,%xmm1
+ addl 28(%esp),%ebx
+ xorl %edi,%ebp
+ vmovdqa 64(%esp),%xmm3
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ vmovdqa %xmm6,64(%esp)
+ addl %esi,%eax
+ xorl %edx,%ebp
+ vmovdqa 128(%esp),%xmm6
+ vpaddd %xmm1,%xmm5,%xmm5
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm4,%xmm2,%xmm2
+ addl 36(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm4
+ vmovdqa %xmm5,16(%esp)
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpor %xmm4,%xmm2,%xmm2
+ addl 44(%esp),%ecx
+ xorl %eax,%ebp
+ vmovdqa 80(%esp),%xmm4
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ vmovdqa %xmm7,80(%esp)
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ vmovdqa %xmm6,%xmm7
+ vpaddd %xmm2,%xmm6,%xmm6
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpxor %xmm5,%xmm3,%xmm3
+ addl 52(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm5
+ vmovdqa %xmm6,32(%esp)
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpor %xmm5,%xmm3,%xmm3
+ addl 60(%esp),%edx
+ xorl %ebx,%ebp
+ vmovdqa 96(%esp),%xmm5
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ vpxor %xmm0,%xmm4,%xmm4
+ addl (%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ vmovdqa %xmm0,96(%esp)
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ vmovdqa %xmm7,%xmm0
+ vpaddd %xmm3,%xmm7,%xmm7
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpxor %xmm6,%xmm4,%xmm4
+ addl 4(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm6
+ vmovdqa %xmm7,48(%esp)
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpor %xmm6,%xmm4,%xmm4
+ addl 12(%esp),%edi
+ xorl %ecx,%ebp
+ vmovdqa 64(%esp),%xmm6
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ vpxor %xmm6,%xmm5,%xmm5
+ vmovdqa %xmm1,64(%esp)
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ vmovdqa %xmm0,%xmm1
+ vpaddd %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpxor %xmm7,%xmm5,%xmm5
+ addl 20(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm7
+ vmovdqa %xmm0,(%esp)
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpor %xmm7,%xmm5,%xmm5
+ addl 28(%esp),%eax
+ vmovdqa 80(%esp),%xmm7
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ addl 32(%esp),%edi
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovdqa %xmm2,80(%esp)
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ vmovdqa %xmm1,%xmm2
+ vpaddd %xmm5,%xmm1,%xmm1
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ vpxor %xmm0,%xmm6,%xmm6
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 36(%esp),%edx
+ vpsrld $30,%xmm6,%xmm0
+ vmovdqa %xmm1,16(%esp)
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %edi,%esi
+ vpslld $2,%xmm6,%xmm6
+ xorl %ebx,%ebp
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 40(%esp),%ecx
+ andl %eax,%esi
+ vpor %xmm0,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ vmovdqa 96(%esp),%xmm0
+ movl %edx,%ebp
+ xorl %eax,%esi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 44(%esp),%ebx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm1
+ vpxor %xmm3,%xmm7,%xmm7
+ addl 48(%esp),%eax
+ andl %edx,%esi
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ vpxor %xmm0,%xmm7,%xmm7
+ vmovdqa %xmm3,96(%esp)
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ vmovdqa 144(%esp),%xmm3
+ vpaddd %xmm6,%xmm2,%xmm2
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vpxor %xmm1,%xmm7,%xmm7
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%esp),%edi
+ vpsrld $30,%xmm7,%xmm1
+ vmovdqa %xmm2,32(%esp)
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ vpslld $2,%xmm7,%xmm7
+ xorl %ecx,%ebp
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 56(%esp),%edx
+ andl %ebx,%esi
+ vpor %xmm1,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vmovdqa 64(%esp),%xmm1
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 60(%esp),%ecx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm2
+ vpxor %xmm4,%xmm0,%xmm0
+ addl (%esp),%ebx
+ andl %edi,%esi
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ vmovdqa %xmm4,64(%esp)
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ vmovdqa %xmm3,%xmm4
+ vpaddd %xmm7,%xmm3,%xmm3
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ vpxor %xmm2,%xmm0,%xmm0
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 4(%esp),%eax
+ vpsrld $30,%xmm0,%xmm2
+ vmovdqa %xmm3,48(%esp)
+ andl %edx,%ebp
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ vpslld $2,%xmm0,%xmm0
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%esp),%edi
+ andl %ecx,%esi
+ vpor %xmm2,%xmm0,%xmm0
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vmovdqa 80(%esp),%xmm2
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 12(%esp),%edx
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm3
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%esp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm5,80(%esp)
+ movl %edx,%ebp
+ xorl %eax,%esi
+ vmovdqa %xmm4,%xmm5
+ vpaddd %xmm0,%xmm4,%xmm4
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vpxor %xmm3,%xmm1,%xmm1
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 20(%esp),%ebx
+ vpsrld $30,%xmm1,%xmm3
+ vmovdqa %xmm4,(%esp)
+ andl %edi,%ebp
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ vpslld $2,%xmm1,%xmm1
+ xorl %edi,%ebp
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 24(%esp),%eax
+ andl %edx,%esi
+ vpor %xmm3,%xmm1,%xmm1
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ vmovdqa 96(%esp),%xmm3
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%esp),%edi
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%esp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vpxor %xmm3,%xmm2,%xmm2
+ vmovdqa %xmm6,96(%esp)
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ vmovdqa %xmm5,%xmm6
+ vpaddd %xmm1,%xmm5,%xmm5
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ vpxor %xmm4,%xmm2,%xmm2
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 36(%esp),%ecx
+ vpsrld $30,%xmm2,%xmm4
+ vmovdqa %xmm5,16(%esp)
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ movl %edx,%esi
+ vpslld $2,%xmm2,%xmm2
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 40(%esp),%ebx
+ andl %edi,%esi
+ vpor %xmm4,%xmm2,%xmm2
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ vmovdqa 64(%esp),%xmm4
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 44(%esp),%eax
+ andl %edx,%ebp
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ vmovdqa %xmm7,64(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ vmovdqa %xmm6,%xmm7
+ vpaddd %xmm2,%xmm6,%xmm6
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpxor %xmm5,%xmm3,%xmm3
+ addl 52(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm3,%xmm5
+ vmovdqa %xmm6,32(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpor %xmm5,%xmm3,%xmm3
+ addl 60(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl (%esp),%eax
+ vpaddd %xmm3,%xmm7,%xmm7
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vmovdqa %xmm7,48(%esp)
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 4(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 8(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 12(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ movl 196(%esp),%ebp
+ cmpl 200(%esp),%ebp
+ je .L010done
+ vmovdqa 160(%esp),%xmm7
+ vmovdqa 176(%esp),%xmm6
+ vmovdqu (%ebp),%xmm0
+ vmovdqu 16(%ebp),%xmm1
+ vmovdqu 32(%ebp),%xmm2
+ vmovdqu 48(%ebp),%xmm3
+ addl $64,%ebp
+ vpshufb %xmm6,%xmm0,%xmm0
+ movl %ebp,196(%esp)
+ vmovdqa %xmm7,96(%esp)
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ vpshufb %xmm6,%xmm1,%xmm1
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm7,%xmm0,%xmm4
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vmovdqa %xmm4,(%esp)
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ vpshufb %xmm6,%xmm2,%xmm2
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ vpaddd %xmm7,%xmm1,%xmm5
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vmovdqa %xmm5,16(%esp)
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ vpshufb %xmm6,%xmm3,%xmm3
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ vpaddd %xmm7,%xmm2,%xmm6
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vmovdqa %xmm6,32(%esp)
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,%ebx
+ movl %ecx,8(%ebp)
+ xorl %edx,%ebx
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ movl %esi,%ebp
+ andl %ebx,%esi
+ movl %ebp,%ebx
+ jmp .L009loop
+.align 16
+.L010done:
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vzeroall
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ movl 204(%esp),%esp
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _sha1_block_data_order_avx,.-_sha1_block_data_order_avx
.align 64
.LK_XX_XX:
.long 1518500249,1518500249,1518500249,1518500249