Diffstat (limited to 'secure/lib/libcrypto/i386/sha256-586.S')
-rw-r--r--  secure/lib/libcrypto/i386/sha256-586.S | 4496
1 file changed, 4454 insertions(+), 42 deletions(-)
diff --git a/secure/lib/libcrypto/i386/sha256-586.S b/secure/lib/libcrypto/i386/sha256-586.S
index 5d8476c1e1bb..7b4205352bdf 100644
--- a/secure/lib/libcrypto/i386/sha256-586.S
+++ b/secure/lib/libcrypto/i386/sha256-586.S
@@ -42,12 +42,13 @@ sha256_block_data_order:
orl %ebx,%ecx
andl $1342177280,%ecx
cmpl $1342177280,%ecx
+ je .L005AVX
testl $512,%ebx
- jnz .L005SSSE3
+ jnz .L006SSSE3
.L003no_xmm:
subl %edi,%eax
cmpl $256,%eax
- jae .L006unrolled
+ jae .L007unrolled
jmp .L002loop
.align 16
.L002loop:
@@ -119,7 +120,7 @@ sha256_block_data_order:
movl %ecx,28(%esp)
movl %edi,32(%esp)
.align 16
-.L00700_15:
+.L00800_15:
movl %edx,%ecx
movl 24(%esp),%esi
rorl $14,%ecx
@@ -157,11 +158,11 @@ sha256_block_data_order:
addl $4,%ebp
addl %ebx,%eax
cmpl $3248222580,%esi
- jne .L00700_15
+ jne .L00800_15
movl 156(%esp),%ecx
- jmp .L00816_63
+ jmp .L00916_63
.align 16
-.L00816_63:
+.L00916_63:
movl %ecx,%ebx
movl 104(%esp),%esi
rorl $11,%ecx
@@ -216,7 +217,7 @@ sha256_block_data_order:
addl $4,%ebp
addl %ebx,%eax
cmpl $3329325298,%esi
- jne .L00816_63
+ jne .L00916_63
movl 356(%esp),%esi
movl 8(%esp),%ebx
movl 16(%esp),%ecx
@@ -260,7 +261,7 @@ sha256_block_data_order:
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0
.align 16
-.L006unrolled:
+.L007unrolled:
leal -96(%esp),%esp
movl (%esi),%eax
movl 4(%esi),%ebp
@@ -277,9 +278,9 @@ sha256_block_data_order:
movl %ebx,20(%esp)
movl %ecx,24(%esp)
movl %esi,28(%esp)
- jmp .L009grand_loop
+ jmp .L010grand_loop
.align 16
-.L009grand_loop:
+.L010grand_loop:
movl (%edi),%ebx
movl 4(%edi),%ecx
bswap %ebx
@@ -3159,7 +3160,7 @@ sha256_block_data_order:
movl %ebx,24(%esp)
movl %ecx,28(%esp)
cmpl 104(%esp),%edi
- jb .L009grand_loop
+ jb .L010grand_loop
movl 108(%esp),%esp
popl %edi
popl %esi
@@ -3178,9 +3179,9 @@ sha256_block_data_order:
pshufd $27,%xmm2,%xmm2
.byte 102,15,58,15,202,8
punpcklqdq %xmm0,%xmm2
- jmp .L010loop_shaext
+ jmp .L011loop_shaext
.align 16
-.L010loop_shaext:
+.L011loop_shaext:
movdqu (%edi),%xmm3
movdqu 16(%edi),%xmm4
movdqu 32(%edi),%xmm5
@@ -3350,7 +3351,7 @@ sha256_block_data_order:
.byte 15,56,203,202
paddd 16(%esp),%xmm2
paddd (%esp),%xmm1
- jnz .L010loop_shaext
+ jnz .L011loop_shaext
pshufd $177,%xmm2,%xmm2
pshufd $27,%xmm1,%xmm7
pshufd $177,%xmm1,%xmm1
@@ -3365,7 +3366,7 @@ sha256_block_data_order:
popl %ebp
ret
.align 32
-.L005SSSE3:
+.L006SSSE3:
leal -96(%esp),%esp
movl (%esi),%eax
movl 4(%esi),%ebx
@@ -3384,9 +3385,9 @@ sha256_block_data_order:
movl %ecx,24(%esp)
movl %esi,28(%esp)
movdqa 256(%ebp),%xmm7
- jmp .L011grand_ssse3
+ jmp .L012grand_ssse3
.align 16
-.L011grand_ssse3:
+.L012grand_ssse3:
movdqu (%edi),%xmm0
movdqu 16(%edi),%xmm1
movdqu 32(%edi),%xmm2
@@ -3409,9 +3410,9 @@ sha256_block_data_order:
paddd %xmm3,%xmm7
movdqa %xmm6,64(%esp)
movdqa %xmm7,80(%esp)
- jmp .L012ssse3_00_47
+ jmp .L013ssse3_00_47
.align 16
-.L012ssse3_00_47:
+.L013ssse3_00_47:
addl $64,%ebp
movl %edx,%ecx
movdqa %xmm1,%xmm4
@@ -4054,7 +4055,7 @@ sha256_block_data_order:
addl %ecx,%eax
movdqa %xmm6,80(%esp)
cmpl $66051,64(%ebp)
- jne .L012ssse3_00_47
+ jne .L013ssse3_00_47
movl %edx,%ecx
rorl $14,%edx
movl 20(%esp),%esi
@@ -4568,13 +4569,2218 @@ sha256_block_data_order:
movdqa 64(%ebp),%xmm7
subl $192,%ebp
cmpl 104(%esp),%edi
- jb .L011grand_ssse3
+ jb .L012grand_ssse3
movl 108(%esp),%esp
popl %edi
popl %esi
popl %ebx
popl %ebp
ret
+.align 32
+.L005AVX:
+ andl $264,%edx
+ cmpl $264,%edx
+ je .L014AVX_BMI
+ leal -96(%esp),%esp
+ vzeroall
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%edi
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ vmovdqa 256(%ebp),%xmm7
+ jmp .L015grand_avx
+.align 32
+.L015grand_avx:
+ vmovdqu (%edi),%xmm0
+ vmovdqu 16(%edi),%xmm1
+ vmovdqu 32(%edi),%xmm2
+ vmovdqu 48(%edi),%xmm3
+ addl $64,%edi
+ vpshufb %xmm7,%xmm0,%xmm0
+ movl %edi,100(%esp)
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd (%ebp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 16(%ebp),%xmm1,%xmm5
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ vpaddd 48(%ebp),%xmm3,%xmm7
+ vmovdqa %xmm4,32(%esp)
+ vmovdqa %xmm5,48(%esp)
+ vmovdqa %xmm6,64(%esp)
+ vmovdqa %xmm7,80(%esp)
+ jmp .L016avx_00_47
+.align 16
+.L016avx_00_47:
+ addl $64,%ebp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ vpshufd $250,%xmm3,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ vpaddd %xmm4,%xmm0,%xmm0
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm0,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm0,%xmm0
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ vpaddd (%ebp),%xmm0,%xmm6
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,32(%esp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ vpshufd $250,%xmm0,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ vpaddd %xmm4,%xmm1,%xmm1
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm1,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm1,%xmm1
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ vpaddd 16(%ebp),%xmm1,%xmm6
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,48(%esp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ vpshufd $250,%xmm1,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ vpaddd %xmm4,%xmm2,%xmm2
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm2,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm2,%xmm2
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,64(%esp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ vpshufd $250,%xmm2,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ vpaddd %xmm4,%xmm3,%xmm3
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm3,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm3,%xmm3
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ vpaddd 48(%ebp),%xmm3,%xmm6
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne .L016avx_00_47
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebx
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp)
+ xorl %edi,%ebx
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi
+ vmovdqa 64(%ebp),%xmm7
+ subl $192,%ebp
+ cmpl 104(%esp),%edi
+ jb .L015grand_avx
+ movl 108(%esp),%esp
+ vzeroall
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 32
+.L014AVX_BMI:
+ leal -96(%esp),%esp
+ vzeroall
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%edi
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ vmovdqa 256(%ebp),%xmm7
+ jmp .L017grand_avx_bmi
+.align 32
+.L017grand_avx_bmi:
+ vmovdqu (%edi),%xmm0
+ vmovdqu 16(%edi),%xmm1
+ vmovdqu 32(%edi),%xmm2
+ vmovdqu 48(%edi),%xmm3
+ addl $64,%edi
+ vpshufb %xmm7,%xmm0,%xmm0
+ movl %edi,100(%esp)
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd (%ebp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 16(%ebp),%xmm1,%xmm5
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ vpaddd 48(%ebp),%xmm3,%xmm7
+ vmovdqa %xmm4,32(%esp)
+ vmovdqa %xmm5,48(%esp)
+ vmovdqa %xmm6,64(%esp)
+ vmovdqa %xmm7,80(%esp)
+ jmp .L018avx_bmi_00_47
+.align 16
+.L018avx_bmi_00_47:
+ addl $64,%ebp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ vpaddd %xmm7,%xmm0,%xmm0
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 32(%esp),%edx
+ vpshufd $250,%xmm3,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 36(%esp),%edx
+ vpaddd %xmm4,%xmm0,%xmm0
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm0,%xmm0
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm0,%xmm7
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 40(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm0,%xmm0
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 44(%esp),%edx
+ vpaddd (%ebp),%xmm0,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,32(%esp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ vpaddd %xmm7,%xmm1,%xmm1
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 48(%esp),%edx
+ vpshufd $250,%xmm0,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 52(%esp),%edx
+ vpaddd %xmm4,%xmm1,%xmm1
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm1,%xmm1
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm1,%xmm7
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 56(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm1,%xmm1
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 60(%esp),%edx
+ vpaddd 16(%ebp),%xmm1,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,48(%esp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ vpaddd %xmm7,%xmm2,%xmm2
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 64(%esp),%edx
+ vpshufd $250,%xmm1,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 68(%esp),%edx
+ vpaddd %xmm4,%xmm2,%xmm2
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm2,%xmm2
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm2,%xmm7
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 72(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm2,%xmm2
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 76(%esp),%edx
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,64(%esp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ vpaddd %xmm7,%xmm3,%xmm3
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 80(%esp),%edx
+ vpshufd $250,%xmm2,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 84(%esp),%edx
+ vpaddd %xmm4,%xmm3,%xmm3
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm3,%xmm3
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm3,%xmm7
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 88(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm3,%xmm3
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 92(%esp),%edx
+ vpaddd 48(%ebp),%xmm3,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne .L018avx_bmi_00_47
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebx
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp)
+ xorl %edi,%ebx
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi
+ vmovdqa 64(%ebp),%xmm7
+ subl $192,%ebp
+ cmpl 104(%esp),%edi
+ jb .L017grand_avx_bmi
+ movl 108(%esp),%esp
+ vzeroall
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
.size sha256_block_data_order,.-.L_sha256_block_data_order_begin
.comm OPENSSL_ia32cap_P,16,4
#else
@@ -4619,12 +6825,13 @@ sha256_block_data_order:
orl %ebx,%ecx
andl $1342177280,%ecx
cmpl $1342177280,%ecx
+ je .L005AVX
testl $512,%ebx
- jnz .L005SSSE3
+ jnz .L006SSSE3
.L003no_xmm:
subl %edi,%eax
cmpl $256,%eax
- jae .L006unrolled
+ jae .L007unrolled
jmp .L002loop
.align 16
.L002loop:
@@ -4696,7 +6903,7 @@ sha256_block_data_order:
movl %ecx,28(%esp)
movl %edi,32(%esp)
.align 16
-.L00700_15:
+.L00800_15:
movl %edx,%ecx
movl 24(%esp),%esi
rorl $14,%ecx
@@ -4734,11 +6941,11 @@ sha256_block_data_order:
addl $4,%ebp
addl %ebx,%eax
cmpl $3248222580,%esi
- jne .L00700_15
+ jne .L00800_15
movl 156(%esp),%ecx
- jmp .L00816_63
+ jmp .L00916_63
.align 16
-.L00816_63:
+.L00916_63:
movl %ecx,%ebx
movl 104(%esp),%esi
rorl $11,%ecx
@@ -4793,7 +7000,7 @@ sha256_block_data_order:
addl $4,%ebp
addl %ebx,%eax
cmpl $3329325298,%esi
- jne .L00816_63
+ jne .L00916_63
movl 356(%esp),%esi
movl 8(%esp),%ebx
movl 16(%esp),%ecx
@@ -4837,7 +7044,7 @@ sha256_block_data_order:
.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
.byte 62,0
.align 16
-.L006unrolled:
+.L007unrolled:
leal -96(%esp),%esp
movl (%esi),%eax
movl 4(%esi),%ebp
@@ -4854,9 +7061,9 @@ sha256_block_data_order:
movl %ebx,20(%esp)
movl %ecx,24(%esp)
movl %esi,28(%esp)
- jmp .L009grand_loop
+ jmp .L010grand_loop
.align 16
-.L009grand_loop:
+.L010grand_loop:
movl (%edi),%ebx
movl 4(%edi),%ecx
bswap %ebx
@@ -7736,7 +9943,7 @@ sha256_block_data_order:
movl %ebx,24(%esp)
movl %ecx,28(%esp)
cmpl 104(%esp),%edi
- jb .L009grand_loop
+ jb .L010grand_loop
movl 108(%esp),%esp
popl %edi
popl %esi
@@ -7755,9 +9962,9 @@ sha256_block_data_order:
pshufd $27,%xmm2,%xmm2
.byte 102,15,58,15,202,8
punpcklqdq %xmm0,%xmm2
- jmp .L010loop_shaext
+ jmp .L011loop_shaext
.align 16
-.L010loop_shaext:
+.L011loop_shaext:
movdqu (%edi),%xmm3
movdqu 16(%edi),%xmm4
movdqu 32(%edi),%xmm5
@@ -7927,7 +10134,7 @@ sha256_block_data_order:
.byte 15,56,203,202
paddd 16(%esp),%xmm2
paddd (%esp),%xmm1
- jnz .L010loop_shaext
+ jnz .L011loop_shaext
pshufd $177,%xmm2,%xmm2
pshufd $27,%xmm1,%xmm7
pshufd $177,%xmm1,%xmm1
@@ -7942,7 +10149,7 @@ sha256_block_data_order:
popl %ebp
ret
.align 32
-.L005SSSE3:
+.L006SSSE3:
leal -96(%esp),%esp
movl (%esi),%eax
movl 4(%esi),%ebx
@@ -7961,9 +10168,9 @@ sha256_block_data_order:
movl %ecx,24(%esp)
movl %esi,28(%esp)
movdqa 256(%ebp),%xmm7
- jmp .L011grand_ssse3
+ jmp .L012grand_ssse3
.align 16
-.L011grand_ssse3:
+.L012grand_ssse3:
movdqu (%edi),%xmm0
movdqu 16(%edi),%xmm1
movdqu 32(%edi),%xmm2
@@ -7986,9 +10193,9 @@ sha256_block_data_order:
paddd %xmm3,%xmm7
movdqa %xmm6,64(%esp)
movdqa %xmm7,80(%esp)
- jmp .L012ssse3_00_47
+ jmp .L013ssse3_00_47
.align 16
-.L012ssse3_00_47:
+.L013ssse3_00_47:
addl $64,%ebp
movl %edx,%ecx
movdqa %xmm1,%xmm4
@@ -8631,7 +10838,7 @@ sha256_block_data_order:
addl %ecx,%eax
movdqa %xmm6,80(%esp)
cmpl $66051,64(%ebp)
- jne .L012ssse3_00_47
+ jne .L013ssse3_00_47
movl %edx,%ecx
rorl $14,%edx
movl 20(%esp),%esi
@@ -9145,8 +11352,2213 @@ sha256_block_data_order:
movdqa 64(%ebp),%xmm7
subl $192,%ebp
cmpl 104(%esp),%edi
- jb .L011grand_ssse3
+ jb .L012grand_ssse3
+ movl 108(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 32
+.L005AVX:
+ andl $264,%edx
+ cmpl $264,%edx
+ je .L014AVX_BMI
+ leal -96(%esp),%esp
+ vzeroall
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%edi
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ vmovdqa 256(%ebp),%xmm7
+ jmp .L015grand_avx
+.align 32
+.L015grand_avx:
+ vmovdqu (%edi),%xmm0
+ vmovdqu 16(%edi),%xmm1
+ vmovdqu 32(%edi),%xmm2
+ vmovdqu 48(%edi),%xmm3
+ addl $64,%edi
+ vpshufb %xmm7,%xmm0,%xmm0
+ movl %edi,100(%esp)
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd (%ebp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 16(%ebp),%xmm1,%xmm5
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ vpaddd 48(%ebp),%xmm3,%xmm7
+ vmovdqa %xmm4,32(%esp)
+ vmovdqa %xmm5,48(%esp)
+ vmovdqa %xmm6,64(%esp)
+ vmovdqa %xmm7,80(%esp)
+ jmp .L016avx_00_47
+.align 16
+.L016avx_00_47:
+ addl $64,%ebp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ vpshufd $250,%xmm3,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ vpaddd %xmm4,%xmm0,%xmm0
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm0,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm0,%xmm0
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ vpaddd (%ebp),%xmm0,%xmm6
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,32(%esp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ vpshufd $250,%xmm0,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ vpaddd %xmm4,%xmm1,%xmm1
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm1,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm1,%xmm1
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ vpaddd 16(%ebp),%xmm1,%xmm6
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,48(%esp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ vpshufd $250,%xmm1,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ vpaddd %xmm4,%xmm2,%xmm2
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm2,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm2,%xmm2
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,64(%esp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ vpshufd $250,%xmm2,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ vpaddd %xmm4,%xmm3,%xmm3
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm3,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm3,%xmm3
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ vpaddd 48(%ebp),%xmm3,%xmm6
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne .L016avx_00_47
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebx
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp)
+ xorl %edi,%ebx
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi
+ vmovdqa 64(%ebp),%xmm7
+ subl $192,%ebp
+ cmpl 104(%esp),%edi
+ jb .L015grand_avx
+ movl 108(%esp),%esp
+ vzeroall
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 32
+.L014AVX_BMI:
+ leal -96(%esp),%esp
+ vzeroall
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%edi
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ vmovdqa 256(%ebp),%xmm7
+ jmp .L017grand_avx_bmi
+.align 32
+.L017grand_avx_bmi:
+ vmovdqu (%edi),%xmm0
+ vmovdqu 16(%edi),%xmm1
+ vmovdqu 32(%edi),%xmm2
+ vmovdqu 48(%edi),%xmm3
+ addl $64,%edi
+ vpshufb %xmm7,%xmm0,%xmm0
+ movl %edi,100(%esp)
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd (%ebp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 16(%ebp),%xmm1,%xmm5
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ vpaddd 48(%ebp),%xmm3,%xmm7
+ vmovdqa %xmm4,32(%esp)
+ vmovdqa %xmm5,48(%esp)
+ vmovdqa %xmm6,64(%esp)
+ vmovdqa %xmm7,80(%esp)
+ jmp .L018avx_bmi_00_47
+.align 16
+.L018avx_bmi_00_47:
+ addl $64,%ebp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ vpaddd %xmm7,%xmm0,%xmm0
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 32(%esp),%edx
+ vpshufd $250,%xmm3,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 36(%esp),%edx
+ vpaddd %xmm4,%xmm0,%xmm0
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm0,%xmm0
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm0,%xmm7
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 40(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm0,%xmm0
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 44(%esp),%edx
+ vpaddd (%ebp),%xmm0,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,32(%esp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ vpaddd %xmm7,%xmm1,%xmm1
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 48(%esp),%edx
+ vpshufd $250,%xmm0,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 52(%esp),%edx
+ vpaddd %xmm4,%xmm1,%xmm1
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm1,%xmm1
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm1,%xmm7
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 56(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm1,%xmm1
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 60(%esp),%edx
+ vpaddd 16(%ebp),%xmm1,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,48(%esp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ vpaddd %xmm7,%xmm2,%xmm2
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 64(%esp),%edx
+ vpshufd $250,%xmm1,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 68(%esp),%edx
+ vpaddd %xmm4,%xmm2,%xmm2
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm2,%xmm2
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm2,%xmm7
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 72(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm2,%xmm2
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 76(%esp),%edx
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,64(%esp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ vpsrld $7,%xmm4,%xmm6
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ vpaddd %xmm7,%xmm3,%xmm3
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrld $3,%xmm4,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpslld $14,%xmm4,%xmm5
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpxor %xmm6,%xmm7,%xmm4
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 80(%esp),%edx
+ vpshufd $250,%xmm2,%xmm7
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ vpsrld $11,%xmm6,%xmm6
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpslld $11,%xmm5,%xmm5
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpsrld $10,%xmm7,%xmm6
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpxor %xmm5,%xmm4,%xmm4
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpsrlq $17,%xmm7,%xmm5
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 84(%esp),%edx
+ vpaddd %xmm4,%xmm3,%xmm3
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ vpxor %xmm5,%xmm6,%xmm6
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpsrlq $19,%xmm7,%xmm7
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ vpshufd $132,%xmm6,%xmm7
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ vpsrldq $8,%xmm7,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ vpaddd %xmm7,%xmm3,%xmm3
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ vpshufd $80,%xmm3,%xmm7
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 88(%esp),%edx
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ vpsrlq $17,%xmm7,%xmm5
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ vpsrlq $19,%xmm7,%xmm7
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ vpshufd $232,%xmm6,%xmm7
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ vpslldq $8,%xmm7,%xmm7
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ vpaddd %xmm7,%xmm3,%xmm3
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 92(%esp),%edx
+ vpaddd 48(%ebp),%xmm3,%xmm6
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ vmovdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne .L018avx_bmi_00_47
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,16(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 24(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 20(%esp),%edx
+ movl %eax,(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 4(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ andl %eax,%ebx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 12(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,12(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 20(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 16(%esp),%edx
+ movl %ebx,28(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl (%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ andl %ebx,%eax
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 8(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,8(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 16(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 12(%esp),%edx
+ movl %eax,24(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 28(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ andl %eax,%ebx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 4(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,4(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 12(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 8(%esp),%edx
+ movl %ebx,20(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 24(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ andl %ebx,%eax
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl (%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 8(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 4(%esp),%edx
+ movl %eax,16(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 20(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ andl %eax,%ebx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 28(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,28(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 4(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl (%esp),%edx
+ movl %ebx,12(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 16(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ andl %ebx,%eax
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 24(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,24(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl (%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 28(%esp),%edx
+ movl %eax,8(%esp)
+ orl %esi,%edx
+ rorxl $2,%eax,%edi
+ rorxl $13,%eax,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%eax,%ecx
+ xorl %edi,%esi
+ movl 12(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ andl %eax,%ebx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ addl %edx,%ecx
+ addl 20(%esp),%edx
+ leal (%ebx,%ecx,1),%ebx
+ rorxl $6,%edx,%ecx
+ rorxl $11,%edx,%esi
+ movl %edx,20(%esp)
+ rorxl $25,%edx,%edi
+ xorl %esi,%ecx
+ andnl 28(%esp),%edx,%esi
+ xorl %edi,%ecx
+ andl 24(%esp),%edx
+ movl %ebx,4(%esp)
+ orl %esi,%edx
+ rorxl $2,%ebx,%edi
+ rorxl $13,%ebx,%esi
+ leal (%edx,%ecx,1),%edx
+ rorxl $22,%ebx,%ecx
+ xorl %edi,%esi
+ movl 8(%esp),%edi
+ xorl %esi,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ andl %ebx,%eax
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ addl %edx,%ecx
+ addl 16(%esp),%edx
+ leal (%eax,%ecx,1),%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebx
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp)
+ xorl %edi,%ebx
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi
+ vmovdqa 64(%ebp),%xmm7
+ subl $192,%ebp
+ cmpl 104(%esp),%edi
+ jb .L017grand_avx_bmi
movl 108(%esp),%esp
+ vzeroall
popl %edi
popl %esi
popl %ebx