diff options
Diffstat (limited to 'secure/lib/libcrypto/i386/sha256-586.S')
-rw-r--r-- | secure/lib/libcrypto/i386/sha256-586.S | 4496 |
1 files changed, 4454 insertions, 42 deletions
diff --git a/secure/lib/libcrypto/i386/sha256-586.S b/secure/lib/libcrypto/i386/sha256-586.S index 5d8476c1e1bb..7b4205352bdf 100644 --- a/secure/lib/libcrypto/i386/sha256-586.S +++ b/secure/lib/libcrypto/i386/sha256-586.S @@ -42,12 +42,13 @@ sha256_block_data_order: orl %ebx,%ecx andl $1342177280,%ecx cmpl $1342177280,%ecx + je .L005AVX testl $512,%ebx - jnz .L005SSSE3 + jnz .L006SSSE3 .L003no_xmm: subl %edi,%eax cmpl $256,%eax - jae .L006unrolled + jae .L007unrolled jmp .L002loop .align 16 .L002loop: @@ -119,7 +120,7 @@ sha256_block_data_order: movl %ecx,28(%esp) movl %edi,32(%esp) .align 16 -.L00700_15: +.L00800_15: movl %edx,%ecx movl 24(%esp),%esi rorl $14,%ecx @@ -157,11 +158,11 @@ sha256_block_data_order: addl $4,%ebp addl %ebx,%eax cmpl $3248222580,%esi - jne .L00700_15 + jne .L00800_15 movl 156(%esp),%ecx - jmp .L00816_63 + jmp .L00916_63 .align 16 -.L00816_63: +.L00916_63: movl %ecx,%ebx movl 104(%esp),%esi rorl $11,%ecx @@ -216,7 +217,7 @@ sha256_block_data_order: addl $4,%ebp addl %ebx,%eax cmpl $3329325298,%esi - jne .L00816_63 + jne .L00916_63 movl 356(%esp),%esi movl 8(%esp),%ebx movl 16(%esp),%ecx @@ -260,7 +261,7 @@ sha256_block_data_order: .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 .align 16 -.L006unrolled: +.L007unrolled: leal -96(%esp),%esp movl (%esi),%eax movl 4(%esi),%ebp @@ -277,9 +278,9 @@ sha256_block_data_order: movl %ebx,20(%esp) movl %ecx,24(%esp) movl %esi,28(%esp) - jmp .L009grand_loop + jmp .L010grand_loop .align 16 -.L009grand_loop: +.L010grand_loop: movl (%edi),%ebx movl 4(%edi),%ecx bswap %ebx @@ -3159,7 +3160,7 @@ sha256_block_data_order: movl %ebx,24(%esp) movl %ecx,28(%esp) cmpl 104(%esp),%edi - jb .L009grand_loop + jb .L010grand_loop movl 108(%esp),%esp popl %edi popl %esi @@ -3178,9 +3179,9 @@ sha256_block_data_order: pshufd $27,%xmm2,%xmm2 .byte 102,15,58,15,202,8 punpcklqdq %xmm0,%xmm2 - jmp .L010loop_shaext + jmp .L011loop_shaext .align 16 -.L010loop_shaext: +.L011loop_shaext: movdqu (%edi),%xmm3 movdqu 16(%edi),%xmm4 movdqu 32(%edi),%xmm5 @@ -3350,7 +3351,7 @@ sha256_block_data_order: .byte 15,56,203,202 paddd 16(%esp),%xmm2 paddd (%esp),%xmm1 - jnz .L010loop_shaext + jnz .L011loop_shaext pshufd $177,%xmm2,%xmm2 pshufd $27,%xmm1,%xmm7 pshufd $177,%xmm1,%xmm1 @@ -3365,7 +3366,7 @@ sha256_block_data_order: popl %ebp ret .align 32 -.L005SSSE3: +.L006SSSE3: leal -96(%esp),%esp movl (%esi),%eax movl 4(%esi),%ebx @@ -3384,9 +3385,9 @@ sha256_block_data_order: movl %ecx,24(%esp) movl %esi,28(%esp) movdqa 256(%ebp),%xmm7 - jmp .L011grand_ssse3 + jmp .L012grand_ssse3 .align 16 -.L011grand_ssse3: +.L012grand_ssse3: movdqu (%edi),%xmm0 movdqu 16(%edi),%xmm1 movdqu 32(%edi),%xmm2 @@ -3409,9 +3410,9 @@ sha256_block_data_order: paddd %xmm3,%xmm7 movdqa %xmm6,64(%esp) movdqa %xmm7,80(%esp) - jmp .L012ssse3_00_47 + jmp .L013ssse3_00_47 .align 16 -.L012ssse3_00_47: +.L013ssse3_00_47: addl $64,%ebp movl %edx,%ecx movdqa %xmm1,%xmm4 @@ -4054,7 +4055,7 @@ sha256_block_data_order: addl %ecx,%eax movdqa %xmm6,80(%esp) cmpl $66051,64(%ebp) - jne .L012ssse3_00_47 + jne .L013ssse3_00_47 movl %edx,%ecx rorl $14,%edx movl 20(%esp),%esi @@ -4568,13 +4569,2218 @@ sha256_block_data_order: movdqa 64(%ebp),%xmm7 subl $192,%ebp cmpl 104(%esp),%edi - jb .L011grand_ssse3 + jb .L012grand_ssse3 movl 108(%esp),%esp popl %edi popl %esi popl %ebx popl %ebp ret +.align 32 +.L005AVX: + andl $264,%edx + cmpl $264,%edx + je .L014AVX_BMI + leal -96(%esp),%esp + vzeroall + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + vmovdqa 256(%ebp),%xmm7 + jmp .L015grand_avx +.align 32 +.L015grand_avx: + vmovdqu (%edi),%xmm0 + vmovdqu 16(%edi),%xmm1 + vmovdqu 32(%edi),%xmm2 + vmovdqu 48(%edi),%xmm3 + addl $64,%edi + vpshufb %xmm7,%xmm0,%xmm0 + movl %edi,100(%esp) + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd (%ebp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 16(%ebp),%xmm1,%xmm5 + vpaddd 32(%ebp),%xmm2,%xmm6 + vpaddd 48(%ebp),%xmm3,%xmm7 + vmovdqa %xmm4,32(%esp) + vmovdqa %xmm5,48(%esp) + vmovdqa %xmm6,64(%esp) + vmovdqa %xmm7,80(%esp) + jmp .L016avx_00_47 +.align 16 +.L016avx_00_47: + addl $64,%ebp + vpalignr $4,%xmm0,%xmm1,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm2,%xmm3,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm3,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm0,%xmm0 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm0,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm0,%xmm0 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd (%ebp),%xmm0,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,32(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm3,%xmm0,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm0,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm1,%xmm1 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm1,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm1,%xmm1 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 16(%ebp),%xmm1,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,48(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm0,%xmm1,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm1,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm2,%xmm2 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm2,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm2,%xmm2 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd 32(%ebp),%xmm2,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,64(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm1,%xmm2,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm2,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm3,%xmm3 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm3,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm3,%xmm3 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 48(%ebp),%xmm3,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne .L016avx_00_47 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + vmovdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb .L015grand_avx + movl 108(%esp),%esp + vzeroall + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 32 +.L014AVX_BMI: + leal -96(%esp),%esp + vzeroall + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + vmovdqa 256(%ebp),%xmm7 + jmp .L017grand_avx_bmi +.align 32 +.L017grand_avx_bmi: + vmovdqu (%edi),%xmm0 + vmovdqu 16(%edi),%xmm1 + vmovdqu 32(%edi),%xmm2 + vmovdqu 48(%edi),%xmm3 + addl $64,%edi + vpshufb %xmm7,%xmm0,%xmm0 + movl %edi,100(%esp) + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd (%ebp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 16(%ebp),%xmm1,%xmm5 + vpaddd 32(%ebp),%xmm2,%xmm6 + vpaddd 48(%ebp),%xmm3,%xmm7 + vmovdqa %xmm4,32(%esp) + vmovdqa %xmm5,48(%esp) + vmovdqa %xmm6,64(%esp) + vmovdqa %xmm7,80(%esp) + jmp .L018avx_bmi_00_47 +.align 16 +.L018avx_bmi_00_47: + addl $64,%ebp + vpalignr $4,%xmm0,%xmm1,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + vpaddd %xmm7,%xmm0,%xmm0 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 4(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 28(%esp),%edx + andl %eax,%ebx + addl 32(%esp),%edx + vpshufd $250,%xmm3,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx + addl 12(%esp),%edx + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 20(%esp),%edx,%esi + xorl %edi,%ecx + andl 16(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl (%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 24(%esp),%edx + andl %ebx,%eax + addl 36(%esp),%edx + vpaddd %xmm4,%xmm0,%xmm0 + xorl %edi,%eax + addl %edx,%ecx + addl 8(%esp),%edx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + vpshufd $132,%xmm6,%xmm7 + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm0,%xmm0 + movl 28(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpshufd $80,%xmm0,%xmm7 + addl 20(%esp),%edx + andl %eax,%ebx + addl 40(%esp),%edx + vpsrld $10,%xmm7,%xmm6 + xorl %edi,%ebx + addl %edx,%ecx + addl 4(%esp),%edx + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 12(%esp),%edx,%esi + xorl %edi,%ecx + andl 8(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 + movl 24(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm0,%xmm0 + addl 16(%esp),%edx + andl %ebx,%eax + addl 44(%esp),%edx + vpaddd (%ebp),%xmm0,%xmm6 + xorl %edi,%eax + addl %edx,%ecx + addl (%esp),%edx + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,32(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + vpaddd %xmm7,%xmm1,%xmm1 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 20(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 12(%esp),%edx + andl %eax,%ebx + addl 48(%esp),%edx + vpshufd $250,%xmm0,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx + addl 28(%esp),%edx + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 4(%esp),%edx,%esi + xorl %edi,%ecx + andl (%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl 16(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 8(%esp),%edx + andl %ebx,%eax + addl 52(%esp),%edx + vpaddd %xmm4,%xmm1,%xmm1 + xorl %edi,%eax + addl %edx,%ecx + addl 24(%esp),%edx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + vpshufd $132,%xmm6,%xmm7 + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm1,%xmm1 + movl 12(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpshufd $80,%xmm1,%xmm7 + addl 4(%esp),%edx + andl %eax,%ebx + addl 56(%esp),%edx + vpsrld $10,%xmm7,%xmm6 + xorl %edi,%ebx + addl %edx,%ecx + addl 20(%esp),%edx + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 28(%esp),%edx,%esi + xorl %edi,%ecx + andl 24(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 + movl 8(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm1,%xmm1 + addl (%esp),%edx + andl %ebx,%eax + addl 60(%esp),%edx + vpaddd 16(%ebp),%xmm1,%xmm6 + xorl %edi,%eax + addl %edx,%ecx + addl 16(%esp),%edx + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,48(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + vpalignr $4,%xmm0,%xmm1,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + vpaddd %xmm7,%xmm2,%xmm2 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 4(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 28(%esp),%edx + andl %eax,%ebx + addl 64(%esp),%edx + vpshufd $250,%xmm1,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx + addl 12(%esp),%edx + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 20(%esp),%edx,%esi + xorl %edi,%ecx + andl 16(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl (%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 24(%esp),%edx + andl %ebx,%eax + addl 68(%esp),%edx + vpaddd %xmm4,%xmm2,%xmm2 + xorl %edi,%eax + addl %edx,%ecx + addl 8(%esp),%edx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + vpshufd $132,%xmm6,%xmm7 + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm2,%xmm2 + movl 28(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpshufd $80,%xmm2,%xmm7 + addl 20(%esp),%edx + andl %eax,%ebx + addl 72(%esp),%edx + vpsrld $10,%xmm7,%xmm6 + xorl %edi,%ebx + addl %edx,%ecx + addl 4(%esp),%edx + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 12(%esp),%edx,%esi + xorl %edi,%ecx + andl 8(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 + movl 24(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm2,%xmm2 + addl 16(%esp),%edx + andl %ebx,%eax + addl 76(%esp),%edx + vpaddd 32(%ebp),%xmm2,%xmm6 + xorl %edi,%eax + addl %edx,%ecx + addl (%esp),%edx + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,64(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + vpaddd %xmm7,%xmm3,%xmm3 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 20(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 12(%esp),%edx + andl %eax,%ebx + addl 80(%esp),%edx + vpshufd $250,%xmm2,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx + addl 28(%esp),%edx + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 4(%esp),%edx,%esi + xorl %edi,%ecx + andl (%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl 16(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 8(%esp),%edx + andl %ebx,%eax + addl 84(%esp),%edx + vpaddd %xmm4,%xmm3,%xmm3 + xorl %edi,%eax + addl %edx,%ecx + addl 24(%esp),%edx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + vpshufd $132,%xmm6,%xmm7 + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm3,%xmm3 + movl 12(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpshufd $80,%xmm3,%xmm7 + addl 4(%esp),%edx + andl %eax,%ebx + addl 88(%esp),%edx + vpsrld $10,%xmm7,%xmm6 + xorl %edi,%ebx + addl %edx,%ecx + addl 20(%esp),%edx + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 28(%esp),%edx,%esi + xorl %edi,%ecx + andl 24(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 + movl 8(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm3,%xmm3 + addl (%esp),%edx + andl %ebx,%eax + addl 92(%esp),%edx + vpaddd 48(%ebp),%xmm3,%xmm6 + xorl %edi,%eax + addl %edx,%ecx + addl 16(%esp),%edx + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne .L018avx_bmi_00_47 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 4(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + andl %eax,%ebx + addl 32(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 12(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 20(%esp),%edx,%esi + xorl %edi,%ecx + andl 16(%esp),%edx + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl (%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + andl %ebx,%eax + addl 36(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 8(%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 28(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + andl %eax,%ebx + addl 40(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 4(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 12(%esp),%edx,%esi + xorl %edi,%ecx + andl 8(%esp),%edx + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 24(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + andl %ebx,%eax + addl 44(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl (%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 20(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + andl %eax,%ebx + addl 48(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 28(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 4(%esp),%edx,%esi + xorl %edi,%ecx + andl (%esp),%edx + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 16(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + andl %ebx,%eax + addl 52(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 24(%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 12(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + andl %eax,%ebx + addl 56(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 20(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 28(%esp),%edx,%esi + xorl %edi,%ecx + andl 24(%esp),%edx + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 8(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl (%esp),%edx + andl %ebx,%eax + addl 60(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 16(%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 4(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + andl %eax,%ebx + addl 64(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 12(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 20(%esp),%edx,%esi + xorl %edi,%ecx + andl 16(%esp),%edx + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl (%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + andl %ebx,%eax + addl 68(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 8(%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 28(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + andl %eax,%ebx + addl 72(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 4(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 12(%esp),%edx,%esi + xorl %edi,%ecx + andl 8(%esp),%edx + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 24(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + andl %ebx,%eax + addl 76(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl (%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 20(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + andl %eax,%ebx + addl 80(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 28(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 4(%esp),%edx,%esi + xorl %edi,%ecx + andl (%esp),%edx + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 16(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + andl %ebx,%eax + addl 84(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 24(%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 12(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + andl %eax,%ebx + addl 88(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 20(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 28(%esp),%edx,%esi + xorl %edi,%ecx + andl 24(%esp),%edx + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 8(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl (%esp),%edx + andl %ebx,%eax + addl 92(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 16(%esp),%edx + leal (%eax,%ecx,1),%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + vmovdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb .L017grand_avx_bmi + movl 108(%esp),%esp + vzeroall + popl %edi + popl %esi + popl %ebx + popl %ebp + ret .size sha256_block_data_order,.-.L_sha256_block_data_order_begin .comm OPENSSL_ia32cap_P,16,4 #else @@ -4619,12 +6825,13 @@ sha256_block_data_order: orl %ebx,%ecx andl $1342177280,%ecx cmpl $1342177280,%ecx + je .L005AVX testl $512,%ebx - jnz .L005SSSE3 + jnz .L006SSSE3 .L003no_xmm: subl %edi,%eax cmpl $256,%eax - jae .L006unrolled + jae .L007unrolled jmp .L002loop .align 16 .L002loop: @@ -4696,7 +6903,7 @@ sha256_block_data_order: movl %ecx,28(%esp) movl %edi,32(%esp) .align 16 -.L00700_15: +.L00800_15: movl %edx,%ecx movl 24(%esp),%esi rorl $14,%ecx @@ -4734,11 +6941,11 @@ sha256_block_data_order: addl $4,%ebp addl %ebx,%eax cmpl $3248222580,%esi - jne .L00700_15 + jne .L00800_15 movl 156(%esp),%ecx - jmp .L00816_63 + jmp .L00916_63 .align 16 -.L00816_63: +.L00916_63: movl %ecx,%ebx movl 104(%esp),%esi rorl $11,%ecx @@ -4793,7 +7000,7 @@ sha256_block_data_order: addl $4,%ebp addl %ebx,%eax cmpl $3329325298,%esi - jne .L00816_63 + jne .L00916_63 movl 356(%esp),%esi movl 8(%esp),%ebx movl 16(%esp),%ecx @@ -4837,7 +7044,7 @@ sha256_block_data_order: .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 .align 16 -.L006unrolled: +.L007unrolled: leal -96(%esp),%esp movl (%esi),%eax movl 4(%esi),%ebp @@ -4854,9 +7061,9 @@ sha256_block_data_order: movl %ebx,20(%esp) movl %ecx,24(%esp) movl %esi,28(%esp) - jmp .L009grand_loop + jmp .L010grand_loop .align 16 -.L009grand_loop: +.L010grand_loop: movl (%edi),%ebx movl 4(%edi),%ecx bswap %ebx @@ -7736,7 +9943,7 @@ sha256_block_data_order: movl %ebx,24(%esp) movl %ecx,28(%esp) cmpl 104(%esp),%edi - jb .L009grand_loop + jb .L010grand_loop movl 108(%esp),%esp popl %edi popl %esi @@ -7755,9 +9962,9 @@ sha256_block_data_order: pshufd $27,%xmm2,%xmm2 .byte 102,15,58,15,202,8 punpcklqdq %xmm0,%xmm2 - jmp .L010loop_shaext + jmp .L011loop_shaext .align 16 -.L010loop_shaext: +.L011loop_shaext: movdqu (%edi),%xmm3 movdqu 16(%edi),%xmm4 movdqu 32(%edi),%xmm5 @@ -7927,7 +10134,7 @@ sha256_block_data_order: .byte 15,56,203,202 paddd 16(%esp),%xmm2 paddd (%esp),%xmm1 - jnz .L010loop_shaext + jnz .L011loop_shaext pshufd $177,%xmm2,%xmm2 pshufd $27,%xmm1,%xmm7 pshufd $177,%xmm1,%xmm1 @@ -7942,7 +10149,7 @@ sha256_block_data_order: popl %ebp ret .align 32 -.L005SSSE3: +.L006SSSE3: leal -96(%esp),%esp movl (%esi),%eax movl 4(%esi),%ebx @@ -7961,9 +10168,9 @@ sha256_block_data_order: movl %ecx,24(%esp) movl %esi,28(%esp) movdqa 256(%ebp),%xmm7 - jmp .L011grand_ssse3 + jmp .L012grand_ssse3 .align 16 -.L011grand_ssse3: +.L012grand_ssse3: movdqu (%edi),%xmm0 movdqu 16(%edi),%xmm1 movdqu 32(%edi),%xmm2 @@ -7986,9 +10193,9 @@ sha256_block_data_order: paddd %xmm3,%xmm7 movdqa %xmm6,64(%esp) movdqa %xmm7,80(%esp) - jmp .L012ssse3_00_47 + jmp .L013ssse3_00_47 .align 16 -.L012ssse3_00_47: +.L013ssse3_00_47: addl $64,%ebp movl %edx,%ecx movdqa %xmm1,%xmm4 @@ -8631,7 +10838,7 @@ sha256_block_data_order: addl %ecx,%eax movdqa %xmm6,80(%esp) cmpl $66051,64(%ebp) - jne .L012ssse3_00_47 + jne .L013ssse3_00_47 movl %edx,%ecx rorl $14,%edx movl 20(%esp),%esi @@ -9145,8 +11352,2213 @@ sha256_block_data_order: movdqa 64(%ebp),%xmm7 subl $192,%ebp cmpl 104(%esp),%edi - jb .L011grand_ssse3 + jb .L012grand_ssse3 + movl 108(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 32 +.L005AVX: + andl $264,%edx + cmpl $264,%edx + je .L014AVX_BMI + leal -96(%esp),%esp + vzeroall + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + vmovdqa 256(%ebp),%xmm7 + jmp .L015grand_avx +.align 32 +.L015grand_avx: + vmovdqu (%edi),%xmm0 + vmovdqu 16(%edi),%xmm1 + vmovdqu 32(%edi),%xmm2 + vmovdqu 48(%edi),%xmm3 + addl $64,%edi + vpshufb %xmm7,%xmm0,%xmm0 + movl %edi,100(%esp) + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd (%ebp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 16(%ebp),%xmm1,%xmm5 + vpaddd 32(%ebp),%xmm2,%xmm6 + vpaddd 48(%ebp),%xmm3,%xmm7 + vmovdqa %xmm4,32(%esp) + vmovdqa %xmm5,48(%esp) + vmovdqa %xmm6,64(%esp) + vmovdqa %xmm7,80(%esp) + jmp .L016avx_00_47 +.align 16 +.L016avx_00_47: + addl $64,%ebp + vpalignr $4,%xmm0,%xmm1,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm2,%xmm3,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm3,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm0,%xmm0 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm0,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm0,%xmm0 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd (%ebp),%xmm0,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,32(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm3,%xmm0,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm0,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm1,%xmm1 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm1,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm1,%xmm1 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 16(%ebp),%xmm1,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,48(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm0,%xmm1,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm1,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm2,%xmm2 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm2,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm2,%xmm2 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd 32(%ebp),%xmm2,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,64(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm1,%xmm2,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm2,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm3,%xmm3 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm3,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm3,%xmm3 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 48(%ebp),%xmm3,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne .L016avx_00_47 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + vmovdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb .L015grand_avx + movl 108(%esp),%esp + vzeroall + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 32 +.L014AVX_BMI: + leal -96(%esp),%esp + vzeroall + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + vmovdqa 256(%ebp),%xmm7 + jmp .L017grand_avx_bmi +.align 32 +.L017grand_avx_bmi: + vmovdqu (%edi),%xmm0 + vmovdqu 16(%edi),%xmm1 + vmovdqu 32(%edi),%xmm2 + vmovdqu 48(%edi),%xmm3 + addl $64,%edi + vpshufb %xmm7,%xmm0,%xmm0 + movl %edi,100(%esp) + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd (%ebp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 16(%ebp),%xmm1,%xmm5 + vpaddd 32(%ebp),%xmm2,%xmm6 + vpaddd 48(%ebp),%xmm3,%xmm7 + vmovdqa %xmm4,32(%esp) + vmovdqa %xmm5,48(%esp) + vmovdqa %xmm6,64(%esp) + vmovdqa %xmm7,80(%esp) + jmp .L018avx_bmi_00_47 +.align 16 +.L018avx_bmi_00_47: + addl $64,%ebp + vpalignr $4,%xmm0,%xmm1,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + vpaddd %xmm7,%xmm0,%xmm0 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 4(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 28(%esp),%edx + andl %eax,%ebx + addl 32(%esp),%edx + vpshufd $250,%xmm3,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx + addl 12(%esp),%edx + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 20(%esp),%edx,%esi + xorl %edi,%ecx + andl 16(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl (%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 24(%esp),%edx + andl %ebx,%eax + addl 36(%esp),%edx + vpaddd %xmm4,%xmm0,%xmm0 + xorl %edi,%eax + addl %edx,%ecx + addl 8(%esp),%edx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + vpshufd $132,%xmm6,%xmm7 + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm0,%xmm0 + movl 28(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpshufd $80,%xmm0,%xmm7 + addl 20(%esp),%edx + andl %eax,%ebx + addl 40(%esp),%edx + vpsrld $10,%xmm7,%xmm6 + xorl %edi,%ebx + addl %edx,%ecx + addl 4(%esp),%edx + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 12(%esp),%edx,%esi + xorl %edi,%ecx + andl 8(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 + movl 24(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm0,%xmm0 + addl 16(%esp),%edx + andl %ebx,%eax + addl 44(%esp),%edx + vpaddd (%ebp),%xmm0,%xmm6 + xorl %edi,%eax + addl %edx,%ecx + addl (%esp),%edx + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,32(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + vpaddd %xmm7,%xmm1,%xmm1 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 20(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 12(%esp),%edx + andl %eax,%ebx + addl 48(%esp),%edx + vpshufd $250,%xmm0,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx + addl 28(%esp),%edx + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 4(%esp),%edx,%esi + xorl %edi,%ecx + andl (%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl 16(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 8(%esp),%edx + andl %ebx,%eax + addl 52(%esp),%edx + vpaddd %xmm4,%xmm1,%xmm1 + xorl %edi,%eax + addl %edx,%ecx + addl 24(%esp),%edx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + vpshufd $132,%xmm6,%xmm7 + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm1,%xmm1 + movl 12(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpshufd $80,%xmm1,%xmm7 + addl 4(%esp),%edx + andl %eax,%ebx + addl 56(%esp),%edx + vpsrld $10,%xmm7,%xmm6 + xorl %edi,%ebx + addl %edx,%ecx + addl 20(%esp),%edx + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 28(%esp),%edx,%esi + xorl %edi,%ecx + andl 24(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 + movl 8(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm1,%xmm1 + addl (%esp),%edx + andl %ebx,%eax + addl 60(%esp),%edx + vpaddd 16(%ebp),%xmm1,%xmm6 + xorl %edi,%eax + addl %edx,%ecx + addl 16(%esp),%edx + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,48(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + vpalignr $4,%xmm0,%xmm1,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + vpaddd %xmm7,%xmm2,%xmm2 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 4(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 28(%esp),%edx + andl %eax,%ebx + addl 64(%esp),%edx + vpshufd $250,%xmm1,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx + addl 12(%esp),%edx + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 20(%esp),%edx,%esi + xorl %edi,%ecx + andl 16(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl (%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 24(%esp),%edx + andl %ebx,%eax + addl 68(%esp),%edx + vpaddd %xmm4,%xmm2,%xmm2 + xorl %edi,%eax + addl %edx,%ecx + addl 8(%esp),%edx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + vpshufd $132,%xmm6,%xmm7 + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm2,%xmm2 + movl 28(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpshufd $80,%xmm2,%xmm7 + addl 20(%esp),%edx + andl %eax,%ebx + addl 72(%esp),%edx + vpsrld $10,%xmm7,%xmm6 + xorl %edi,%ebx + addl %edx,%ecx + addl 4(%esp),%edx + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 12(%esp),%edx,%esi + xorl %edi,%ecx + andl 8(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 + movl 24(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm2,%xmm2 + addl 16(%esp),%edx + andl %ebx,%eax + addl 76(%esp),%edx + vpaddd 32(%ebp),%xmm2,%xmm6 + xorl %edi,%eax + addl %edx,%ecx + addl (%esp),%edx + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,64(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + vpaddd %xmm7,%xmm3,%xmm3 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 20(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 12(%esp),%edx + andl %eax,%ebx + addl 80(%esp),%edx + vpshufd $250,%xmm2,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx + addl 28(%esp),%edx + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 4(%esp),%edx,%esi + xorl %edi,%ecx + andl (%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl 16(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 8(%esp),%edx + andl %ebx,%eax + addl 84(%esp),%edx + vpaddd %xmm4,%xmm3,%xmm3 + xorl %edi,%eax + addl %edx,%ecx + addl 24(%esp),%edx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + vpshufd $132,%xmm6,%xmm7 + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm3,%xmm3 + movl 12(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpshufd $80,%xmm3,%xmm7 + addl 4(%esp),%edx + andl %eax,%ebx + addl 88(%esp),%edx + vpsrld $10,%xmm7,%xmm6 + xorl %edi,%ebx + addl %edx,%ecx + addl 20(%esp),%edx + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 28(%esp),%edx,%esi + xorl %edi,%ecx + andl 24(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 + movl 8(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm3,%xmm3 + addl (%esp),%edx + andl %ebx,%eax + addl 92(%esp),%edx + vpaddd 48(%ebp),%xmm3,%xmm6 + xorl %edi,%eax + addl %edx,%ecx + addl 16(%esp),%edx + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne .L018avx_bmi_00_47 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 4(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + andl %eax,%ebx + addl 32(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 12(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 20(%esp),%edx,%esi + xorl %edi,%ecx + andl 16(%esp),%edx + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl (%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + andl %ebx,%eax + addl 36(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 8(%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 28(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + andl %eax,%ebx + addl 40(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 4(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 12(%esp),%edx,%esi + xorl %edi,%ecx + andl 8(%esp),%edx + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 24(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + andl %ebx,%eax + addl 44(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl (%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 20(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + andl %eax,%ebx + addl 48(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 28(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 4(%esp),%edx,%esi + xorl %edi,%ecx + andl (%esp),%edx + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 16(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + andl %ebx,%eax + addl 52(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 24(%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 12(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + andl %eax,%ebx + addl 56(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 20(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 28(%esp),%edx,%esi + xorl %edi,%ecx + andl 24(%esp),%edx + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 8(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl (%esp),%edx + andl %ebx,%eax + addl 60(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 16(%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 4(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + andl %eax,%ebx + addl 64(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 12(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 20(%esp),%edx,%esi + xorl %edi,%ecx + andl 16(%esp),%edx + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl (%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + andl %ebx,%eax + addl 68(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 8(%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 28(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + andl %eax,%ebx + addl 72(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 4(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 12(%esp),%edx,%esi + xorl %edi,%ecx + andl 8(%esp),%edx + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 24(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + andl %ebx,%eax + addl 76(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl (%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 20(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + andl %eax,%ebx + addl 80(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 28(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 4(%esp),%edx,%esi + xorl %edi,%ecx + andl (%esp),%edx + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 16(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + andl %ebx,%eax + addl 84(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 24(%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 12(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + andl %eax,%ebx + addl 88(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 20(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 28(%esp),%edx,%esi + xorl %edi,%ecx + andl 24(%esp),%edx + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 8(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl (%esp),%edx + andl %ebx,%eax + addl 92(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 16(%esp),%edx + leal (%eax,%ecx,1),%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + vmovdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb .L017grand_avx_bmi movl 108(%esp),%esp + vzeroall popl %edi popl %esi popl %ebx |