Diffstat (limited to 'sys/crypto/openssl/amd64/rsaz-3k-avxifma.S')
-rw-r--r-- | sys/crypto/openssl/amd64/rsaz-3k-avxifma.S | 1769 |
1 file changed, 1769 insertions, 0 deletions
diff --git a/sys/crypto/openssl/amd64/rsaz-3k-avxifma.S b/sys/crypto/openssl/amd64/rsaz-3k-avxifma.S new file mode 100644 index 000000000000..5d9f97d52bc1 --- /dev/null +++ b/sys/crypto/openssl/amd64/rsaz-3k-avxifma.S @@ -0,0 +1,1769 @@ +/* Do not modify. This file is auto-generated from rsaz-3k-avxifma.pl. */ +.text + +.globl ossl_rsaz_amm52x30_x1_avxifma256 +.type ossl_rsaz_amm52x30_x1_avxifma256,@function +.align 32 +ossl_rsaz_amm52x30_x1_avxifma256: +.cfi_startproc +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + vpxor %ymm0,%ymm0,%ymm0 + vmovapd %ymm0,%ymm3 + vmovapd %ymm0,%ymm4 + vmovapd %ymm0,%ymm5 + vmovapd %ymm0,%ymm6 + vmovapd %ymm0,%ymm7 + vmovapd %ymm0,%ymm8 + vmovapd %ymm0,%ymm9 + vmovapd %ymm0,%ymm10 + + xorl %r9d,%r9d + + movq %rdx,%r11 + movq $0xfffffffffffff,%rax + + + movl $7,%ebx + +.align 32 +.Lloop7: + movq 0(%r11),%r13 + + vpbroadcastq 0(%r11),%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vmovq %r13,%xmm2 + vpbroadcastq %xmm2,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + leaq -264(%rsp),%rsp + +{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3 +{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4 +{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5 +{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6 +{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7 +{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8 +{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9 +{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10 + +{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3 +{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4 +{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5 +{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6 +{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7 +{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8 +{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9 +{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + vmovdqu %ymm3,0(%rsp) + vmovdqu %ymm4,32(%rsp) + vmovdqu %ymm5,64(%rsp) + vmovdqu %ymm6,96(%rsp) + vmovdqu %ymm7,128(%rsp) + vmovdqu %ymm8,160(%rsp) + vmovdqu %ymm9,192(%rsp) + vmovdqu %ymm10,224(%rsp) + movq $0,256(%rsp) + + vmovdqu 8(%rsp),%ymm3 + vmovdqu 40(%rsp),%ymm4 + vmovdqu 72(%rsp),%ymm5 + vmovdqu 104(%rsp),%ymm6 + vmovdqu 136(%rsp),%ymm7 + vmovdqu 168(%rsp),%ymm8 + vmovdqu 200(%rsp),%ymm9 + vmovdqu 232(%rsp),%ymm10 + + addq 8(%rsp),%r9 + +{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3 +{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4 +{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5 +{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6 +{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7 +{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8 +{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9 +{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10 + +{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3 +{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4 +{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5 +{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6 +{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7 +{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8 +{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9 +{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10 + + leaq 264(%rsp),%rsp + movq 8(%r11),%r13 + + vpbroadcastq 8(%r11),%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vmovq 
%r13,%xmm2 + vpbroadcastq %xmm2,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + leaq -264(%rsp),%rsp + +{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3 +{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4 +{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5 +{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6 +{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7 +{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8 +{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9 +{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10 + +{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3 +{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4 +{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5 +{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6 +{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7 +{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8 +{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9 +{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + vmovdqu %ymm3,0(%rsp) + vmovdqu %ymm4,32(%rsp) + vmovdqu %ymm5,64(%rsp) + vmovdqu %ymm6,96(%rsp) + vmovdqu %ymm7,128(%rsp) + vmovdqu %ymm8,160(%rsp) + vmovdqu %ymm9,192(%rsp) + vmovdqu %ymm10,224(%rsp) + movq $0,256(%rsp) + + vmovdqu 8(%rsp),%ymm3 + vmovdqu 40(%rsp),%ymm4 + vmovdqu 72(%rsp),%ymm5 + vmovdqu 104(%rsp),%ymm6 + vmovdqu 136(%rsp),%ymm7 + vmovdqu 168(%rsp),%ymm8 + vmovdqu 200(%rsp),%ymm9 + vmovdqu 232(%rsp),%ymm10 + + addq 8(%rsp),%r9 + +{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3 +{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4 +{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5 +{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6 +{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7 +{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8 +{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9 +{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10 + +{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3 +{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4 +{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5 +{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6 +{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7 +{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8 +{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9 +{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10 + + leaq 264(%rsp),%rsp + movq 16(%r11),%r13 + + vpbroadcastq 16(%r11),%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vmovq %r13,%xmm2 + vpbroadcastq %xmm2,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + leaq -264(%rsp),%rsp + +{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3 +{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4 +{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5 +{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6 +{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7 +{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8 +{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9 +{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10 + +{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3 +{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4 +{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5 +{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6 +{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7 +{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8 +{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9 +{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + vmovdqu %ymm3,0(%rsp) + vmovdqu %ymm4,32(%rsp) + vmovdqu %ymm5,64(%rsp) + vmovdqu %ymm6,96(%rsp) + vmovdqu %ymm7,128(%rsp) + vmovdqu %ymm8,160(%rsp) + vmovdqu %ymm9,192(%rsp) + vmovdqu %ymm10,224(%rsp) + movq $0,256(%rsp) + + vmovdqu 8(%rsp),%ymm3 + vmovdqu 40(%rsp),%ymm4 + vmovdqu 72(%rsp),%ymm5 + vmovdqu 104(%rsp),%ymm6 + vmovdqu 136(%rsp),%ymm7 + vmovdqu 168(%rsp),%ymm8 + vmovdqu 200(%rsp),%ymm9 + vmovdqu 232(%rsp),%ymm10 + + addq 8(%rsp),%r9 + +{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3 +{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4 
+{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5 +{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6 +{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7 +{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8 +{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9 +{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10 + +{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3 +{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4 +{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5 +{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6 +{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7 +{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8 +{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9 +{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10 + + leaq 264(%rsp),%rsp + movq 24(%r11),%r13 + + vpbroadcastq 24(%r11),%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vmovq %r13,%xmm2 + vpbroadcastq %xmm2,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + leaq -264(%rsp),%rsp + +{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3 +{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4 +{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5 +{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6 +{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7 +{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8 +{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9 +{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10 + +{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3 +{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4 +{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5 +{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6 +{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7 +{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8 +{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9 +{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + vmovdqu %ymm3,0(%rsp) + vmovdqu %ymm4,32(%rsp) + vmovdqu %ymm5,64(%rsp) + vmovdqu %ymm6,96(%rsp) + vmovdqu %ymm7,128(%rsp) + vmovdqu %ymm8,160(%rsp) + vmovdqu %ymm9,192(%rsp) + vmovdqu %ymm10,224(%rsp) + movq $0,256(%rsp) + + vmovdqu 8(%rsp),%ymm3 + vmovdqu 40(%rsp),%ymm4 + vmovdqu 72(%rsp),%ymm5 + vmovdqu 104(%rsp),%ymm6 + vmovdqu 136(%rsp),%ymm7 + vmovdqu 168(%rsp),%ymm8 + vmovdqu 200(%rsp),%ymm9 + vmovdqu 232(%rsp),%ymm10 + + addq 8(%rsp),%r9 + +{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3 +{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4 +{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5 +{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6 +{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7 +{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8 +{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9 +{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10 + +{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3 +{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4 +{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5 +{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6 +{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7 +{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8 +{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9 +{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10 + + leaq 264(%rsp),%rsp + leaq 32(%r11),%r11 + decl %ebx + jne .Lloop7 + movq 0(%r11),%r13 + + vpbroadcastq 0(%r11),%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vmovq %r13,%xmm2 + vpbroadcastq %xmm2,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + leaq -264(%rsp),%rsp + +{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3 +{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4 +{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5 +{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6 +{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7 +{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8 +{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9 +{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10 + +{vex} 
vpmadd52luq 0(%rcx),%ymm2,%ymm3 +{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4 +{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5 +{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6 +{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7 +{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8 +{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9 +{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + vmovdqu %ymm3,0(%rsp) + vmovdqu %ymm4,32(%rsp) + vmovdqu %ymm5,64(%rsp) + vmovdqu %ymm6,96(%rsp) + vmovdqu %ymm7,128(%rsp) + vmovdqu %ymm8,160(%rsp) + vmovdqu %ymm9,192(%rsp) + vmovdqu %ymm10,224(%rsp) + movq $0,256(%rsp) + + vmovdqu 8(%rsp),%ymm3 + vmovdqu 40(%rsp),%ymm4 + vmovdqu 72(%rsp),%ymm5 + vmovdqu 104(%rsp),%ymm6 + vmovdqu 136(%rsp),%ymm7 + vmovdqu 168(%rsp),%ymm8 + vmovdqu 200(%rsp),%ymm9 + vmovdqu 232(%rsp),%ymm10 + + addq 8(%rsp),%r9 + +{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3 +{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4 +{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5 +{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6 +{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7 +{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8 +{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9 +{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10 + +{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3 +{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4 +{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5 +{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6 +{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7 +{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8 +{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9 +{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10 + + leaq 264(%rsp),%rsp + movq 8(%r11),%r13 + + vpbroadcastq 8(%r11),%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq %r8,%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vmovq %r13,%xmm2 + vpbroadcastq %xmm2,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + leaq -264(%rsp),%rsp + +{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3 +{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4 +{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5 +{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6 +{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7 +{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8 +{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9 +{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10 + +{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3 +{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4 +{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5 +{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6 +{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7 +{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8 +{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9 +{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + vmovdqu %ymm3,0(%rsp) + vmovdqu %ymm4,32(%rsp) + vmovdqu %ymm5,64(%rsp) + vmovdqu %ymm6,96(%rsp) + vmovdqu %ymm7,128(%rsp) + vmovdqu %ymm8,160(%rsp) + vmovdqu %ymm9,192(%rsp) + vmovdqu %ymm10,224(%rsp) + movq $0,256(%rsp) + + vmovdqu 8(%rsp),%ymm3 + vmovdqu 40(%rsp),%ymm4 + vmovdqu 72(%rsp),%ymm5 + vmovdqu 104(%rsp),%ymm6 + vmovdqu 136(%rsp),%ymm7 + vmovdqu 168(%rsp),%ymm8 + vmovdqu 200(%rsp),%ymm9 + vmovdqu 232(%rsp),%ymm10 + + addq 8(%rsp),%r9 + +{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3 +{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4 +{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5 +{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6 +{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7 +{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8 +{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9 +{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10 + +{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3 +{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4 +{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5 +{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6 +{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7 +{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8 +{vex} vpmadd52huq 
192(%rcx),%ymm2,%ymm9 +{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10 + + leaq 264(%rsp),%rsp + + vmovq %r9,%xmm0 + vpbroadcastq %xmm0,%ymm0 + vpblendd $3,%ymm0,%ymm3,%ymm3 + + + + vpsrlq $52,%ymm3,%ymm0 + vpsrlq $52,%ymm4,%ymm1 + vpsrlq $52,%ymm5,%ymm2 + vpsrlq $52,%ymm6,%ymm11 + vpsrlq $52,%ymm7,%ymm12 + vpsrlq $52,%ymm8,%ymm13 + vpsrlq $52,%ymm9,%ymm14 + vpsrlq $52,%ymm10,%ymm15 + + leaq -32(%rsp),%rsp + vmovupd %ymm3,(%rsp) + + + vpermq $144,%ymm15,%ymm15 + vpermq $3,%ymm14,%ymm3 + vblendpd $1,%ymm3,%ymm15,%ymm15 + + vpermq $144,%ymm14,%ymm14 + vpermq $3,%ymm13,%ymm3 + vblendpd $1,%ymm3,%ymm14,%ymm14 + + vpermq $144,%ymm13,%ymm13 + vpermq $3,%ymm12,%ymm3 + vblendpd $1,%ymm3,%ymm13,%ymm13 + + vpermq $144,%ymm12,%ymm12 + vpermq $3,%ymm11,%ymm3 + vblendpd $1,%ymm3,%ymm12,%ymm12 + + vpermq $144,%ymm11,%ymm11 + vpermq $3,%ymm2,%ymm3 + vblendpd $1,%ymm3,%ymm11,%ymm11 + + vpermq $144,%ymm2,%ymm2 + vpermq $3,%ymm1,%ymm3 + vblendpd $1,%ymm3,%ymm2,%ymm2 + + vpermq $144,%ymm1,%ymm1 + vpermq $3,%ymm0,%ymm3 + vblendpd $1,%ymm3,%ymm1,%ymm1 + + vpermq $144,%ymm0,%ymm0 + vpand .Lhigh64x3(%rip),%ymm0,%ymm0 + + vmovupd (%rsp),%ymm3 + leaq 32(%rsp),%rsp + + + vpand .Lmask52x4(%rip),%ymm3,%ymm3 + vpand .Lmask52x4(%rip),%ymm4,%ymm4 + vpand .Lmask52x4(%rip),%ymm5,%ymm5 + vpand .Lmask52x4(%rip),%ymm6,%ymm6 + vpand .Lmask52x4(%rip),%ymm7,%ymm7 + vpand .Lmask52x4(%rip),%ymm8,%ymm8 + vpand .Lmask52x4(%rip),%ymm9,%ymm9 + vpand .Lmask52x4(%rip),%ymm10,%ymm10 + + + vpaddq %ymm0,%ymm3,%ymm3 + vpaddq %ymm1,%ymm4,%ymm4 + vpaddq %ymm2,%ymm5,%ymm5 + vpaddq %ymm11,%ymm6,%ymm6 + vpaddq %ymm12,%ymm7,%ymm7 + vpaddq %ymm13,%ymm8,%ymm8 + vpaddq %ymm14,%ymm9,%ymm9 + vpaddq %ymm15,%ymm10,%ymm10 + + + + vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm0 + vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm1 + vmovmskpd %ymm0,%r14d + vmovmskpd %ymm1,%r13d + shlb $4,%r13b + orb %r13b,%r14b + + vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm2 + vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm11 + vmovmskpd %ymm2,%r13d + vmovmskpd %ymm11,%r12d + shlb $4,%r12b + orb %r12b,%r13b + + vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm12 + vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13 + vmovmskpd %ymm12,%r12d + vmovmskpd %ymm13,%r11d + shlb $4,%r11b + orb %r11b,%r12b + + vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm14 + vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm15 + vmovmskpd %ymm14,%r11d + vmovmskpd %ymm15,%r10d + shlb $4,%r10b + orb %r10b,%r11b + + addb %r14b,%r14b + adcb %r13b,%r13b + adcb %r12b,%r12b + adcb %r11b,%r11b + + + vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm0 + vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm1 + vmovmskpd %ymm0,%r9d + vmovmskpd %ymm1,%r8d + shlb $4,%r8b + orb %r8b,%r9b + + vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm2 + vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm11 + vmovmskpd %ymm2,%r8d + vmovmskpd %ymm11,%edx + shlb $4,%dl + orb %dl,%r8b + + vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm12 + vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13 + vmovmskpd %ymm12,%edx + vmovmskpd %ymm13,%ecx + shlb $4,%cl + orb %cl,%dl + + vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm14 + vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm15 + vmovmskpd %ymm14,%ecx + vmovmskpd %ymm15,%ebx + shlb $4,%bl + orb %bl,%cl + + addb %r9b,%r14b + adcb %r8b,%r13b + adcb %dl,%r12b + adcb %cl,%r11b + + xorb %r9b,%r14b + xorb %r8b,%r13b + xorb %dl,%r12b + xorb %cl,%r11b + + leaq .Lkmasklut(%rip),%rdx + + movb %r14b,%r10b + andq $0xf,%r14 + vpsubq .Lmask52x4(%rip),%ymm3,%ymm0 + shlq $5,%r14 + vmovapd (%rdx,%r14,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm3,%ymm3 + + shrb $4,%r10b + andq $0xf,%r10 + vpsubq .Lmask52x4(%rip),%ymm4,%ymm0 + shlq $5,%r10 + vmovapd (%rdx,%r10,1),%ymm2 + vblendvpd 
%ymm2,%ymm0,%ymm4,%ymm4 + + movb %r13b,%r10b + andq $0xf,%r13 + vpsubq .Lmask52x4(%rip),%ymm5,%ymm0 + shlq $5,%r13 + vmovapd (%rdx,%r13,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm5,%ymm5 + + shrb $4,%r10b + andq $0xf,%r10 + vpsubq .Lmask52x4(%rip),%ymm6,%ymm0 + shlq $5,%r10 + vmovapd (%rdx,%r10,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm6,%ymm6 + + movb %r12b,%r10b + andq $0xf,%r12 + vpsubq .Lmask52x4(%rip),%ymm7,%ymm0 + shlq $5,%r12 + vmovapd (%rdx,%r12,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm7,%ymm7 + + shrb $4,%r10b + andq $0xf,%r10 + vpsubq .Lmask52x4(%rip),%ymm8,%ymm0 + shlq $5,%r10 + vmovapd (%rdx,%r10,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm8,%ymm8 + + movb %r11b,%r10b + andq $0xf,%r11 + vpsubq .Lmask52x4(%rip),%ymm9,%ymm0 + shlq $5,%r11 + vmovapd (%rdx,%r11,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm9,%ymm9 + + shrb $4,%r10b + andq $0xf,%r10 + vpsubq .Lmask52x4(%rip),%ymm10,%ymm0 + shlq $5,%r10 + vmovapd (%rdx,%r10,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm10,%ymm10 + + vpand .Lmask52x4(%rip),%ymm3,%ymm3 + vpand .Lmask52x4(%rip),%ymm4,%ymm4 + vpand .Lmask52x4(%rip),%ymm5,%ymm5 + vpand .Lmask52x4(%rip),%ymm6,%ymm6 + vpand .Lmask52x4(%rip),%ymm7,%ymm7 + vpand .Lmask52x4(%rip),%ymm8,%ymm8 + vpand .Lmask52x4(%rip),%ymm9,%ymm9 + + vpand .Lmask52x4(%rip),%ymm10,%ymm10 + + vmovdqu %ymm3,0(%rdi) + vmovdqu %ymm4,32(%rdi) + vmovdqu %ymm5,64(%rdi) + vmovdqu %ymm6,96(%rdi) + vmovdqu %ymm7,128(%rdi) + vmovdqu %ymm8,160(%rdi) + vmovdqu %ymm9,192(%rdi) + vmovdqu %ymm10,224(%rdi) + + vzeroupper + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + movq 0(%rax),%r15 +.cfi_restore %r15 + movq 8(%rax),%r14 +.cfi_restore %r14 + movq 16(%rax),%r13 +.cfi_restore %r13 + movq 24(%rax),%r12 +.cfi_restore %r12 + movq 32(%rax),%rbp +.cfi_restore %rbp + movq 40(%rax),%rbx +.cfi_restore %rbx + leaq 48(%rax),%rsp +.cfi_def_cfa %rsp,8 +.Lossl_rsaz_amm52x30_x1_avxifma256_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_rsaz_amm52x30_x1_avxifma256, .-ossl_rsaz_amm52x30_x1_avxifma256 +.section .rodata +.align 32 +.Lmask52x4: +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.quad 0xfffffffffffff +.Lhigh64x3: +.quad 0x0 +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.Lkmasklut: + +.quad 0x0 +.quad 0x0 +.quad 0x0 +.quad 0x0 + +.quad 0xffffffffffffffff +.quad 0x0 +.quad 0x0 +.quad 0x0 + +.quad 0x0 +.quad 0xffffffffffffffff +.quad 0x0 +.quad 0x0 + +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.quad 0x0 +.quad 0x0 + +.quad 0x0 +.quad 0x0 +.quad 0xffffffffffffffff +.quad 0x0 + +.quad 0xffffffffffffffff +.quad 0x0 +.quad 0xffffffffffffffff +.quad 0x0 + +.quad 0x0 +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.quad 0x0 + +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.quad 0x0 + +.quad 0x0 +.quad 0x0 +.quad 0x0 +.quad 0xffffffffffffffff + +.quad 0xffffffffffffffff +.quad 0x0 +.quad 0x0 +.quad 0xffffffffffffffff + +.quad 0x0 +.quad 0xffffffffffffffff +.quad 0x0 +.quad 0xffffffffffffffff + +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.quad 0x0 +.quad 0xffffffffffffffff + +.quad 0x0 +.quad 0x0 +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff + +.quad 0xffffffffffffffff +.quad 0x0 +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff + +.quad 0x0 +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff + +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.quad 0xffffffffffffffff +.text + +.globl ossl_rsaz_amm52x30_x2_avxifma256 +.type ossl_rsaz_amm52x30_x2_avxifma256,@function +.align 32 
+ossl_rsaz_amm52x30_x2_avxifma256: +.cfi_startproc +.byte 243,15,30,250 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + vpxor %ymm0,%ymm0,%ymm0 + vmovapd %ymm0,%ymm3 + vmovapd %ymm0,%ymm4 + vmovapd %ymm0,%ymm5 + vmovapd %ymm0,%ymm6 + vmovapd %ymm0,%ymm7 + vmovapd %ymm0,%ymm8 + vmovapd %ymm0,%ymm9 + vmovapd %ymm0,%ymm10 + + xorl %r9d,%r9d + + movq %rdx,%r11 + movq $0xfffffffffffff,%rax + + movl $30,%ebx + +.align 32 +.Lloop30: + movq 0(%r11),%r13 + + vpbroadcastq 0(%r11),%ymm1 + movq 0(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq (%r8),%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vmovq %r13,%xmm2 + vpbroadcastq %xmm2,%ymm2 + movq 0(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + leaq -264(%rsp),%rsp + +{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3 +{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4 +{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5 +{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6 +{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7 +{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8 +{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9 +{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10 + +{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3 +{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4 +{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5 +{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6 +{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7 +{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8 +{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9 +{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10 + + + vmovdqu %ymm3,0(%rsp) + vmovdqu %ymm4,32(%rsp) + vmovdqu %ymm5,64(%rsp) + vmovdqu %ymm6,96(%rsp) + vmovdqu %ymm7,128(%rsp) + vmovdqu %ymm8,160(%rsp) + vmovdqu %ymm9,192(%rsp) + vmovdqu %ymm10,224(%rsp) + movq $0,256(%rsp) + + vmovdqu 8(%rsp),%ymm3 + vmovdqu 40(%rsp),%ymm4 + vmovdqu 72(%rsp),%ymm5 + vmovdqu 104(%rsp),%ymm6 + vmovdqu 136(%rsp),%ymm7 + vmovdqu 168(%rsp),%ymm8 + vmovdqu 200(%rsp),%ymm9 + vmovdqu 232(%rsp),%ymm10 + + addq 8(%rsp),%r9 + +{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3 +{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4 +{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5 +{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6 +{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7 +{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8 +{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9 +{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10 + +{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3 +{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4 +{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5 +{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6 +{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7 +{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8 +{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9 +{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10 + + leaq 264(%rsp),%rsp + leaq 8(%r11),%r11 + decl %ebx + jne .Lloop30 + + pushq %r11 + pushq %rsi + pushq %rcx + pushq %r8 + + vmovq %r9,%xmm0 + vpbroadcastq %xmm0,%ymm0 + vpblendd $3,%ymm0,%ymm3,%ymm3 + + + + vpsrlq $52,%ymm3,%ymm0 + vpsrlq $52,%ymm4,%ymm1 + vpsrlq $52,%ymm5,%ymm2 + vpsrlq $52,%ymm6,%ymm11 + vpsrlq $52,%ymm7,%ymm12 + vpsrlq $52,%ymm8,%ymm13 + vpsrlq $52,%ymm9,%ymm14 + vpsrlq $52,%ymm10,%ymm15 + + leaq -32(%rsp),%rsp + vmovupd %ymm3,(%rsp) + + + vpermq $144,%ymm15,%ymm15 + vpermq $3,%ymm14,%ymm3 + vblendpd $1,%ymm3,%ymm15,%ymm15 + + vpermq $144,%ymm14,%ymm14 + vpermq $3,%ymm13,%ymm3 + vblendpd $1,%ymm3,%ymm14,%ymm14 + + 
vpermq $144,%ymm13,%ymm13 + vpermq $3,%ymm12,%ymm3 + vblendpd $1,%ymm3,%ymm13,%ymm13 + + vpermq $144,%ymm12,%ymm12 + vpermq $3,%ymm11,%ymm3 + vblendpd $1,%ymm3,%ymm12,%ymm12 + + vpermq $144,%ymm11,%ymm11 + vpermq $3,%ymm2,%ymm3 + vblendpd $1,%ymm3,%ymm11,%ymm11 + + vpermq $144,%ymm2,%ymm2 + vpermq $3,%ymm1,%ymm3 + vblendpd $1,%ymm3,%ymm2,%ymm2 + + vpermq $144,%ymm1,%ymm1 + vpermq $3,%ymm0,%ymm3 + vblendpd $1,%ymm3,%ymm1,%ymm1 + + vpermq $144,%ymm0,%ymm0 + vpand .Lhigh64x3(%rip),%ymm0,%ymm0 + + vmovupd (%rsp),%ymm3 + leaq 32(%rsp),%rsp + + + vpand .Lmask52x4(%rip),%ymm3,%ymm3 + vpand .Lmask52x4(%rip),%ymm4,%ymm4 + vpand .Lmask52x4(%rip),%ymm5,%ymm5 + vpand .Lmask52x4(%rip),%ymm6,%ymm6 + vpand .Lmask52x4(%rip),%ymm7,%ymm7 + vpand .Lmask52x4(%rip),%ymm8,%ymm8 + vpand .Lmask52x4(%rip),%ymm9,%ymm9 + vpand .Lmask52x4(%rip),%ymm10,%ymm10 + + + vpaddq %ymm0,%ymm3,%ymm3 + vpaddq %ymm1,%ymm4,%ymm4 + vpaddq %ymm2,%ymm5,%ymm5 + vpaddq %ymm11,%ymm6,%ymm6 + vpaddq %ymm12,%ymm7,%ymm7 + vpaddq %ymm13,%ymm8,%ymm8 + vpaddq %ymm14,%ymm9,%ymm9 + vpaddq %ymm15,%ymm10,%ymm10 + + + + vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm0 + vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm1 + vmovmskpd %ymm0,%r14d + vmovmskpd %ymm1,%r13d + shlb $4,%r13b + orb %r13b,%r14b + + vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm2 + vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm11 + vmovmskpd %ymm2,%r13d + vmovmskpd %ymm11,%r12d + shlb $4,%r12b + orb %r12b,%r13b + + vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm12 + vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13 + vmovmskpd %ymm12,%r12d + vmovmskpd %ymm13,%r11d + shlb $4,%r11b + orb %r11b,%r12b + + vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm14 + vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm15 + vmovmskpd %ymm14,%r11d + vmovmskpd %ymm15,%r10d + shlb $4,%r10b + orb %r10b,%r11b + + addb %r14b,%r14b + adcb %r13b,%r13b + adcb %r12b,%r12b + adcb %r11b,%r11b + + + vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm0 + vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm1 + vmovmskpd %ymm0,%r9d + vmovmskpd %ymm1,%r8d + shlb $4,%r8b + orb %r8b,%r9b + + vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm2 + vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm11 + vmovmskpd %ymm2,%r8d + vmovmskpd %ymm11,%edx + shlb $4,%dl + orb %dl,%r8b + + vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm12 + vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13 + vmovmskpd %ymm12,%edx + vmovmskpd %ymm13,%ecx + shlb $4,%cl + orb %cl,%dl + + vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm14 + vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm15 + vmovmskpd %ymm14,%ecx + vmovmskpd %ymm15,%ebx + shlb $4,%bl + orb %bl,%cl + + addb %r9b,%r14b + adcb %r8b,%r13b + adcb %dl,%r12b + adcb %cl,%r11b + + xorb %r9b,%r14b + xorb %r8b,%r13b + xorb %dl,%r12b + xorb %cl,%r11b + + leaq .Lkmasklut(%rip),%rdx + + movb %r14b,%r10b + andq $0xf,%r14 + vpsubq .Lmask52x4(%rip),%ymm3,%ymm0 + shlq $5,%r14 + vmovapd (%rdx,%r14,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm3,%ymm3 + + shrb $4,%r10b + andq $0xf,%r10 + vpsubq .Lmask52x4(%rip),%ymm4,%ymm0 + shlq $5,%r10 + vmovapd (%rdx,%r10,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm4,%ymm4 + + movb %r13b,%r10b + andq $0xf,%r13 + vpsubq .Lmask52x4(%rip),%ymm5,%ymm0 + shlq $5,%r13 + vmovapd (%rdx,%r13,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm5,%ymm5 + + shrb $4,%r10b + andq $0xf,%r10 + vpsubq .Lmask52x4(%rip),%ymm6,%ymm0 + shlq $5,%r10 + vmovapd (%rdx,%r10,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm6,%ymm6 + + movb %r12b,%r10b + andq $0xf,%r12 + vpsubq .Lmask52x4(%rip),%ymm7,%ymm0 + shlq $5,%r12 + vmovapd (%rdx,%r12,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm7,%ymm7 + + shrb $4,%r10b + andq $0xf,%r10 + vpsubq .Lmask52x4(%rip),%ymm8,%ymm0 + shlq $5,%r10 + vmovapd (%rdx,%r10,1),%ymm2 + vblendvpd 
%ymm2,%ymm0,%ymm8,%ymm8 + + movb %r11b,%r10b + andq $0xf,%r11 + vpsubq .Lmask52x4(%rip),%ymm9,%ymm0 + shlq $5,%r11 + vmovapd (%rdx,%r11,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm9,%ymm9 + + shrb $4,%r10b + andq $0xf,%r10 + vpsubq .Lmask52x4(%rip),%ymm10,%ymm0 + shlq $5,%r10 + vmovapd (%rdx,%r10,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm10,%ymm10 + + vpand .Lmask52x4(%rip),%ymm3,%ymm3 + vpand .Lmask52x4(%rip),%ymm4,%ymm4 + vpand .Lmask52x4(%rip),%ymm5,%ymm5 + vpand .Lmask52x4(%rip),%ymm6,%ymm6 + vpand .Lmask52x4(%rip),%ymm7,%ymm7 + vpand .Lmask52x4(%rip),%ymm8,%ymm8 + vpand .Lmask52x4(%rip),%ymm9,%ymm9 + + vpand .Lmask52x4(%rip),%ymm10,%ymm10 + popq %r8 + popq %rcx + popq %rsi + popq %r11 + + vmovdqu %ymm3,0(%rdi) + vmovdqu %ymm4,32(%rdi) + vmovdqu %ymm5,64(%rdi) + vmovdqu %ymm6,96(%rdi) + vmovdqu %ymm7,128(%rdi) + vmovdqu %ymm8,160(%rdi) + vmovdqu %ymm9,192(%rdi) + vmovdqu %ymm10,224(%rdi) + + xorl %r15d,%r15d + + leaq 16(%r11),%r11 + movq $0xfffffffffffff,%rax + + movl $30,%ebx + + vpxor %ymm0,%ymm0,%ymm0 + vmovapd %ymm0,%ymm3 + vmovapd %ymm0,%ymm4 + vmovapd %ymm0,%ymm5 + vmovapd %ymm0,%ymm6 + vmovapd %ymm0,%ymm7 + vmovapd %ymm0,%ymm8 + vmovapd %ymm0,%ymm9 + vmovapd %ymm0,%ymm10 +.align 32 +.Lloop40: + movq 0(%r11),%r13 + + vpbroadcastq 0(%r11),%ymm1 + movq 256(%rsi),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + movq %r12,%r10 + adcq $0,%r10 + + movq 8(%r8),%r13 + imulq %r9,%r13 + andq %rax,%r13 + + vmovq %r13,%xmm2 + vpbroadcastq %xmm2,%ymm2 + movq 256(%rcx),%rdx + mulxq %r13,%r13,%r12 + addq %r13,%r9 + adcq %r12,%r10 + + shrq $52,%r9 + salq $12,%r10 + orq %r10,%r9 + + leaq -264(%rsp),%rsp + +{vex} vpmadd52luq 256(%rsi),%ymm1,%ymm3 +{vex} vpmadd52luq 288(%rsi),%ymm1,%ymm4 +{vex} vpmadd52luq 320(%rsi),%ymm1,%ymm5 +{vex} vpmadd52luq 352(%rsi),%ymm1,%ymm6 +{vex} vpmadd52luq 384(%rsi),%ymm1,%ymm7 +{vex} vpmadd52luq 416(%rsi),%ymm1,%ymm8 +{vex} vpmadd52luq 448(%rsi),%ymm1,%ymm9 +{vex} vpmadd52luq 480(%rsi),%ymm1,%ymm10 + +{vex} vpmadd52luq 256(%rcx),%ymm2,%ymm3 +{vex} vpmadd52luq 288(%rcx),%ymm2,%ymm4 +{vex} vpmadd52luq 320(%rcx),%ymm2,%ymm5 +{vex} vpmadd52luq 352(%rcx),%ymm2,%ymm6 +{vex} vpmadd52luq 384(%rcx),%ymm2,%ymm7 +{vex} vpmadd52luq 416(%rcx),%ymm2,%ymm8 +{vex} vpmadd52luq 448(%rcx),%ymm2,%ymm9 +{vex} vpmadd52luq 480(%rcx),%ymm2,%ymm10 + + + vmovdqu %ymm3,0(%rsp) + vmovdqu %ymm4,32(%rsp) + vmovdqu %ymm5,64(%rsp) + vmovdqu %ymm6,96(%rsp) + vmovdqu %ymm7,128(%rsp) + vmovdqu %ymm8,160(%rsp) + vmovdqu %ymm9,192(%rsp) + vmovdqu %ymm10,224(%rsp) + movq $0,256(%rsp) + + vmovdqu 8(%rsp),%ymm3 + vmovdqu 40(%rsp),%ymm4 + vmovdqu 72(%rsp),%ymm5 + vmovdqu 104(%rsp),%ymm6 + vmovdqu 136(%rsp),%ymm7 + vmovdqu 168(%rsp),%ymm8 + vmovdqu 200(%rsp),%ymm9 + vmovdqu 232(%rsp),%ymm10 + + addq 8(%rsp),%r9 + +{vex} vpmadd52huq 256(%rsi),%ymm1,%ymm3 +{vex} vpmadd52huq 288(%rsi),%ymm1,%ymm4 +{vex} vpmadd52huq 320(%rsi),%ymm1,%ymm5 +{vex} vpmadd52huq 352(%rsi),%ymm1,%ymm6 +{vex} vpmadd52huq 384(%rsi),%ymm1,%ymm7 +{vex} vpmadd52huq 416(%rsi),%ymm1,%ymm8 +{vex} vpmadd52huq 448(%rsi),%ymm1,%ymm9 +{vex} vpmadd52huq 480(%rsi),%ymm1,%ymm10 + +{vex} vpmadd52huq 256(%rcx),%ymm2,%ymm3 +{vex} vpmadd52huq 288(%rcx),%ymm2,%ymm4 +{vex} vpmadd52huq 320(%rcx),%ymm2,%ymm5 +{vex} vpmadd52huq 352(%rcx),%ymm2,%ymm6 +{vex} vpmadd52huq 384(%rcx),%ymm2,%ymm7 +{vex} vpmadd52huq 416(%rcx),%ymm2,%ymm8 +{vex} vpmadd52huq 448(%rcx),%ymm2,%ymm9 +{vex} vpmadd52huq 480(%rcx),%ymm2,%ymm10 + + leaq 264(%rsp),%rsp + leaq 8(%r11),%r11 + decl %ebx + jne .Lloop40 + + vmovq %r9,%xmm0 + vpbroadcastq %xmm0,%ymm0 + vpblendd $3,%ymm0,%ymm3,%ymm3 + + + + vpsrlq 
$52,%ymm3,%ymm0 + vpsrlq $52,%ymm4,%ymm1 + vpsrlq $52,%ymm5,%ymm2 + vpsrlq $52,%ymm6,%ymm11 + vpsrlq $52,%ymm7,%ymm12 + vpsrlq $52,%ymm8,%ymm13 + vpsrlq $52,%ymm9,%ymm14 + vpsrlq $52,%ymm10,%ymm15 + + leaq -32(%rsp),%rsp + vmovupd %ymm3,(%rsp) + + + vpermq $144,%ymm15,%ymm15 + vpermq $3,%ymm14,%ymm3 + vblendpd $1,%ymm3,%ymm15,%ymm15 + + vpermq $144,%ymm14,%ymm14 + vpermq $3,%ymm13,%ymm3 + vblendpd $1,%ymm3,%ymm14,%ymm14 + + vpermq $144,%ymm13,%ymm13 + vpermq $3,%ymm12,%ymm3 + vblendpd $1,%ymm3,%ymm13,%ymm13 + + vpermq $144,%ymm12,%ymm12 + vpermq $3,%ymm11,%ymm3 + vblendpd $1,%ymm3,%ymm12,%ymm12 + + vpermq $144,%ymm11,%ymm11 + vpermq $3,%ymm2,%ymm3 + vblendpd $1,%ymm3,%ymm11,%ymm11 + + vpermq $144,%ymm2,%ymm2 + vpermq $3,%ymm1,%ymm3 + vblendpd $1,%ymm3,%ymm2,%ymm2 + + vpermq $144,%ymm1,%ymm1 + vpermq $3,%ymm0,%ymm3 + vblendpd $1,%ymm3,%ymm1,%ymm1 + + vpermq $144,%ymm0,%ymm0 + vpand .Lhigh64x3(%rip),%ymm0,%ymm0 + + vmovupd (%rsp),%ymm3 + leaq 32(%rsp),%rsp + + + vpand .Lmask52x4(%rip),%ymm3,%ymm3 + vpand .Lmask52x4(%rip),%ymm4,%ymm4 + vpand .Lmask52x4(%rip),%ymm5,%ymm5 + vpand .Lmask52x4(%rip),%ymm6,%ymm6 + vpand .Lmask52x4(%rip),%ymm7,%ymm7 + vpand .Lmask52x4(%rip),%ymm8,%ymm8 + vpand .Lmask52x4(%rip),%ymm9,%ymm9 + vpand .Lmask52x4(%rip),%ymm10,%ymm10 + + + vpaddq %ymm0,%ymm3,%ymm3 + vpaddq %ymm1,%ymm4,%ymm4 + vpaddq %ymm2,%ymm5,%ymm5 + vpaddq %ymm11,%ymm6,%ymm6 + vpaddq %ymm12,%ymm7,%ymm7 + vpaddq %ymm13,%ymm8,%ymm8 + vpaddq %ymm14,%ymm9,%ymm9 + vpaddq %ymm15,%ymm10,%ymm10 + + + + vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm0 + vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm1 + vmovmskpd %ymm0,%r14d + vmovmskpd %ymm1,%r13d + shlb $4,%r13b + orb %r13b,%r14b + + vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm2 + vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm11 + vmovmskpd %ymm2,%r13d + vmovmskpd %ymm11,%r12d + shlb $4,%r12b + orb %r12b,%r13b + + vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm12 + vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13 + vmovmskpd %ymm12,%r12d + vmovmskpd %ymm13,%r11d + shlb $4,%r11b + orb %r11b,%r12b + + vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm14 + vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm15 + vmovmskpd %ymm14,%r11d + vmovmskpd %ymm15,%r10d + shlb $4,%r10b + orb %r10b,%r11b + + addb %r14b,%r14b + adcb %r13b,%r13b + adcb %r12b,%r12b + adcb %r11b,%r11b + + + vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm0 + vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm1 + vmovmskpd %ymm0,%r9d + vmovmskpd %ymm1,%r8d + shlb $4,%r8b + orb %r8b,%r9b + + vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm2 + vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm11 + vmovmskpd %ymm2,%r8d + vmovmskpd %ymm11,%edx + shlb $4,%dl + orb %dl,%r8b + + vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm12 + vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13 + vmovmskpd %ymm12,%edx + vmovmskpd %ymm13,%ecx + shlb $4,%cl + orb %cl,%dl + + vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm14 + vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm15 + vmovmskpd %ymm14,%ecx + vmovmskpd %ymm15,%ebx + shlb $4,%bl + orb %bl,%cl + + addb %r9b,%r14b + adcb %r8b,%r13b + adcb %dl,%r12b + adcb %cl,%r11b + + xorb %r9b,%r14b + xorb %r8b,%r13b + xorb %dl,%r12b + xorb %cl,%r11b + + leaq .Lkmasklut(%rip),%rdx + + movb %r14b,%r10b + andq $0xf,%r14 + vpsubq .Lmask52x4(%rip),%ymm3,%ymm0 + shlq $5,%r14 + vmovapd (%rdx,%r14,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm3,%ymm3 + + shrb $4,%r10b + andq $0xf,%r10 + vpsubq .Lmask52x4(%rip),%ymm4,%ymm0 + shlq $5,%r10 + vmovapd (%rdx,%r10,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm4,%ymm4 + + movb %r13b,%r10b + andq $0xf,%r13 + vpsubq .Lmask52x4(%rip),%ymm5,%ymm0 + shlq $5,%r13 + vmovapd (%rdx,%r13,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm5,%ymm5 + + shrb $4,%r10b + 
andq $0xf,%r10 + vpsubq .Lmask52x4(%rip),%ymm6,%ymm0 + shlq $5,%r10 + vmovapd (%rdx,%r10,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm6,%ymm6 + + movb %r12b,%r10b + andq $0xf,%r12 + vpsubq .Lmask52x4(%rip),%ymm7,%ymm0 + shlq $5,%r12 + vmovapd (%rdx,%r12,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm7,%ymm7 + + shrb $4,%r10b + andq $0xf,%r10 + vpsubq .Lmask52x4(%rip),%ymm8,%ymm0 + shlq $5,%r10 + vmovapd (%rdx,%r10,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm8,%ymm8 + + movb %r11b,%r10b + andq $0xf,%r11 + vpsubq .Lmask52x4(%rip),%ymm9,%ymm0 + shlq $5,%r11 + vmovapd (%rdx,%r11,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm9,%ymm9 + + shrb $4,%r10b + andq $0xf,%r10 + vpsubq .Lmask52x4(%rip),%ymm10,%ymm0 + shlq $5,%r10 + vmovapd (%rdx,%r10,1),%ymm2 + vblendvpd %ymm2,%ymm0,%ymm10,%ymm10 + + vpand .Lmask52x4(%rip),%ymm3,%ymm3 + vpand .Lmask52x4(%rip),%ymm4,%ymm4 + vpand .Lmask52x4(%rip),%ymm5,%ymm5 + vpand .Lmask52x4(%rip),%ymm6,%ymm6 + vpand .Lmask52x4(%rip),%ymm7,%ymm7 + vpand .Lmask52x4(%rip),%ymm8,%ymm8 + vpand .Lmask52x4(%rip),%ymm9,%ymm9 + + vpand .Lmask52x4(%rip),%ymm10,%ymm10 + + vmovdqu %ymm3,256(%rdi) + vmovdqu %ymm4,288(%rdi) + vmovdqu %ymm5,320(%rdi) + vmovdqu %ymm6,352(%rdi) + vmovdqu %ymm7,384(%rdi) + vmovdqu %ymm8,416(%rdi) + vmovdqu %ymm9,448(%rdi) + vmovdqu %ymm10,480(%rdi) + + vzeroupper + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + movq 0(%rax),%r15 +.cfi_restore %r15 + movq 8(%rax),%r14 +.cfi_restore %r14 + movq 16(%rax),%r13 +.cfi_restore %r13 + movq 24(%rax),%r12 +.cfi_restore %r12 + movq 32(%rax),%rbp +.cfi_restore %rbp + movq 40(%rax),%rbx +.cfi_restore %rbx + leaq 48(%rax),%rsp +.cfi_def_cfa %rsp,8 +.Lossl_rsaz_amm52x30_x2_avxifma256_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_rsaz_amm52x30_x2_avxifma256, .-ossl_rsaz_amm52x30_x2_avxifma256 +.text + +.align 32 +.globl ossl_extract_multiplier_2x30_win5_avx +.type ossl_extract_multiplier_2x30_win5_avx,@function +ossl_extract_multiplier_2x30_win5_avx: +.cfi_startproc +.byte 243,15,30,250 + vmovapd .Lones(%rip),%ymm12 + vmovq %rdx,%xmm8 + vpbroadcastq %xmm8,%ymm10 + vmovq %rcx,%xmm8 + vpbroadcastq %xmm8,%ymm11 + leaq 16384(%rsi),%rax + + + vpxor %xmm0,%xmm0,%xmm0 + vmovapd %ymm0,%ymm9 + vmovapd %ymm0,%ymm1 + vmovapd %ymm0,%ymm2 + vmovapd %ymm0,%ymm3 + vmovapd %ymm0,%ymm4 + vmovapd %ymm0,%ymm5 + vmovapd %ymm0,%ymm6 + vmovapd %ymm0,%ymm7 + +.align 32 +.Lloop: + vpcmpeqq %ymm9,%ymm10,%ymm13 + vmovdqu 0(%rsi),%ymm8 + + vblendvpd %ymm13,%ymm8,%ymm0,%ymm0 + vmovdqu 32(%rsi),%ymm8 + + vblendvpd %ymm13,%ymm8,%ymm1,%ymm1 + vmovdqu 64(%rsi),%ymm8 + + vblendvpd %ymm13,%ymm8,%ymm2,%ymm2 + vmovdqu 96(%rsi),%ymm8 + + vblendvpd %ymm13,%ymm8,%ymm3,%ymm3 + vmovdqu 128(%rsi),%ymm8 + + vblendvpd %ymm13,%ymm8,%ymm4,%ymm4 + vmovdqu 160(%rsi),%ymm8 + + vblendvpd %ymm13,%ymm8,%ymm5,%ymm5 + vmovdqu 192(%rsi),%ymm8 + + vblendvpd %ymm13,%ymm8,%ymm6,%ymm6 + vmovdqu 224(%rsi),%ymm8 + + vblendvpd %ymm13,%ymm8,%ymm7,%ymm7 + vpaddq %ymm12,%ymm9,%ymm9 + addq $512,%rsi + cmpq %rsi,%rax + jne .Lloop + vmovdqu %ymm0,0(%rdi) + vmovdqu %ymm1,32(%rdi) + vmovdqu %ymm2,64(%rdi) + vmovdqu %ymm3,96(%rdi) + vmovdqu %ymm4,128(%rdi) + vmovdqu %ymm5,160(%rdi) + vmovdqu %ymm6,192(%rdi) + vmovdqu %ymm7,224(%rdi) + leaq -16384(%rax),%rsi + + + vpxor %xmm0,%xmm0,%xmm0 + vmovapd %ymm0,%ymm9 + vmovapd %ymm0,%ymm0 + vmovapd %ymm0,%ymm1 + vmovapd %ymm0,%ymm2 + vmovapd %ymm0,%ymm3 + vmovapd %ymm0,%ymm4 + vmovapd %ymm0,%ymm5 + vmovapd %ymm0,%ymm6 + vmovapd %ymm0,%ymm7 + +.align 32 +.Lloop_8_15: + vpcmpeqq %ymm9,%ymm11,%ymm13 + vmovdqu 256(%rsi),%ymm8 + + vblendvpd %ymm13,%ymm8,%ymm0,%ymm0 + vmovdqu 
288(%rsi),%ymm8 + + vblendvpd %ymm13,%ymm8,%ymm1,%ymm1 + vmovdqu 320(%rsi),%ymm8 + + vblendvpd %ymm13,%ymm8,%ymm2,%ymm2 + vmovdqu 352(%rsi),%ymm8 + + vblendvpd %ymm13,%ymm8,%ymm3,%ymm3 + vmovdqu 384(%rsi),%ymm8 + + vblendvpd %ymm13,%ymm8,%ymm4,%ymm4 + vmovdqu 416(%rsi),%ymm8 + + vblendvpd %ymm13,%ymm8,%ymm5,%ymm5 + vmovdqu 448(%rsi),%ymm8 + + vblendvpd %ymm13,%ymm8,%ymm6,%ymm6 + vmovdqu 480(%rsi),%ymm8 + + vblendvpd %ymm13,%ymm8,%ymm7,%ymm7 + vpaddq %ymm12,%ymm9,%ymm9 + addq $512,%rsi + cmpq %rsi,%rax + jne .Lloop_8_15 + vmovdqu %ymm0,256(%rdi) + vmovdqu %ymm1,288(%rdi) + vmovdqu %ymm2,320(%rdi) + vmovdqu %ymm3,352(%rdi) + vmovdqu %ymm4,384(%rdi) + vmovdqu %ymm5,416(%rdi) + vmovdqu %ymm6,448(%rdi) + vmovdqu %ymm7,480(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size ossl_extract_multiplier_2x30_win5_avx, .-ossl_extract_multiplier_2x30_win5_avx +.section .rodata +.align 32 +.Lones: +.quad 1,1,1,1 +.Lzeros: +.quad 0,0,0,0 + .section ".note.gnu.property", "a" + .p2align 3 + .long 1f - 0f + .long 4f - 1f + .long 5 +0: + # "GNU" encoded with .byte, since .asciz isn't supported + # on Solaris. + .byte 0x47 + .byte 0x4e + .byte 0x55 + .byte 0 +1: + .p2align 3 + .long 0xc0000002 + .long 3f - 2f +2: + .long 3 +3: + .p2align 3 +4: |
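Note on the imported kernels: `ossl_rsaz_amm52x30_x1_avxifma256` and `ossl_rsaz_amm52x30_x2_avxifma256` above implement 30-limb, radix-2^52 almost-Montgomery multiplication with AVX-IFMA, where `vpmadd52luq`/`vpmadd52huq` accumulate the low and high 52 bits of each 52x52-bit partial product (the file name suggests this serves the 3072-bit RSA path, with 30 limbs covering each 1536-bit CRT half). The following is a minimal scalar C sketch of that word-by-word algorithm for orientation only; it is not OpenSSL code, the names (`amm52_sketch`, `k0`, `LIMB_MASK`) are hypothetical, and it omits the final carry normalization and masked conditional subtraction that the assembly performs.

/*
 * Illustrative scalar sketch (not OpenSSL code) of radix-2^52
 * almost-Montgomery multiplication: res = a * b * 2^(-52*n) mod m,
 * up to one final subtraction of m. The AVX-IFMA kernels above compute
 * the same thing for n = 30, four 52-bit limbs per ymm register.
 * Requires a compiler with unsigned __int128 (GCC/Clang).
 */
#include <stdint.h>

typedef unsigned __int128 u128;

#define LIMB_BITS 52
#define LIMB_MASK ((1ULL << LIMB_BITS) - 1)
#define MAX_LIMBS 40                    /* enough for the 30-limb case */

/* a, b, m: n limbs of 52 bits each; k0 = -m[0]^(-1) mod 2^52 */
static void amm52_sketch(uint64_t *res, const uint64_t *a, const uint64_t *b,
                         const uint64_t *m, uint64_t k0, int n)
{
    uint64_t acc[MAX_LIMBS] = { 0 };

    for (int i = 0; i < n; i++) {
        /* reduction factor: y = (acc[0] + a[0]*b[i]) * k0 mod 2^52 */
        uint64_t y = ((acc[0] + a[0] * b[i]) * k0) & LIMB_MASK;
        u128 carry = 0;

        /* acc = (acc + a*b[i] + m*y) / 2^52, limb by limb */
        for (int j = 0; j < n; j++) {
            u128 t = (u128)acc[j] + (u128)a[j] * b[i] + (u128)m[j] * y + carry;
            if (j > 0)
                acc[j - 1] = (uint64_t)t & LIMB_MASK; /* limb 0 is zero by choice of y and is dropped */
            carry = t >> LIMB_BITS;
        }
        acc[n - 1] = (uint64_t)carry;   /* top limb may briefly exceed 52 bits */
    }

    /*
     * "Almost" Montgomery: the result is congruent to a*b*2^(-52n) mod m
     * but may still exceed m; the assembly finishes with carry
     * normalization and a masked conditional subtraction, omitted here.
     */
    for (int i = 0; i < n; i++)
        res[i] = acc[i];
}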