Diffstat (limited to 'sys/crypto/openssl/amd64/rsaz-3k-avxifma.S')
-rw-r--r--   sys/crypto/openssl/amd64/rsaz-3k-avxifma.S   1769
1 file changed, 1769 insertions, 0 deletions
diff --git a/sys/crypto/openssl/amd64/rsaz-3k-avxifma.S b/sys/crypto/openssl/amd64/rsaz-3k-avxifma.S
new file mode 100644
index 000000000000..5d9f97d52bc1
--- /dev/null
+++ b/sys/crypto/openssl/amd64/rsaz-3k-avxifma.S
@@ -0,0 +1,1769 @@
+/* Do not modify. This file is auto-generated from rsaz-3k-avxifma.pl. */
+.text
+
+.globl ossl_rsaz_amm52x30_x1_avxifma256
+.type ossl_rsaz_amm52x30_x1_avxifma256,@function
+.align 32
+ossl_rsaz_amm52x30_x1_avxifma256:
+.cfi_startproc
+.byte 243,15,30,250
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+
+ vpxor %ymm0,%ymm0,%ymm0
+ vmovapd %ymm0,%ymm3
+ vmovapd %ymm0,%ymm4
+ vmovapd %ymm0,%ymm5
+ vmovapd %ymm0,%ymm6
+ vmovapd %ymm0,%ymm7
+ vmovapd %ymm0,%ymm8
+ vmovapd %ymm0,%ymm9
+ vmovapd %ymm0,%ymm10
+
+ xorl %r9d,%r9d
+
+ movq %rdx,%r11
+ movq $0xfffffffffffff,%rax
+
+
+ movl $7,%ebx
+
+.align 32
+.Lloop7:
+ movq 0(%r11),%r13
+
+ vpbroadcastq 0(%r11),%ymm1
+ movq 0(%rsi),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ movq %r12,%r10
+ adcq $0,%r10
+
+ movq %r8,%r13
+ imulq %r9,%r13
+ andq %rax,%r13
+
+ vmovq %r13,%xmm2
+ vpbroadcastq %xmm2,%ymm2
+ movq 0(%rcx),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ adcq %r12,%r10
+
+ shrq $52,%r9
+ salq $12,%r10
+ orq %r10,%r9
+
+ leaq -264(%rsp),%rsp
+
+{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
+{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
+{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
+{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
+{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
+{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
+{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
+{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
+
+{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
+{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
+{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
+{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
+{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
+{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
+{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
+{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
+
+
+ vmovdqu %ymm3,0(%rsp)
+ vmovdqu %ymm4,32(%rsp)
+ vmovdqu %ymm5,64(%rsp)
+ vmovdqu %ymm6,96(%rsp)
+ vmovdqu %ymm7,128(%rsp)
+ vmovdqu %ymm8,160(%rsp)
+ vmovdqu %ymm9,192(%rsp)
+ vmovdqu %ymm10,224(%rsp)
+ movq $0,256(%rsp)
+
+ vmovdqu 8(%rsp),%ymm3
+ vmovdqu 40(%rsp),%ymm4
+ vmovdqu 72(%rsp),%ymm5
+ vmovdqu 104(%rsp),%ymm6
+ vmovdqu 136(%rsp),%ymm7
+ vmovdqu 168(%rsp),%ymm8
+ vmovdqu 200(%rsp),%ymm9
+ vmovdqu 232(%rsp),%ymm10
+
+ addq 8(%rsp),%r9
+
+{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
+{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
+{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
+{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
+{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
+{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
+{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
+{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
+
+{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
+{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
+{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
+{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
+{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
+{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
+{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
+{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
+
+ leaq 264(%rsp),%rsp
+ movq 8(%r11),%r13
+
+ vpbroadcastq 8(%r11),%ymm1
+ movq 0(%rsi),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ movq %r12,%r10
+ adcq $0,%r10
+
+ movq %r8,%r13
+ imulq %r9,%r13
+ andq %rax,%r13
+
+ vmovq %r13,%xmm2
+ vpbroadcastq %xmm2,%ymm2
+ movq 0(%rcx),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ adcq %r12,%r10
+
+ shrq $52,%r9
+ salq $12,%r10
+ orq %r10,%r9
+
+ leaq -264(%rsp),%rsp
+
+{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
+{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
+{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
+{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
+{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
+{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
+{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
+{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
+
+{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
+{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
+{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
+{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
+{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
+{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
+{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
+{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
+
+
+ vmovdqu %ymm3,0(%rsp)
+ vmovdqu %ymm4,32(%rsp)
+ vmovdqu %ymm5,64(%rsp)
+ vmovdqu %ymm6,96(%rsp)
+ vmovdqu %ymm7,128(%rsp)
+ vmovdqu %ymm8,160(%rsp)
+ vmovdqu %ymm9,192(%rsp)
+ vmovdqu %ymm10,224(%rsp)
+ movq $0,256(%rsp)
+
+ vmovdqu 8(%rsp),%ymm3
+ vmovdqu 40(%rsp),%ymm4
+ vmovdqu 72(%rsp),%ymm5
+ vmovdqu 104(%rsp),%ymm6
+ vmovdqu 136(%rsp),%ymm7
+ vmovdqu 168(%rsp),%ymm8
+ vmovdqu 200(%rsp),%ymm9
+ vmovdqu 232(%rsp),%ymm10
+
+ addq 8(%rsp),%r9
+
+{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
+{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
+{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
+{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
+{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
+{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
+{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
+{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
+
+{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
+{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
+{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
+{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
+{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
+{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
+{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
+{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
+
+ leaq 264(%rsp),%rsp
+ movq 16(%r11),%r13
+
+ vpbroadcastq 16(%r11),%ymm1
+ movq 0(%rsi),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ movq %r12,%r10
+ adcq $0,%r10
+
+ movq %r8,%r13
+ imulq %r9,%r13
+ andq %rax,%r13
+
+ vmovq %r13,%xmm2
+ vpbroadcastq %xmm2,%ymm2
+ movq 0(%rcx),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ adcq %r12,%r10
+
+ shrq $52,%r9
+ salq $12,%r10
+ orq %r10,%r9
+
+ leaq -264(%rsp),%rsp
+
+{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
+{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
+{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
+{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
+{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
+{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
+{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
+{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
+
+{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
+{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
+{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
+{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
+{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
+{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
+{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
+{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
+
+
+ vmovdqu %ymm3,0(%rsp)
+ vmovdqu %ymm4,32(%rsp)
+ vmovdqu %ymm5,64(%rsp)
+ vmovdqu %ymm6,96(%rsp)
+ vmovdqu %ymm7,128(%rsp)
+ vmovdqu %ymm8,160(%rsp)
+ vmovdqu %ymm9,192(%rsp)
+ vmovdqu %ymm10,224(%rsp)
+ movq $0,256(%rsp)
+
+ vmovdqu 8(%rsp),%ymm3
+ vmovdqu 40(%rsp),%ymm4
+ vmovdqu 72(%rsp),%ymm5
+ vmovdqu 104(%rsp),%ymm6
+ vmovdqu 136(%rsp),%ymm7
+ vmovdqu 168(%rsp),%ymm8
+ vmovdqu 200(%rsp),%ymm9
+ vmovdqu 232(%rsp),%ymm10
+
+ addq 8(%rsp),%r9
+
+{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
+{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
+{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
+{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
+{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
+{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
+{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
+{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
+
+{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
+{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
+{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
+{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
+{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
+{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
+{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
+{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
+
+ leaq 264(%rsp),%rsp
+ movq 24(%r11),%r13
+
+ vpbroadcastq 24(%r11),%ymm1
+ movq 0(%rsi),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ movq %r12,%r10
+ adcq $0,%r10
+
+ movq %r8,%r13
+ imulq %r9,%r13
+ andq %rax,%r13
+
+ vmovq %r13,%xmm2
+ vpbroadcastq %xmm2,%ymm2
+ movq 0(%rcx),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ adcq %r12,%r10
+
+ shrq $52,%r9
+ salq $12,%r10
+ orq %r10,%r9
+
+ leaq -264(%rsp),%rsp
+
+{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
+{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
+{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
+{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
+{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
+{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
+{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
+{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
+
+{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
+{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
+{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
+{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
+{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
+{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
+{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
+{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
+
+
+ vmovdqu %ymm3,0(%rsp)
+ vmovdqu %ymm4,32(%rsp)
+ vmovdqu %ymm5,64(%rsp)
+ vmovdqu %ymm6,96(%rsp)
+ vmovdqu %ymm7,128(%rsp)
+ vmovdqu %ymm8,160(%rsp)
+ vmovdqu %ymm9,192(%rsp)
+ vmovdqu %ymm10,224(%rsp)
+ movq $0,256(%rsp)
+
+ vmovdqu 8(%rsp),%ymm3
+ vmovdqu 40(%rsp),%ymm4
+ vmovdqu 72(%rsp),%ymm5
+ vmovdqu 104(%rsp),%ymm6
+ vmovdqu 136(%rsp),%ymm7
+ vmovdqu 168(%rsp),%ymm8
+ vmovdqu 200(%rsp),%ymm9
+ vmovdqu 232(%rsp),%ymm10
+
+ addq 8(%rsp),%r9
+
+{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
+{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
+{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
+{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
+{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
+{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
+{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
+{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
+
+{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
+{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
+{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
+{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
+{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
+{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
+{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
+{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
+
+ leaq 264(%rsp),%rsp
+ leaq 32(%r11),%r11
+ decl %ebx
+ jne .Lloop7
+ movq 0(%r11),%r13
+
+ vpbroadcastq 0(%r11),%ymm1
+ movq 0(%rsi),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ movq %r12,%r10
+ adcq $0,%r10
+
+ movq %r8,%r13
+ imulq %r9,%r13
+ andq %rax,%r13
+
+ vmovq %r13,%xmm2
+ vpbroadcastq %xmm2,%ymm2
+ movq 0(%rcx),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ adcq %r12,%r10
+
+ shrq $52,%r9
+ salq $12,%r10
+ orq %r10,%r9
+
+ leaq -264(%rsp),%rsp
+
+{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
+{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
+{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
+{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
+{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
+{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
+{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
+{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
+
+{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
+{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
+{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
+{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
+{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
+{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
+{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
+{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
+
+
+ vmovdqu %ymm3,0(%rsp)
+ vmovdqu %ymm4,32(%rsp)
+ vmovdqu %ymm5,64(%rsp)
+ vmovdqu %ymm6,96(%rsp)
+ vmovdqu %ymm7,128(%rsp)
+ vmovdqu %ymm8,160(%rsp)
+ vmovdqu %ymm9,192(%rsp)
+ vmovdqu %ymm10,224(%rsp)
+ movq $0,256(%rsp)
+
+ vmovdqu 8(%rsp),%ymm3
+ vmovdqu 40(%rsp),%ymm4
+ vmovdqu 72(%rsp),%ymm5
+ vmovdqu 104(%rsp),%ymm6
+ vmovdqu 136(%rsp),%ymm7
+ vmovdqu 168(%rsp),%ymm8
+ vmovdqu 200(%rsp),%ymm9
+ vmovdqu 232(%rsp),%ymm10
+
+ addq 8(%rsp),%r9
+
+{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
+{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
+{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
+{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
+{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
+{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
+{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
+{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
+
+{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
+{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
+{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
+{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
+{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
+{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
+{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
+{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
+
+ leaq 264(%rsp),%rsp
+ movq 8(%r11),%r13
+
+ vpbroadcastq 8(%r11),%ymm1
+ movq 0(%rsi),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ movq %r12,%r10
+ adcq $0,%r10
+
+ movq %r8,%r13
+ imulq %r9,%r13
+ andq %rax,%r13
+
+ vmovq %r13,%xmm2
+ vpbroadcastq %xmm2,%ymm2
+ movq 0(%rcx),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ adcq %r12,%r10
+
+ shrq $52,%r9
+ salq $12,%r10
+ orq %r10,%r9
+
+ leaq -264(%rsp),%rsp
+
+{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
+{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
+{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
+{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
+{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
+{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
+{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
+{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
+
+{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
+{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
+{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
+{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
+{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
+{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
+{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
+{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
+
+
+ vmovdqu %ymm3,0(%rsp)
+ vmovdqu %ymm4,32(%rsp)
+ vmovdqu %ymm5,64(%rsp)
+ vmovdqu %ymm6,96(%rsp)
+ vmovdqu %ymm7,128(%rsp)
+ vmovdqu %ymm8,160(%rsp)
+ vmovdqu %ymm9,192(%rsp)
+ vmovdqu %ymm10,224(%rsp)
+ movq $0,256(%rsp)
+
+ vmovdqu 8(%rsp),%ymm3
+ vmovdqu 40(%rsp),%ymm4
+ vmovdqu 72(%rsp),%ymm5
+ vmovdqu 104(%rsp),%ymm6
+ vmovdqu 136(%rsp),%ymm7
+ vmovdqu 168(%rsp),%ymm8
+ vmovdqu 200(%rsp),%ymm9
+ vmovdqu 232(%rsp),%ymm10
+
+ addq 8(%rsp),%r9
+
+{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
+{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
+{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
+{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
+{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
+{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
+{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
+{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
+
+{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
+{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
+{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
+{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
+{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
+{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
+{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
+{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
+
+ leaq 264(%rsp),%rsp
+
+ vmovq %r9,%xmm0
+ vpbroadcastq %xmm0,%ymm0
+ vpblendd $3,%ymm0,%ymm3,%ymm3
+
+
+
+ vpsrlq $52,%ymm3,%ymm0
+ vpsrlq $52,%ymm4,%ymm1
+ vpsrlq $52,%ymm5,%ymm2
+ vpsrlq $52,%ymm6,%ymm11
+ vpsrlq $52,%ymm7,%ymm12
+ vpsrlq $52,%ymm8,%ymm13
+ vpsrlq $52,%ymm9,%ymm14
+ vpsrlq $52,%ymm10,%ymm15
+
+ leaq -32(%rsp),%rsp
+ vmovupd %ymm3,(%rsp)
+
+
+ vpermq $144,%ymm15,%ymm15
+ vpermq $3,%ymm14,%ymm3
+ vblendpd $1,%ymm3,%ymm15,%ymm15
+
+ vpermq $144,%ymm14,%ymm14
+ vpermq $3,%ymm13,%ymm3
+ vblendpd $1,%ymm3,%ymm14,%ymm14
+
+ vpermq $144,%ymm13,%ymm13
+ vpermq $3,%ymm12,%ymm3
+ vblendpd $1,%ymm3,%ymm13,%ymm13
+
+ vpermq $144,%ymm12,%ymm12
+ vpermq $3,%ymm11,%ymm3
+ vblendpd $1,%ymm3,%ymm12,%ymm12
+
+ vpermq $144,%ymm11,%ymm11
+ vpermq $3,%ymm2,%ymm3
+ vblendpd $1,%ymm3,%ymm11,%ymm11
+
+ vpermq $144,%ymm2,%ymm2
+ vpermq $3,%ymm1,%ymm3
+ vblendpd $1,%ymm3,%ymm2,%ymm2
+
+ vpermq $144,%ymm1,%ymm1
+ vpermq $3,%ymm0,%ymm3
+ vblendpd $1,%ymm3,%ymm1,%ymm1
+
+ vpermq $144,%ymm0,%ymm0
+ vpand .Lhigh64x3(%rip),%ymm0,%ymm0
+
+ vmovupd (%rsp),%ymm3
+ leaq 32(%rsp),%rsp
+
+
+ vpand .Lmask52x4(%rip),%ymm3,%ymm3
+ vpand .Lmask52x4(%rip),%ymm4,%ymm4
+ vpand .Lmask52x4(%rip),%ymm5,%ymm5
+ vpand .Lmask52x4(%rip),%ymm6,%ymm6
+ vpand .Lmask52x4(%rip),%ymm7,%ymm7
+ vpand .Lmask52x4(%rip),%ymm8,%ymm8
+ vpand .Lmask52x4(%rip),%ymm9,%ymm9
+ vpand .Lmask52x4(%rip),%ymm10,%ymm10
+
+
+ vpaddq %ymm0,%ymm3,%ymm3
+ vpaddq %ymm1,%ymm4,%ymm4
+ vpaddq %ymm2,%ymm5,%ymm5
+ vpaddq %ymm11,%ymm6,%ymm6
+ vpaddq %ymm12,%ymm7,%ymm7
+ vpaddq %ymm13,%ymm8,%ymm8
+ vpaddq %ymm14,%ymm9,%ymm9
+ vpaddq %ymm15,%ymm10,%ymm10
+
+
+
+ vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm0
+ vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm1
+ vmovmskpd %ymm0,%r14d
+ vmovmskpd %ymm1,%r13d
+ shlb $4,%r13b
+ orb %r13b,%r14b
+
+ vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm2
+ vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm11
+ vmovmskpd %ymm2,%r13d
+ vmovmskpd %ymm11,%r12d
+ shlb $4,%r12b
+ orb %r12b,%r13b
+
+ vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm12
+ vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13
+ vmovmskpd %ymm12,%r12d
+ vmovmskpd %ymm13,%r11d
+ shlb $4,%r11b
+ orb %r11b,%r12b
+
+ vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm14
+ vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm15
+ vmovmskpd %ymm14,%r11d
+ vmovmskpd %ymm15,%r10d
+ shlb $4,%r10b
+ orb %r10b,%r11b
+
+ addb %r14b,%r14b
+ adcb %r13b,%r13b
+ adcb %r12b,%r12b
+ adcb %r11b,%r11b
+
+
+ vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm0
+ vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm1
+ vmovmskpd %ymm0,%r9d
+ vmovmskpd %ymm1,%r8d
+ shlb $4,%r8b
+ orb %r8b,%r9b
+
+ vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm2
+ vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm11
+ vmovmskpd %ymm2,%r8d
+ vmovmskpd %ymm11,%edx
+ shlb $4,%dl
+ orb %dl,%r8b
+
+ vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm12
+ vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13
+ vmovmskpd %ymm12,%edx
+ vmovmskpd %ymm13,%ecx
+ shlb $4,%cl
+ orb %cl,%dl
+
+ vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm14
+ vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm15
+ vmovmskpd %ymm14,%ecx
+ vmovmskpd %ymm15,%ebx
+ shlb $4,%bl
+ orb %bl,%cl
+
+ addb %r9b,%r14b
+ adcb %r8b,%r13b
+ adcb %dl,%r12b
+ adcb %cl,%r11b
+
+ xorb %r9b,%r14b
+ xorb %r8b,%r13b
+ xorb %dl,%r12b
+ xorb %cl,%r11b
+
+ leaq .Lkmasklut(%rip),%rdx
+
+ movb %r14b,%r10b
+ andq $0xf,%r14
+ vpsubq .Lmask52x4(%rip),%ymm3,%ymm0
+ shlq $5,%r14
+ vmovapd (%rdx,%r14,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm3,%ymm3
+
+ shrb $4,%r10b
+ andq $0xf,%r10
+ vpsubq .Lmask52x4(%rip),%ymm4,%ymm0
+ shlq $5,%r10
+ vmovapd (%rdx,%r10,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm4,%ymm4
+
+ movb %r13b,%r10b
+ andq $0xf,%r13
+ vpsubq .Lmask52x4(%rip),%ymm5,%ymm0
+ shlq $5,%r13
+ vmovapd (%rdx,%r13,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm5,%ymm5
+
+ shrb $4,%r10b
+ andq $0xf,%r10
+ vpsubq .Lmask52x4(%rip),%ymm6,%ymm0
+ shlq $5,%r10
+ vmovapd (%rdx,%r10,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm6,%ymm6
+
+ movb %r12b,%r10b
+ andq $0xf,%r12
+ vpsubq .Lmask52x4(%rip),%ymm7,%ymm0
+ shlq $5,%r12
+ vmovapd (%rdx,%r12,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm7,%ymm7
+
+ shrb $4,%r10b
+ andq $0xf,%r10
+ vpsubq .Lmask52x4(%rip),%ymm8,%ymm0
+ shlq $5,%r10
+ vmovapd (%rdx,%r10,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm8,%ymm8
+
+ movb %r11b,%r10b
+ andq $0xf,%r11
+ vpsubq .Lmask52x4(%rip),%ymm9,%ymm0
+ shlq $5,%r11
+ vmovapd (%rdx,%r11,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm9,%ymm9
+
+ shrb $4,%r10b
+ andq $0xf,%r10
+ vpsubq .Lmask52x4(%rip),%ymm10,%ymm0
+ shlq $5,%r10
+ vmovapd (%rdx,%r10,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm10,%ymm10
+
+ vpand .Lmask52x4(%rip),%ymm3,%ymm3
+ vpand .Lmask52x4(%rip),%ymm4,%ymm4
+ vpand .Lmask52x4(%rip),%ymm5,%ymm5
+ vpand .Lmask52x4(%rip),%ymm6,%ymm6
+ vpand .Lmask52x4(%rip),%ymm7,%ymm7
+ vpand .Lmask52x4(%rip),%ymm8,%ymm8
+ vpand .Lmask52x4(%rip),%ymm9,%ymm9
+
+ vpand .Lmask52x4(%rip),%ymm10,%ymm10
+
+ vmovdqu %ymm3,0(%rdi)
+ vmovdqu %ymm4,32(%rdi)
+ vmovdqu %ymm5,64(%rdi)
+ vmovdqu %ymm6,96(%rdi)
+ vmovdqu %ymm7,128(%rdi)
+ vmovdqu %ymm8,160(%rdi)
+ vmovdqu %ymm9,192(%rdi)
+ vmovdqu %ymm10,224(%rdi)
+
+ vzeroupper
+ leaq (%rsp),%rax
+.cfi_def_cfa_register %rax
+ movq 0(%rax),%r15
+.cfi_restore %r15
+ movq 8(%rax),%r14
+.cfi_restore %r14
+ movq 16(%rax),%r13
+.cfi_restore %r13
+ movq 24(%rax),%r12
+.cfi_restore %r12
+ movq 32(%rax),%rbp
+.cfi_restore %rbp
+ movq 40(%rax),%rbx
+.cfi_restore %rbx
+ leaq 48(%rax),%rsp
+.cfi_def_cfa %rsp,8
+.Lossl_rsaz_amm52x30_x1_avxifma256_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ossl_rsaz_amm52x30_x1_avxifma256, .-ossl_rsaz_amm52x30_x1_avxifma256
+.section .rodata
+.align 32
+.Lmask52x4:
+.quad 0xfffffffffffff
+.quad 0xfffffffffffff
+.quad 0xfffffffffffff
+.quad 0xfffffffffffff
+.Lhigh64x3:
+.quad 0x0
+.quad 0xffffffffffffffff
+.quad 0xffffffffffffffff
+.quad 0xffffffffffffffff
+.Lkmasklut:
+
+.quad 0x0
+.quad 0x0
+.quad 0x0
+.quad 0x0
+
+.quad 0xffffffffffffffff
+.quad 0x0
+.quad 0x0
+.quad 0x0
+
+.quad 0x0
+.quad 0xffffffffffffffff
+.quad 0x0
+.quad 0x0
+
+.quad 0xffffffffffffffff
+.quad 0xffffffffffffffff
+.quad 0x0
+.quad 0x0
+
+.quad 0x0
+.quad 0x0
+.quad 0xffffffffffffffff
+.quad 0x0
+
+.quad 0xffffffffffffffff
+.quad 0x0
+.quad 0xffffffffffffffff
+.quad 0x0
+
+.quad 0x0
+.quad 0xffffffffffffffff
+.quad 0xffffffffffffffff
+.quad 0x0
+
+.quad 0xffffffffffffffff
+.quad 0xffffffffffffffff
+.quad 0xffffffffffffffff
+.quad 0x0
+
+.quad 0x0
+.quad 0x0
+.quad 0x0
+.quad 0xffffffffffffffff
+
+.quad 0xffffffffffffffff
+.quad 0x0
+.quad 0x0
+.quad 0xffffffffffffffff
+
+.quad 0x0
+.quad 0xffffffffffffffff
+.quad 0x0
+.quad 0xffffffffffffffff
+
+.quad 0xffffffffffffffff
+.quad 0xffffffffffffffff
+.quad 0x0
+.quad 0xffffffffffffffff
+
+.quad 0x0
+.quad 0x0
+.quad 0xffffffffffffffff
+.quad 0xffffffffffffffff
+
+.quad 0xffffffffffffffff
+.quad 0x0
+.quad 0xffffffffffffffff
+.quad 0xffffffffffffffff
+
+.quad 0x0
+.quad 0xffffffffffffffff
+.quad 0xffffffffffffffff
+.quad 0xffffffffffffffff
+
+.quad 0xffffffffffffffff
+.quad 0xffffffffffffffff
+.quad 0xffffffffffffffff
+.quad 0xffffffffffffffff
+.text
+
+.globl ossl_rsaz_amm52x30_x2_avxifma256
+.type ossl_rsaz_amm52x30_x2_avxifma256,@function
+.align 32
+ossl_rsaz_amm52x30_x2_avxifma256:
+.cfi_startproc
+.byte 243,15,30,250
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+
+ vpxor %ymm0,%ymm0,%ymm0
+ vmovapd %ymm0,%ymm3
+ vmovapd %ymm0,%ymm4
+ vmovapd %ymm0,%ymm5
+ vmovapd %ymm0,%ymm6
+ vmovapd %ymm0,%ymm7
+ vmovapd %ymm0,%ymm8
+ vmovapd %ymm0,%ymm9
+ vmovapd %ymm0,%ymm10
+
+ xorl %r9d,%r9d
+
+ movq %rdx,%r11
+ movq $0xfffffffffffff,%rax
+
+ movl $30,%ebx
+
+.align 32
+.Lloop30:
+ movq 0(%r11),%r13
+
+ vpbroadcastq 0(%r11),%ymm1
+ movq 0(%rsi),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ movq %r12,%r10
+ adcq $0,%r10
+
+ movq (%r8),%r13
+ imulq %r9,%r13
+ andq %rax,%r13
+
+ vmovq %r13,%xmm2
+ vpbroadcastq %xmm2,%ymm2
+ movq 0(%rcx),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ adcq %r12,%r10
+
+ shrq $52,%r9
+ salq $12,%r10
+ orq %r10,%r9
+
+ leaq -264(%rsp),%rsp
+
+{vex} vpmadd52luq 0(%rsi),%ymm1,%ymm3
+{vex} vpmadd52luq 32(%rsi),%ymm1,%ymm4
+{vex} vpmadd52luq 64(%rsi),%ymm1,%ymm5
+{vex} vpmadd52luq 96(%rsi),%ymm1,%ymm6
+{vex} vpmadd52luq 128(%rsi),%ymm1,%ymm7
+{vex} vpmadd52luq 160(%rsi),%ymm1,%ymm8
+{vex} vpmadd52luq 192(%rsi),%ymm1,%ymm9
+{vex} vpmadd52luq 224(%rsi),%ymm1,%ymm10
+
+{vex} vpmadd52luq 0(%rcx),%ymm2,%ymm3
+{vex} vpmadd52luq 32(%rcx),%ymm2,%ymm4
+{vex} vpmadd52luq 64(%rcx),%ymm2,%ymm5
+{vex} vpmadd52luq 96(%rcx),%ymm2,%ymm6
+{vex} vpmadd52luq 128(%rcx),%ymm2,%ymm7
+{vex} vpmadd52luq 160(%rcx),%ymm2,%ymm8
+{vex} vpmadd52luq 192(%rcx),%ymm2,%ymm9
+{vex} vpmadd52luq 224(%rcx),%ymm2,%ymm10
+
+
+ vmovdqu %ymm3,0(%rsp)
+ vmovdqu %ymm4,32(%rsp)
+ vmovdqu %ymm5,64(%rsp)
+ vmovdqu %ymm6,96(%rsp)
+ vmovdqu %ymm7,128(%rsp)
+ vmovdqu %ymm8,160(%rsp)
+ vmovdqu %ymm9,192(%rsp)
+ vmovdqu %ymm10,224(%rsp)
+ movq $0,256(%rsp)
+
+ vmovdqu 8(%rsp),%ymm3
+ vmovdqu 40(%rsp),%ymm4
+ vmovdqu 72(%rsp),%ymm5
+ vmovdqu 104(%rsp),%ymm6
+ vmovdqu 136(%rsp),%ymm7
+ vmovdqu 168(%rsp),%ymm8
+ vmovdqu 200(%rsp),%ymm9
+ vmovdqu 232(%rsp),%ymm10
+
+ addq 8(%rsp),%r9
+
+{vex} vpmadd52huq 0(%rsi),%ymm1,%ymm3
+{vex} vpmadd52huq 32(%rsi),%ymm1,%ymm4
+{vex} vpmadd52huq 64(%rsi),%ymm1,%ymm5
+{vex} vpmadd52huq 96(%rsi),%ymm1,%ymm6
+{vex} vpmadd52huq 128(%rsi),%ymm1,%ymm7
+{vex} vpmadd52huq 160(%rsi),%ymm1,%ymm8
+{vex} vpmadd52huq 192(%rsi),%ymm1,%ymm9
+{vex} vpmadd52huq 224(%rsi),%ymm1,%ymm10
+
+{vex} vpmadd52huq 0(%rcx),%ymm2,%ymm3
+{vex} vpmadd52huq 32(%rcx),%ymm2,%ymm4
+{vex} vpmadd52huq 64(%rcx),%ymm2,%ymm5
+{vex} vpmadd52huq 96(%rcx),%ymm2,%ymm6
+{vex} vpmadd52huq 128(%rcx),%ymm2,%ymm7
+{vex} vpmadd52huq 160(%rcx),%ymm2,%ymm8
+{vex} vpmadd52huq 192(%rcx),%ymm2,%ymm9
+{vex} vpmadd52huq 224(%rcx),%ymm2,%ymm10
+
+ leaq 264(%rsp),%rsp
+ leaq 8(%r11),%r11
+ decl %ebx
+ jne .Lloop30
+
+ pushq %r11
+ pushq %rsi
+ pushq %rcx
+ pushq %r8
+
+ vmovq %r9,%xmm0
+ vpbroadcastq %xmm0,%ymm0
+ vpblendd $3,%ymm0,%ymm3,%ymm3
+
+
+
+ vpsrlq $52,%ymm3,%ymm0
+ vpsrlq $52,%ymm4,%ymm1
+ vpsrlq $52,%ymm5,%ymm2
+ vpsrlq $52,%ymm6,%ymm11
+ vpsrlq $52,%ymm7,%ymm12
+ vpsrlq $52,%ymm8,%ymm13
+ vpsrlq $52,%ymm9,%ymm14
+ vpsrlq $52,%ymm10,%ymm15
+
+ leaq -32(%rsp),%rsp
+ vmovupd %ymm3,(%rsp)
+
+
+ vpermq $144,%ymm15,%ymm15
+ vpermq $3,%ymm14,%ymm3
+ vblendpd $1,%ymm3,%ymm15,%ymm15
+
+ vpermq $144,%ymm14,%ymm14
+ vpermq $3,%ymm13,%ymm3
+ vblendpd $1,%ymm3,%ymm14,%ymm14
+
+ vpermq $144,%ymm13,%ymm13
+ vpermq $3,%ymm12,%ymm3
+ vblendpd $1,%ymm3,%ymm13,%ymm13
+
+ vpermq $144,%ymm12,%ymm12
+ vpermq $3,%ymm11,%ymm3
+ vblendpd $1,%ymm3,%ymm12,%ymm12
+
+ vpermq $144,%ymm11,%ymm11
+ vpermq $3,%ymm2,%ymm3
+ vblendpd $1,%ymm3,%ymm11,%ymm11
+
+ vpermq $144,%ymm2,%ymm2
+ vpermq $3,%ymm1,%ymm3
+ vblendpd $1,%ymm3,%ymm2,%ymm2
+
+ vpermq $144,%ymm1,%ymm1
+ vpermq $3,%ymm0,%ymm3
+ vblendpd $1,%ymm3,%ymm1,%ymm1
+
+ vpermq $144,%ymm0,%ymm0
+ vpand .Lhigh64x3(%rip),%ymm0,%ymm0
+
+ vmovupd (%rsp),%ymm3
+ leaq 32(%rsp),%rsp
+
+
+ vpand .Lmask52x4(%rip),%ymm3,%ymm3
+ vpand .Lmask52x4(%rip),%ymm4,%ymm4
+ vpand .Lmask52x4(%rip),%ymm5,%ymm5
+ vpand .Lmask52x4(%rip),%ymm6,%ymm6
+ vpand .Lmask52x4(%rip),%ymm7,%ymm7
+ vpand .Lmask52x4(%rip),%ymm8,%ymm8
+ vpand .Lmask52x4(%rip),%ymm9,%ymm9
+ vpand .Lmask52x4(%rip),%ymm10,%ymm10
+
+
+ vpaddq %ymm0,%ymm3,%ymm3
+ vpaddq %ymm1,%ymm4,%ymm4
+ vpaddq %ymm2,%ymm5,%ymm5
+ vpaddq %ymm11,%ymm6,%ymm6
+ vpaddq %ymm12,%ymm7,%ymm7
+ vpaddq %ymm13,%ymm8,%ymm8
+ vpaddq %ymm14,%ymm9,%ymm9
+ vpaddq %ymm15,%ymm10,%ymm10
+
+
+
+ vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm0
+ vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm1
+ vmovmskpd %ymm0,%r14d
+ vmovmskpd %ymm1,%r13d
+ shlb $4,%r13b
+ orb %r13b,%r14b
+
+ vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm2
+ vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm11
+ vmovmskpd %ymm2,%r13d
+ vmovmskpd %ymm11,%r12d
+ shlb $4,%r12b
+ orb %r12b,%r13b
+
+ vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm12
+ vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13
+ vmovmskpd %ymm12,%r12d
+ vmovmskpd %ymm13,%r11d
+ shlb $4,%r11b
+ orb %r11b,%r12b
+
+ vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm14
+ vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm15
+ vmovmskpd %ymm14,%r11d
+ vmovmskpd %ymm15,%r10d
+ shlb $4,%r10b
+ orb %r10b,%r11b
+
+ addb %r14b,%r14b
+ adcb %r13b,%r13b
+ adcb %r12b,%r12b
+ adcb %r11b,%r11b
+
+
+ vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm0
+ vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm1
+ vmovmskpd %ymm0,%r9d
+ vmovmskpd %ymm1,%r8d
+ shlb $4,%r8b
+ orb %r8b,%r9b
+
+ vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm2
+ vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm11
+ vmovmskpd %ymm2,%r8d
+ vmovmskpd %ymm11,%edx
+ shlb $4,%dl
+ orb %dl,%r8b
+
+ vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm12
+ vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13
+ vmovmskpd %ymm12,%edx
+ vmovmskpd %ymm13,%ecx
+ shlb $4,%cl
+ orb %cl,%dl
+
+ vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm14
+ vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm15
+ vmovmskpd %ymm14,%ecx
+ vmovmskpd %ymm15,%ebx
+ shlb $4,%bl
+ orb %bl,%cl
+
+ addb %r9b,%r14b
+ adcb %r8b,%r13b
+ adcb %dl,%r12b
+ adcb %cl,%r11b
+
+ xorb %r9b,%r14b
+ xorb %r8b,%r13b
+ xorb %dl,%r12b
+ xorb %cl,%r11b
+
+ leaq .Lkmasklut(%rip),%rdx
+
+ movb %r14b,%r10b
+ andq $0xf,%r14
+ vpsubq .Lmask52x4(%rip),%ymm3,%ymm0
+ shlq $5,%r14
+ vmovapd (%rdx,%r14,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm3,%ymm3
+
+ shrb $4,%r10b
+ andq $0xf,%r10
+ vpsubq .Lmask52x4(%rip),%ymm4,%ymm0
+ shlq $5,%r10
+ vmovapd (%rdx,%r10,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm4,%ymm4
+
+ movb %r13b,%r10b
+ andq $0xf,%r13
+ vpsubq .Lmask52x4(%rip),%ymm5,%ymm0
+ shlq $5,%r13
+ vmovapd (%rdx,%r13,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm5,%ymm5
+
+ shrb $4,%r10b
+ andq $0xf,%r10
+ vpsubq .Lmask52x4(%rip),%ymm6,%ymm0
+ shlq $5,%r10
+ vmovapd (%rdx,%r10,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm6,%ymm6
+
+ movb %r12b,%r10b
+ andq $0xf,%r12
+ vpsubq .Lmask52x4(%rip),%ymm7,%ymm0
+ shlq $5,%r12
+ vmovapd (%rdx,%r12,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm7,%ymm7
+
+ shrb $4,%r10b
+ andq $0xf,%r10
+ vpsubq .Lmask52x4(%rip),%ymm8,%ymm0
+ shlq $5,%r10
+ vmovapd (%rdx,%r10,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm8,%ymm8
+
+ movb %r11b,%r10b
+ andq $0xf,%r11
+ vpsubq .Lmask52x4(%rip),%ymm9,%ymm0
+ shlq $5,%r11
+ vmovapd (%rdx,%r11,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm9,%ymm9
+
+ shrb $4,%r10b
+ andq $0xf,%r10
+ vpsubq .Lmask52x4(%rip),%ymm10,%ymm0
+ shlq $5,%r10
+ vmovapd (%rdx,%r10,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm10,%ymm10
+
+ vpand .Lmask52x4(%rip),%ymm3,%ymm3
+ vpand .Lmask52x4(%rip),%ymm4,%ymm4
+ vpand .Lmask52x4(%rip),%ymm5,%ymm5
+ vpand .Lmask52x4(%rip),%ymm6,%ymm6
+ vpand .Lmask52x4(%rip),%ymm7,%ymm7
+ vpand .Lmask52x4(%rip),%ymm8,%ymm8
+ vpand .Lmask52x4(%rip),%ymm9,%ymm9
+
+ vpand .Lmask52x4(%rip),%ymm10,%ymm10
+ popq %r8
+ popq %rcx
+ popq %rsi
+ popq %r11
+
+ vmovdqu %ymm3,0(%rdi)
+ vmovdqu %ymm4,32(%rdi)
+ vmovdqu %ymm5,64(%rdi)
+ vmovdqu %ymm6,96(%rdi)
+ vmovdqu %ymm7,128(%rdi)
+ vmovdqu %ymm8,160(%rdi)
+ vmovdqu %ymm9,192(%rdi)
+ vmovdqu %ymm10,224(%rdi)
+
+ xorl %r15d,%r15d
+
+ leaq 16(%r11),%r11
+ movq $0xfffffffffffff,%rax
+
+ movl $30,%ebx
+
+ vpxor %ymm0,%ymm0,%ymm0
+ vmovapd %ymm0,%ymm3
+ vmovapd %ymm0,%ymm4
+ vmovapd %ymm0,%ymm5
+ vmovapd %ymm0,%ymm6
+ vmovapd %ymm0,%ymm7
+ vmovapd %ymm0,%ymm8
+ vmovapd %ymm0,%ymm9
+ vmovapd %ymm0,%ymm10
+.align 32
+.Lloop40:
+ movq 0(%r11),%r13
+
+ vpbroadcastq 0(%r11),%ymm1
+ movq 256(%rsi),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ movq %r12,%r10
+ adcq $0,%r10
+
+ movq 8(%r8),%r13
+ imulq %r9,%r13
+ andq %rax,%r13
+
+ vmovq %r13,%xmm2
+ vpbroadcastq %xmm2,%ymm2
+ movq 256(%rcx),%rdx
+ mulxq %r13,%r13,%r12
+ addq %r13,%r9
+ adcq %r12,%r10
+
+ shrq $52,%r9
+ salq $12,%r10
+ orq %r10,%r9
+
+ leaq -264(%rsp),%rsp
+
+{vex} vpmadd52luq 256(%rsi),%ymm1,%ymm3
+{vex} vpmadd52luq 288(%rsi),%ymm1,%ymm4
+{vex} vpmadd52luq 320(%rsi),%ymm1,%ymm5
+{vex} vpmadd52luq 352(%rsi),%ymm1,%ymm6
+{vex} vpmadd52luq 384(%rsi),%ymm1,%ymm7
+{vex} vpmadd52luq 416(%rsi),%ymm1,%ymm8
+{vex} vpmadd52luq 448(%rsi),%ymm1,%ymm9
+{vex} vpmadd52luq 480(%rsi),%ymm1,%ymm10
+
+{vex} vpmadd52luq 256(%rcx),%ymm2,%ymm3
+{vex} vpmadd52luq 288(%rcx),%ymm2,%ymm4
+{vex} vpmadd52luq 320(%rcx),%ymm2,%ymm5
+{vex} vpmadd52luq 352(%rcx),%ymm2,%ymm6
+{vex} vpmadd52luq 384(%rcx),%ymm2,%ymm7
+{vex} vpmadd52luq 416(%rcx),%ymm2,%ymm8
+{vex} vpmadd52luq 448(%rcx),%ymm2,%ymm9
+{vex} vpmadd52luq 480(%rcx),%ymm2,%ymm10
+
+
+ vmovdqu %ymm3,0(%rsp)
+ vmovdqu %ymm4,32(%rsp)
+ vmovdqu %ymm5,64(%rsp)
+ vmovdqu %ymm6,96(%rsp)
+ vmovdqu %ymm7,128(%rsp)
+ vmovdqu %ymm8,160(%rsp)
+ vmovdqu %ymm9,192(%rsp)
+ vmovdqu %ymm10,224(%rsp)
+ movq $0,256(%rsp)
+
+ vmovdqu 8(%rsp),%ymm3
+ vmovdqu 40(%rsp),%ymm4
+ vmovdqu 72(%rsp),%ymm5
+ vmovdqu 104(%rsp),%ymm6
+ vmovdqu 136(%rsp),%ymm7
+ vmovdqu 168(%rsp),%ymm8
+ vmovdqu 200(%rsp),%ymm9
+ vmovdqu 232(%rsp),%ymm10
+
+ addq 8(%rsp),%r9
+
+{vex} vpmadd52huq 256(%rsi),%ymm1,%ymm3
+{vex} vpmadd52huq 288(%rsi),%ymm1,%ymm4
+{vex} vpmadd52huq 320(%rsi),%ymm1,%ymm5
+{vex} vpmadd52huq 352(%rsi),%ymm1,%ymm6
+{vex} vpmadd52huq 384(%rsi),%ymm1,%ymm7
+{vex} vpmadd52huq 416(%rsi),%ymm1,%ymm8
+{vex} vpmadd52huq 448(%rsi),%ymm1,%ymm9
+{vex} vpmadd52huq 480(%rsi),%ymm1,%ymm10
+
+{vex} vpmadd52huq 256(%rcx),%ymm2,%ymm3
+{vex} vpmadd52huq 288(%rcx),%ymm2,%ymm4
+{vex} vpmadd52huq 320(%rcx),%ymm2,%ymm5
+{vex} vpmadd52huq 352(%rcx),%ymm2,%ymm6
+{vex} vpmadd52huq 384(%rcx),%ymm2,%ymm7
+{vex} vpmadd52huq 416(%rcx),%ymm2,%ymm8
+{vex} vpmadd52huq 448(%rcx),%ymm2,%ymm9
+{vex} vpmadd52huq 480(%rcx),%ymm2,%ymm10
+
+ leaq 264(%rsp),%rsp
+ leaq 8(%r11),%r11
+ decl %ebx
+ jne .Lloop40
+
+ vmovq %r9,%xmm0
+ vpbroadcastq %xmm0,%ymm0
+ vpblendd $3,%ymm0,%ymm3,%ymm3
+
+
+
+ vpsrlq $52,%ymm3,%ymm0
+ vpsrlq $52,%ymm4,%ymm1
+ vpsrlq $52,%ymm5,%ymm2
+ vpsrlq $52,%ymm6,%ymm11
+ vpsrlq $52,%ymm7,%ymm12
+ vpsrlq $52,%ymm8,%ymm13
+ vpsrlq $52,%ymm9,%ymm14
+ vpsrlq $52,%ymm10,%ymm15
+
+ leaq -32(%rsp),%rsp
+ vmovupd %ymm3,(%rsp)
+
+
+ vpermq $144,%ymm15,%ymm15
+ vpermq $3,%ymm14,%ymm3
+ vblendpd $1,%ymm3,%ymm15,%ymm15
+
+ vpermq $144,%ymm14,%ymm14
+ vpermq $3,%ymm13,%ymm3
+ vblendpd $1,%ymm3,%ymm14,%ymm14
+
+ vpermq $144,%ymm13,%ymm13
+ vpermq $3,%ymm12,%ymm3
+ vblendpd $1,%ymm3,%ymm13,%ymm13
+
+ vpermq $144,%ymm12,%ymm12
+ vpermq $3,%ymm11,%ymm3
+ vblendpd $1,%ymm3,%ymm12,%ymm12
+
+ vpermq $144,%ymm11,%ymm11
+ vpermq $3,%ymm2,%ymm3
+ vblendpd $1,%ymm3,%ymm11,%ymm11
+
+ vpermq $144,%ymm2,%ymm2
+ vpermq $3,%ymm1,%ymm3
+ vblendpd $1,%ymm3,%ymm2,%ymm2
+
+ vpermq $144,%ymm1,%ymm1
+ vpermq $3,%ymm0,%ymm3
+ vblendpd $1,%ymm3,%ymm1,%ymm1
+
+ vpermq $144,%ymm0,%ymm0
+ vpand .Lhigh64x3(%rip),%ymm0,%ymm0
+
+ vmovupd (%rsp),%ymm3
+ leaq 32(%rsp),%rsp
+
+
+ vpand .Lmask52x4(%rip),%ymm3,%ymm3
+ vpand .Lmask52x4(%rip),%ymm4,%ymm4
+ vpand .Lmask52x4(%rip),%ymm5,%ymm5
+ vpand .Lmask52x4(%rip),%ymm6,%ymm6
+ vpand .Lmask52x4(%rip),%ymm7,%ymm7
+ vpand .Lmask52x4(%rip),%ymm8,%ymm8
+ vpand .Lmask52x4(%rip),%ymm9,%ymm9
+ vpand .Lmask52x4(%rip),%ymm10,%ymm10
+
+
+ vpaddq %ymm0,%ymm3,%ymm3
+ vpaddq %ymm1,%ymm4,%ymm4
+ vpaddq %ymm2,%ymm5,%ymm5
+ vpaddq %ymm11,%ymm6,%ymm6
+ vpaddq %ymm12,%ymm7,%ymm7
+ vpaddq %ymm13,%ymm8,%ymm8
+ vpaddq %ymm14,%ymm9,%ymm9
+ vpaddq %ymm15,%ymm10,%ymm10
+
+
+
+ vpcmpgtq .Lmask52x4(%rip),%ymm3,%ymm0
+ vpcmpgtq .Lmask52x4(%rip),%ymm4,%ymm1
+ vmovmskpd %ymm0,%r14d
+ vmovmskpd %ymm1,%r13d
+ shlb $4,%r13b
+ orb %r13b,%r14b
+
+ vpcmpgtq .Lmask52x4(%rip),%ymm5,%ymm2
+ vpcmpgtq .Lmask52x4(%rip),%ymm6,%ymm11
+ vmovmskpd %ymm2,%r13d
+ vmovmskpd %ymm11,%r12d
+ shlb $4,%r12b
+ orb %r12b,%r13b
+
+ vpcmpgtq .Lmask52x4(%rip),%ymm7,%ymm12
+ vpcmpgtq .Lmask52x4(%rip),%ymm8,%ymm13
+ vmovmskpd %ymm12,%r12d
+ vmovmskpd %ymm13,%r11d
+ shlb $4,%r11b
+ orb %r11b,%r12b
+
+ vpcmpgtq .Lmask52x4(%rip),%ymm9,%ymm14
+ vpcmpgtq .Lmask52x4(%rip),%ymm10,%ymm15
+ vmovmskpd %ymm14,%r11d
+ vmovmskpd %ymm15,%r10d
+ shlb $4,%r10b
+ orb %r10b,%r11b
+
+ addb %r14b,%r14b
+ adcb %r13b,%r13b
+ adcb %r12b,%r12b
+ adcb %r11b,%r11b
+
+
+ vpcmpeqq .Lmask52x4(%rip),%ymm3,%ymm0
+ vpcmpeqq .Lmask52x4(%rip),%ymm4,%ymm1
+ vmovmskpd %ymm0,%r9d
+ vmovmskpd %ymm1,%r8d
+ shlb $4,%r8b
+ orb %r8b,%r9b
+
+ vpcmpeqq .Lmask52x4(%rip),%ymm5,%ymm2
+ vpcmpeqq .Lmask52x4(%rip),%ymm6,%ymm11
+ vmovmskpd %ymm2,%r8d
+ vmovmskpd %ymm11,%edx
+ shlb $4,%dl
+ orb %dl,%r8b
+
+ vpcmpeqq .Lmask52x4(%rip),%ymm7,%ymm12
+ vpcmpeqq .Lmask52x4(%rip),%ymm8,%ymm13
+ vmovmskpd %ymm12,%edx
+ vmovmskpd %ymm13,%ecx
+ shlb $4,%cl
+ orb %cl,%dl
+
+ vpcmpeqq .Lmask52x4(%rip),%ymm9,%ymm14
+ vpcmpeqq .Lmask52x4(%rip),%ymm10,%ymm15
+ vmovmskpd %ymm14,%ecx
+ vmovmskpd %ymm15,%ebx
+ shlb $4,%bl
+ orb %bl,%cl
+
+ addb %r9b,%r14b
+ adcb %r8b,%r13b
+ adcb %dl,%r12b
+ adcb %cl,%r11b
+
+ xorb %r9b,%r14b
+ xorb %r8b,%r13b
+ xorb %dl,%r12b
+ xorb %cl,%r11b
+
+ leaq .Lkmasklut(%rip),%rdx
+
+ movb %r14b,%r10b
+ andq $0xf,%r14
+ vpsubq .Lmask52x4(%rip),%ymm3,%ymm0
+ shlq $5,%r14
+ vmovapd (%rdx,%r14,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm3,%ymm3
+
+ shrb $4,%r10b
+ andq $0xf,%r10
+ vpsubq .Lmask52x4(%rip),%ymm4,%ymm0
+ shlq $5,%r10
+ vmovapd (%rdx,%r10,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm4,%ymm4
+
+ movb %r13b,%r10b
+ andq $0xf,%r13
+ vpsubq .Lmask52x4(%rip),%ymm5,%ymm0
+ shlq $5,%r13
+ vmovapd (%rdx,%r13,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm5,%ymm5
+
+ shrb $4,%r10b
+ andq $0xf,%r10
+ vpsubq .Lmask52x4(%rip),%ymm6,%ymm0
+ shlq $5,%r10
+ vmovapd (%rdx,%r10,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm6,%ymm6
+
+ movb %r12b,%r10b
+ andq $0xf,%r12
+ vpsubq .Lmask52x4(%rip),%ymm7,%ymm0
+ shlq $5,%r12
+ vmovapd (%rdx,%r12,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm7,%ymm7
+
+ shrb $4,%r10b
+ andq $0xf,%r10
+ vpsubq .Lmask52x4(%rip),%ymm8,%ymm0
+ shlq $5,%r10
+ vmovapd (%rdx,%r10,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm8,%ymm8
+
+ movb %r11b,%r10b
+ andq $0xf,%r11
+ vpsubq .Lmask52x4(%rip),%ymm9,%ymm0
+ shlq $5,%r11
+ vmovapd (%rdx,%r11,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm9,%ymm9
+
+ shrb $4,%r10b
+ andq $0xf,%r10
+ vpsubq .Lmask52x4(%rip),%ymm10,%ymm0
+ shlq $5,%r10
+ vmovapd (%rdx,%r10,1),%ymm2
+ vblendvpd %ymm2,%ymm0,%ymm10,%ymm10
+
+ vpand .Lmask52x4(%rip),%ymm3,%ymm3
+ vpand .Lmask52x4(%rip),%ymm4,%ymm4
+ vpand .Lmask52x4(%rip),%ymm5,%ymm5
+ vpand .Lmask52x4(%rip),%ymm6,%ymm6
+ vpand .Lmask52x4(%rip),%ymm7,%ymm7
+ vpand .Lmask52x4(%rip),%ymm8,%ymm8
+ vpand .Lmask52x4(%rip),%ymm9,%ymm9
+
+ vpand .Lmask52x4(%rip),%ymm10,%ymm10
+
+ vmovdqu %ymm3,256(%rdi)
+ vmovdqu %ymm4,288(%rdi)
+ vmovdqu %ymm5,320(%rdi)
+ vmovdqu %ymm6,352(%rdi)
+ vmovdqu %ymm7,384(%rdi)
+ vmovdqu %ymm8,416(%rdi)
+ vmovdqu %ymm9,448(%rdi)
+ vmovdqu %ymm10,480(%rdi)
+
+ vzeroupper
+ leaq (%rsp),%rax
+.cfi_def_cfa_register %rax
+ movq 0(%rax),%r15
+.cfi_restore %r15
+ movq 8(%rax),%r14
+.cfi_restore %r14
+ movq 16(%rax),%r13
+.cfi_restore %r13
+ movq 24(%rax),%r12
+.cfi_restore %r12
+ movq 32(%rax),%rbp
+.cfi_restore %rbp
+ movq 40(%rax),%rbx
+.cfi_restore %rbx
+ leaq 48(%rax),%rsp
+.cfi_def_cfa %rsp,8
+.Lossl_rsaz_amm52x30_x2_avxifma256_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ossl_rsaz_amm52x30_x2_avxifma256, .-ossl_rsaz_amm52x30_x2_avxifma256
+.text
+
+.align 32
+.globl ossl_extract_multiplier_2x30_win5_avx
+.type ossl_extract_multiplier_2x30_win5_avx,@function
+ossl_extract_multiplier_2x30_win5_avx:
+.cfi_startproc
+.byte 243,15,30,250
+ vmovapd .Lones(%rip),%ymm12
+ vmovq %rdx,%xmm8
+ vpbroadcastq %xmm8,%ymm10
+ vmovq %rcx,%xmm8
+ vpbroadcastq %xmm8,%ymm11
+ leaq 16384(%rsi),%rax
+
+
+ vpxor %xmm0,%xmm0,%xmm0
+ vmovapd %ymm0,%ymm9
+ vmovapd %ymm0,%ymm1
+ vmovapd %ymm0,%ymm2
+ vmovapd %ymm0,%ymm3
+ vmovapd %ymm0,%ymm4
+ vmovapd %ymm0,%ymm5
+ vmovapd %ymm0,%ymm6
+ vmovapd %ymm0,%ymm7
+
+.align 32
+.Lloop:
+ vpcmpeqq %ymm9,%ymm10,%ymm13
+ vmovdqu 0(%rsi),%ymm8
+
+ vblendvpd %ymm13,%ymm8,%ymm0,%ymm0
+ vmovdqu 32(%rsi),%ymm8
+
+ vblendvpd %ymm13,%ymm8,%ymm1,%ymm1
+ vmovdqu 64(%rsi),%ymm8
+
+ vblendvpd %ymm13,%ymm8,%ymm2,%ymm2
+ vmovdqu 96(%rsi),%ymm8
+
+ vblendvpd %ymm13,%ymm8,%ymm3,%ymm3
+ vmovdqu 128(%rsi),%ymm8
+
+ vblendvpd %ymm13,%ymm8,%ymm4,%ymm4
+ vmovdqu 160(%rsi),%ymm8
+
+ vblendvpd %ymm13,%ymm8,%ymm5,%ymm5
+ vmovdqu 192(%rsi),%ymm8
+
+ vblendvpd %ymm13,%ymm8,%ymm6,%ymm6
+ vmovdqu 224(%rsi),%ymm8
+
+ vblendvpd %ymm13,%ymm8,%ymm7,%ymm7
+ vpaddq %ymm12,%ymm9,%ymm9
+ addq $512,%rsi
+ cmpq %rsi,%rax
+ jne .Lloop
+ vmovdqu %ymm0,0(%rdi)
+ vmovdqu %ymm1,32(%rdi)
+ vmovdqu %ymm2,64(%rdi)
+ vmovdqu %ymm3,96(%rdi)
+ vmovdqu %ymm4,128(%rdi)
+ vmovdqu %ymm5,160(%rdi)
+ vmovdqu %ymm6,192(%rdi)
+ vmovdqu %ymm7,224(%rdi)
+ leaq -16384(%rax),%rsi
+
+
+ vpxor %xmm0,%xmm0,%xmm0
+ vmovapd %ymm0,%ymm9
+ vmovapd %ymm0,%ymm0
+ vmovapd %ymm0,%ymm1
+ vmovapd %ymm0,%ymm2
+ vmovapd %ymm0,%ymm3
+ vmovapd %ymm0,%ymm4
+ vmovapd %ymm0,%ymm5
+ vmovapd %ymm0,%ymm6
+ vmovapd %ymm0,%ymm7
+
+.align 32
+.Lloop_8_15:
+ vpcmpeqq %ymm9,%ymm11,%ymm13
+ vmovdqu 256(%rsi),%ymm8
+
+ vblendvpd %ymm13,%ymm8,%ymm0,%ymm0
+ vmovdqu 288(%rsi),%ymm8
+
+ vblendvpd %ymm13,%ymm8,%ymm1,%ymm1
+ vmovdqu 320(%rsi),%ymm8
+
+ vblendvpd %ymm13,%ymm8,%ymm2,%ymm2
+ vmovdqu 352(%rsi),%ymm8
+
+ vblendvpd %ymm13,%ymm8,%ymm3,%ymm3
+ vmovdqu 384(%rsi),%ymm8
+
+ vblendvpd %ymm13,%ymm8,%ymm4,%ymm4
+ vmovdqu 416(%rsi),%ymm8
+
+ vblendvpd %ymm13,%ymm8,%ymm5,%ymm5
+ vmovdqu 448(%rsi),%ymm8
+
+ vblendvpd %ymm13,%ymm8,%ymm6,%ymm6
+ vmovdqu 480(%rsi),%ymm8
+
+ vblendvpd %ymm13,%ymm8,%ymm7,%ymm7
+ vpaddq %ymm12,%ymm9,%ymm9
+ addq $512,%rsi
+ cmpq %rsi,%rax
+ jne .Lloop_8_15
+ vmovdqu %ymm0,256(%rdi)
+ vmovdqu %ymm1,288(%rdi)
+ vmovdqu %ymm2,320(%rdi)
+ vmovdqu %ymm3,352(%rdi)
+ vmovdqu %ymm4,384(%rdi)
+ vmovdqu %ymm5,416(%rdi)
+ vmovdqu %ymm6,448(%rdi)
+ vmovdqu %ymm7,480(%rdi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ossl_extract_multiplier_2x30_win5_avx, .-ossl_extract_multiplier_2x30_win5_avx
+.section .rodata
+.align 32
+.Lones:
+.quad 1,1,1,1
+.Lzeros:
+.quad 0,0,0,0
+ .section ".note.gnu.property", "a"
+ .p2align 3
+ .long 1f - 0f
+ .long 4f - 1f
+ .long 5
+0:
+ # "GNU" encoded with .byte, since .asciz isn't supported
+ # on Solaris.
+ .byte 0x47
+ .byte 0x4e
+ .byte 0x55
+ .byte 0
+1:
+ .p2align 3
+ .long 0xc0000002
+ .long 3f - 2f
+2:
+ .long 3
+3:
+ .p2align 3
+4: