Diffstat (limited to 'crypto/bn/asm')
48 files changed, 5743 insertions, 7875 deletions
diff --git a/crypto/bn/asm/README b/crypto/bn/asm/README deleted file mode 100644 index b0f3a68a06ab..000000000000 --- a/crypto/bn/asm/README +++ /dev/null @@ -1,27 +0,0 @@ -<OBSOLETE> - -All assember in this directory are just version of the file -crypto/bn/bn_asm.c. - -Quite a few of these files are just the assember output from gcc since on -quite a few machines they are 2 times faster than the system compiler. - -For the x86, I have hand written assember because of the bad job all -compilers seem to do on it. This normally gives a 2 time speed up in the RSA -routines. - -For the DEC alpha, I also hand wrote the assember (except the division which -is just the output from the C compiler pasted on the end of the file). -On the 2 alpha C compilers I had access to, it was not possible to do -64b x 64b -> 128b calculations (both long and the long long data types -were 64 bits). So the hand assember gives access to the 128 bit result and -a 2 times speedup :-). - -There are 3 versions of assember for the HP PA-RISC. - -pa-risc.s is the origional one which works fine and generated using gcc :-) - -pa-risc2W.s and pa-risc2.s are 64 and 32-bit PA-RISC 2.0 implementations -by Chris Ruemmler from HP (with some help from the HP C compiler). - -</OBSOLETE> diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl index 72381a77240c..7a0cdb2e8a00 100755 --- a/crypto/bn/asm/armv4-gf2m.pl +++ b/crypto/bn/asm/armv4-gf2m.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -29,17 +36,34 @@ # # Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software # Polynomial Multiplication on ARM Processors using the NEON Engine. 
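A gloss on the deleted README's Alpha remark, since the widening multiply it describes is the primitive these files are built around (32x32->64 umull/umlal on ARMv4, 64x64->128 mul/umulh in the new ARMv8 code further down). A minimal sketch, assuming a compiler with unsigned __int128; the helper name is illustrative:

    #include <stdint.h>

    /* Full 64b x 64b -> 128b product, the operation the old Alpha C
     * compilers could not express (both long and long long were 64 bits).
     * On AArch64 this compiles to the mul/umulh pair that armv8-mont.pl
     * schedules by hand. */
    static inline void mul64x64_128(uint64_t a, uint64_t b,
                                    uint64_t *lo, uint64_t *hi)
    {
        unsigned __int128 p = (unsigned __int128)a * b;
        *lo = (uint64_t)p;          /* low half:  mul   */
        *hi = (uint64_t)(p >> 64);  /* high half: umulh */
    }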
-# +# # http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $code=<<___; #include "arm_arch.h" .text +#if defined(__thumb2__) +.syntax unified +.thumb +#else .code 32 +#endif ___ ################ # private interface to mul_1x1_ialu @@ -120,11 +144,17 @@ mul_1x1_ialu: eor $hi,$hi,$t0,lsr#8 ldr $t0,[sp,$i0] @ tab[b >> 30 ] +#ifdef __thumb2__ + itt ne +#endif eorne $lo,$lo,$b,lsl#30 eorne $hi,$hi,$b,lsr#2 tst $a,#1<<31 eor $lo,$lo,$t1,lsl#27 eor $hi,$hi,$t1,lsr#5 +#ifdef __thumb2__ + itt ne +#endif eorne $lo,$lo,$b,lsl#31 eorne $hi,$hi,$b,lsr#1 eor $lo,$lo,$t0,lsl#30 @@ -144,20 +174,33 @@ $code.=<<___; .align 5 bn_GF2m_mul_2x2: #if __ARM_MAX_ARCH__>=7 + stmdb sp!,{r10,lr} ldr r12,.LOPENSSL_armcap -.Lpic: ldr r12,[pc,r12] - tst r12,#1 + adr r10,.LOPENSSL_armcap + ldr r12,[r12,r10] +#ifdef __APPLE__ + ldr r12,[r12] +#endif + tst r12,#ARMV7_NEON + itt ne + ldrne r10,[sp],#8 bne .LNEON + stmdb sp!,{r4-r9} +#else + stmdb sp!,{r4-r10,lr} #endif ___ $ret="r10"; # reassigned 1st argument $code.=<<___; - stmdb sp!,{r4-r10,lr} mov $ret,r0 @ reassign 1st argument mov $b,r3 @ $b=b1 + sub r7,sp,#36 + mov r8,sp + and r7,r7,#-32 ldr r3,[sp,#32] @ load b0 mov $mask,#7<<2 - sub sp,sp,#32 @ allocate tab[8] + mov sp,r7 @ allocate tab[8] + str r8,[r7,#32] bl mul_1x1_ialu @ a1·b1 str $lo,[$ret,#8] @@ -181,6 +224,7 @@ ___ $code.=<<___; ldmia $ret,{@r[0]-@r[3]} eor $lo,$lo,$hi + ldr sp,[sp,#32] @ destroy tab[8] eor $hi,$hi,@r[1] eor $lo,$lo,@r[0] eor $hi,$hi,@r[2] @@ -188,7 +232,6 @@ $code.=<<___; eor $hi,$hi,@r[3] str $hi,[$ret,#8] eor $lo,$lo,$hi - add sp,sp,#32 @ destroy tab[8] str $lo,[$ret,#4] #if __ARM_ARCH__>=5 @@ -213,8 +256,8 @@ $code.=<<___; .align 5 .LNEON: ldr r12, [sp] @ 5th argument - vmov.32 $a, r2, r1 - vmov.32 $b, r12, r3 + vmov $a, r2, r1 + vmov $b, r12, r3 vmov.i64 $k48, #0x0000ffffffffffff vmov.i64 $k32, #0x00000000ffffffff vmov.i64 $k16, #0x000000000000ffff @@ -267,7 +310,7 @@ $code.=<<___; #if __ARM_MAX_ARCH__>=7 .align 5 .LOPENSSL_armcap: -.word OPENSSL_armcap_P-(.Lpic+8) +.word OPENSSL_armcap_P-. #endif .asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" .align 5 diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl index 1d330e9f8aa3..6bedc62ba62d 100755 --- a/crypto/bn/asm/armv4-mont.pl +++ b/crypto/bn/asm/armv4-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -16,7 +23,7 @@ # [depending on key length, less for longer keys] on ARM920T, and # +115-80% on Intel IXP425. 
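The mul_1x1_ialu hunks above wrap the conditional eorne fix-ups in Thumb-2 it-blocks; the routine itself computes a 32x32->64 carry-less (GF(2)[x]) product from an on-stack table tab[8] indexed by windows of b, with the masked-off top bits of a folded back in by those conditional XORs of shifted b. A sketch of the same windowed scheme in C, shortened to 2-bit windows where the assembly uses 3-bit windows and three fix-up bits (helper name illustrative):

    #include <stdint.h>

    static uint64_t gf2_mul_1x1(uint32_t a, uint32_t b)
    {
        uint32_t a0 = a & 0x3fffffff;   /* mask bits 30,31 for now */
        uint64_t tab[4], r = 0;
        int i;

        tab[0] = 0;                     /* a0 times 0, 1, x, x+1 */
        tab[1] = a0;
        tab[2] = (uint64_t)a0 << 1;
        tab[3] = tab[1] ^ tab[2];

        for (i = 32; (i -= 2) >= 0;)    /* scan b two bits at a time */
            r = (r << 2) ^ tab[(b >> i) & 3];

        /* fix-ups for the masked-off top bits of a (the eorne pairs) */
        if (a & (1u << 30)) r ^= (uint64_t)b << 30;
        if (a & (1u << 31)) r ^= (uint64_t)b << 31;
        return r;
    }

bn_GF2m_mul_2x2 then assembles the 64x64 product from such 1x1 products Karatsuba-fashion (the a1·b1 call is visible above).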
This is compared to pre-bn_mul_mont code # base and compiler generated code with in-lined umull and even umlal -# instructions. The latter means that this code didn't really have an +# instructions. The latter means that this code didn't really have an # "advantage" of utilizing some "secret" instruction. # # The code is interoperable with Thumb ISA and is rather compact, less @@ -38,8 +45,29 @@ # for execution on all NEON-capable processors, because gain on # others outweighs the marginal loss on Cortex-A9. -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +# September 2015 +# +# Align Cortex-A9 performance with November 2013 improvements, i.e. +# NEON code is now ~20-105% faster than integer-only one on this +# processor. But this optimization further improved performance even +# on other processors: NEON code path is ~45-180% faster than original +# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on +# Snapdragon S4. + +$flavour = shift; +if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } +else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} $num="r0"; # starts as num argument, but holds &tp[num-1] $ap="r1"; @@ -70,12 +98,17 @@ $code=<<___; #include "arm_arch.h" .text +#if defined(__thumb2__) +.syntax unified +.thumb +#else .code 32 +#endif #if __ARM_MAX_ARCH__>=7 .align 5 .LOPENSSL_armcap: -.word OPENSSL_armcap_P-bn_mul_mont +.word OPENSSL_armcap_P-.Lbn_mul_mont #endif .global bn_mul_mont @@ -83,15 +116,19 @@ $code=<<___; .align 5 bn_mul_mont: +.Lbn_mul_mont: ldr ip,[sp,#4] @ load num stmdb sp!,{r0,r2} @ sp points at argument block #if __ARM_MAX_ARCH__>=7 tst ip,#7 bne .Lialu - adr r0,bn_mul_mont + adr r0,.Lbn_mul_mont ldr r2,.LOPENSSL_armcap ldr r0,[r0,r2] - tst r0,#1 @ NEON available? +#ifdef __APPLE__ + ldr r0,[r0] +#endif + tst r0,#ARMV7_NEON @ NEON available? ldmia sp, {r0,r2} beq .Lialu add sp,sp,#8 @@ -101,6 +138,9 @@ bn_mul_mont: #endif cmp ip,#2 mov $num,ip @ load num +#ifdef __thumb2__ + ittt lt +#endif movlt r0,#0 addlt sp,sp,#2*4 blt .Labrt @@ -148,10 +188,11 @@ bn_mul_mont: ldr $n0,[$_n0] @ restore n0 adc $nhi,$nhi,#0 str $nlo,[$num] @ tp[num-1]= + mov $tj,sp str $nhi,[$num,#4] @ tp[num]= .Louter: - sub $tj,$num,sp @ "original" $num-1 value + sub $tj,$num,$tj @ "original" $num-1 value sub $ap,$ap,$tj @ "rewind" ap to &ap[1] ldr $bi,[$tp,#4]! 
@ *(++bp) sub $np,$np,$tj @ "rewind" np to &np[1] @@ -196,11 +237,16 @@ bn_mul_mont: str $nhi,[$num,#4] @ tp[num]= cmp $tp,$tj +#ifdef __thumb2__ + itt ne +#endif + movne $tj,sp bne .Louter ldr $rp,[$_rp] @ pull rp + mov $aj,sp add $num,$num,#4 @ $num to point at &tp[num] - sub $aj,$num,sp @ "original" num value + sub $aj,$num,$aj @ "original" num value mov $tp,sp @ "rewind" $tp mov $ap,$tp @ "borrow" $ap sub $np,$np,$aj @ "rewind" $np to &np[0] @@ -216,17 +262,19 @@ bn_mul_mont: mov $tp,sp @ "rewind" $tp sub $rp,$rp,$aj @ "rewind" $rp - and $ap,$tp,$nhi - bic $np,$rp,$nhi - orr $ap,$ap,$np @ ap=borrow?tp:rp - -.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh +.Lcopy: ldr $tj,[$tp] @ conditional copy + ldr $aj,[$rp] str sp,[$tp],#4 @ zap tp - str $tj,[$rp],#4 - cmp $tp,$num +#ifdef __thumb2__ + it cc +#endif + movcc $aj,$tj + str $aj,[$rp],#4 + teq $tp,$num @ preserve carry bne .Lcopy - add sp,$num,#4 @ skip over tp[num+1] + mov sp,$num + add sp,sp,#4 @ skip over tp[num+1] ldmia sp!,{r4-r12,lr} @ restore registers add sp,sp,#2*4 @ skip over {r0,r2} mov r0,#1 @@ -241,19 +289,16 @@ bn_mul_mont: .size bn_mul_mont,.-bn_mul_mont ___ { -sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } -sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } - my ($A0,$A1,$A2,$A3)=map("d$_",(0..3)); my ($N0,$N1,$N2,$N3)=map("d$_",(4..7)); my ($Z,$Temp)=("q4","q5"); -my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13)); +my @ACC=map("q$_",(6..13)); my ($Bi,$Ni,$M0)=map("d$_",(28..31)); -my $zero=&Dlo($Z); -my $temp=&Dlo($Temp); +my $zero="$Z#lo"; +my $temp="$Temp#lo"; my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5)); -my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9)); +my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11)); $code.=<<___; #if __ARM_MAX_ARCH__>=7 @@ -267,60 +312,60 @@ bn_mul8x_mont_neon: stmdb sp!,{r4-r11} vstmdb sp!,{d8-d15} @ ABI specification says so ldmia ip,{r4-r5} @ load rest of parameter block + mov ip,sp + + cmp $num,#8 + bhi .LNEON_8n + + @ special case for $num==8, everything is in register bank... - sub $toutptr,sp,#16 vld1.32 {${Bi}[0]}, [$bptr,:32]! - sub $toutptr,$toutptr,$num,lsl#4 + veor $zero,$zero,$zero + sub $toutptr,sp,$num,lsl#4 vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-( and $toutptr,$toutptr,#-64 vld1.32 {${M0}[0]}, [$n0,:32] mov sp,$toutptr @ alloca - veor $zero,$zero,$zero - subs $inner,$num,#8 vzip.16 $Bi,$zero - vmull.u32 $A0xB,$Bi,${A0}[0] - vmull.u32 $A1xB,$Bi,${A0}[1] - vmull.u32 $A2xB,$Bi,${A1}[0] - vshl.i64 $temp,`&Dhi("$A0xB")`,#16 - vmull.u32 $A3xB,$Bi,${A1}[1] + vmull.u32 @ACC[0],$Bi,${A0}[0] + vmull.u32 @ACC[1],$Bi,${A0}[1] + vmull.u32 @ACC[2],$Bi,${A1}[0] + vshl.i64 $Ni,@ACC[0]#hi,#16 + vmull.u32 @ACC[3],$Bi,${A1}[1] - vadd.u64 $temp,$temp,`&Dlo("$A0xB")` + vadd.u64 $Ni,$Ni,@ACC[0]#lo veor $zero,$zero,$zero - vmul.u32 $Ni,$temp,$M0 + vmul.u32 $Ni,$Ni,$M0 - vmull.u32 $A4xB,$Bi,${A2}[0] + vmull.u32 @ACC[4],$Bi,${A2}[0] vld1.32 {$N0-$N3}, [$nptr]! - vmull.u32 $A5xB,$Bi,${A2}[1] - vmull.u32 $A6xB,$Bi,${A3}[0] + vmull.u32 @ACC[5],$Bi,${A2}[1] + vmull.u32 @ACC[6],$Bi,${A3}[0] vzip.16 $Ni,$zero - vmull.u32 $A7xB,$Bi,${A3}[1] - - bne .LNEON_1st - - @ special case for num=8, everything is in register bank... 
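The .Lcopy rewrite in the hunk above is the noteworthy functional change: instead of selecting a source pointer with and/bic/orr (ap = borrow ? tp : rp) and copying from it, the new loop loads both candidate words and selects with movcc, so no data-dependent address is formed, and it zaps the tp[] scratch in the same pass (teq is used for the loop test because, unlike cmp, it preserves the borrow). For reference, the bn_mul_mont contract that this file and the ARMv8 code below implement, as a word-by-word (CIOS) sketch in C with that same branch-free ending; 64-bit limbs and unsigned __int128 assumed, not OpenSSL's actual C path:

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* r = a*b*R^-1 mod n, R = 2^(64*num), n0 = -n[0]^-1 mod 2^64.
     * t[] is num+2 limbs of scratch. */
    static void mont_mul(uint64_t *r, const uint64_t *a, const uint64_t *b,
                         const uint64_t *n, uint64_t n0, int num, uint64_t *t)
    {
        int i, j;

        for (i = 0; i < num + 2; i++) t[i] = 0;

        for (i = 0; i < num; i++) {
            u128 c = 0;
            for (j = 0; j < num; j++) {     /* t += a * b[i] */
                c += (u128)a[j] * b[i] + t[j];
                t[j] = (uint64_t)c;
                c >>= 64;
            }
            c += t[num];
            t[num]     = (uint64_t)c;
            t[num + 1] = (uint64_t)(c >> 64);

            uint64_t m = t[0] * n0;         /* reduction multiplier */
            c  = (u128)m * n[0] + t[0];     /* low limb cancels to 0 */
            c >>= 64;
            for (j = 1; j < num; j++) {     /* t = (t + m*n) / 2^64 */
                c += (u128)m * n[j] + t[j];
                t[j - 1] = (uint64_t)c;
                c >>= 64;
            }
            c += t[num];
            t[num - 1] = (uint64_t)c;
            t[num]     = t[num + 1] + (uint64_t)(c >> 64);
            t[num + 1] = 0;
        }

        /* subtract n, then select between t and t-n without branching
         * and wipe the scratch, as the new .Lcopy loop does */
        uint64_t borrow = 0;
        for (j = 0; j < num; j++) {
            uint64_t d  = t[j] - n[j];
            uint64_t b1 = t[j] < n[j];
            r[j]   = d - borrow;
            borrow = b1 | (d < borrow);
        }
        uint64_t mask = 0 - (uint64_t)(borrow > t[num]);  /* keep t? */
        for (j = 0; j < num; j++) {
            r[j] = (t[j] & mask) | (r[j] & ~mask);
            t[j] = 0;                       /* "zap tp" */
        }
    }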
+ vmull.u32 @ACC[7],$Bi,${A3}[1] - vmlal.u32 $A0xB,$Ni,${N0}[0] + vmlal.u32 @ACC[0],$Ni,${N0}[0] sub $outer,$num,#1 - vmlal.u32 $A1xB,$Ni,${N0}[1] - vmlal.u32 $A2xB,$Ni,${N1}[0] - vmlal.u32 $A3xB,$Ni,${N1}[1] - - vmlal.u32 $A4xB,$Ni,${N2}[0] - vmov $Temp,$A0xB - vmlal.u32 $A5xB,$Ni,${N2}[1] - vmov $A0xB,$A1xB - vmlal.u32 $A6xB,$Ni,${N3}[0] - vmov $A1xB,$A2xB - vmlal.u32 $A7xB,$Ni,${N3}[1] - vmov $A2xB,$A3xB - vmov $A3xB,$A4xB + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vmlal.u32 @ACC[2],$Ni,${N1}[0] + vmlal.u32 @ACC[3],$Ni,${N1}[1] + + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vmov $Temp,@ACC[0] + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vmov @ACC[0],@ACC[1] + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vmov @ACC[1],@ACC[2] + vmlal.u32 @ACC[7],$Ni,${N3}[1] + vmov @ACC[2],@ACC[3] + vmov @ACC[3],@ACC[4] vshr.u64 $temp,$temp,#16 - vmov $A4xB,$A5xB - vmov $A5xB,$A6xB - vadd.u64 $temp,$temp,`&Dhi("$Temp")` - vmov $A6xB,$A7xB - veor $A7xB,$A7xB + vmov @ACC[4],@ACC[5] + vmov @ACC[5],@ACC[6] + vadd.u64 $temp,$temp,$Temp#hi + vmov @ACC[6],@ACC[7] + veor @ACC[7],@ACC[7] vshr.u64 $temp,$temp,#16 b .LNEON_outer8 @@ -330,279 +375,302 @@ bn_mul8x_mont_neon: vld1.32 {${Bi}[0]}, [$bptr,:32]! veor $zero,$zero,$zero vzip.16 $Bi,$zero - vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp + vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp - vmlal.u32 $A0xB,$Bi,${A0}[0] - vmlal.u32 $A1xB,$Bi,${A0}[1] - vmlal.u32 $A2xB,$Bi,${A1}[0] - vshl.i64 $temp,`&Dhi("$A0xB")`,#16 - vmlal.u32 $A3xB,$Bi,${A1}[1] + vmlal.u32 @ACC[0],$Bi,${A0}[0] + vmlal.u32 @ACC[1],$Bi,${A0}[1] + vmlal.u32 @ACC[2],$Bi,${A1}[0] + vshl.i64 $Ni,@ACC[0]#hi,#16 + vmlal.u32 @ACC[3],$Bi,${A1}[1] - vadd.u64 $temp,$temp,`&Dlo("$A0xB")` + vadd.u64 $Ni,$Ni,@ACC[0]#lo veor $zero,$zero,$zero subs $outer,$outer,#1 - vmul.u32 $Ni,$temp,$M0 + vmul.u32 $Ni,$Ni,$M0 - vmlal.u32 $A4xB,$Bi,${A2}[0] - vmlal.u32 $A5xB,$Bi,${A2}[1] - vmlal.u32 $A6xB,$Bi,${A3}[0] + vmlal.u32 @ACC[4],$Bi,${A2}[0] + vmlal.u32 @ACC[5],$Bi,${A2}[1] + vmlal.u32 @ACC[6],$Bi,${A3}[0] vzip.16 $Ni,$zero - vmlal.u32 $A7xB,$Bi,${A3}[1] - - vmlal.u32 $A0xB,$Ni,${N0}[0] - vmlal.u32 $A1xB,$Ni,${N0}[1] - vmlal.u32 $A2xB,$Ni,${N1}[0] - vmlal.u32 $A3xB,$Ni,${N1}[1] - - vmlal.u32 $A4xB,$Ni,${N2}[0] - vmov $Temp,$A0xB - vmlal.u32 $A5xB,$Ni,${N2}[1] - vmov $A0xB,$A1xB - vmlal.u32 $A6xB,$Ni,${N3}[0] - vmov $A1xB,$A2xB - vmlal.u32 $A7xB,$Ni,${N3}[1] - vmov $A2xB,$A3xB - vmov $A3xB,$A4xB + vmlal.u32 @ACC[7],$Bi,${A3}[1] + + vmlal.u32 @ACC[0],$Ni,${N0}[0] + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vmlal.u32 @ACC[2],$Ni,${N1}[0] + vmlal.u32 @ACC[3],$Ni,${N1}[1] + + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vmov $Temp,@ACC[0] + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vmov @ACC[0],@ACC[1] + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vmov @ACC[1],@ACC[2] + vmlal.u32 @ACC[7],$Ni,${N3}[1] + vmov @ACC[2],@ACC[3] + vmov @ACC[3],@ACC[4] vshr.u64 $temp,$temp,#16 - vmov $A4xB,$A5xB - vmov $A5xB,$A6xB - vadd.u64 $temp,$temp,`&Dhi("$Temp")` - vmov $A6xB,$A7xB - veor $A7xB,$A7xB + vmov @ACC[4],@ACC[5] + vmov @ACC[5],@ACC[6] + vadd.u64 $temp,$temp,$Temp#hi + vmov @ACC[6],@ACC[7] + veor @ACC[7],@ACC[7] vshr.u64 $temp,$temp,#16 bne .LNEON_outer8 - vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp + vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp mov $toutptr,sp - vshr.u64 $temp,`&Dlo("$A0xB")`,#16 + vshr.u64 $temp,@ACC[0]#lo,#16 mov $inner,$num - vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp - add $tinptr,sp,#16 - vshr.u64 $temp,`&Dhi("$A0xB")`,#16 - vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")` + vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp + add $tinptr,sp,#96 + vshr.u64 $temp,@ACC[0]#hi,#16 + vzip.16 @ACC[0]#lo,@ACC[0]#hi - b 
.LNEON_tail2 + b .LNEON_tail_entry .align 4 -.LNEON_1st: - vmlal.u32 $A0xB,$Ni,${N0}[0] - vld1.32 {$A0-$A3}, [$aptr]! - vmlal.u32 $A1xB,$Ni,${N0}[1] +.LNEON_8n: + veor @ACC[0],@ACC[0],@ACC[0] + sub $toutptr,sp,#128 + veor @ACC[1],@ACC[1],@ACC[1] + sub $toutptr,$toutptr,$num,lsl#4 + veor @ACC[2],@ACC[2],@ACC[2] + and $toutptr,$toutptr,#-64 + veor @ACC[3],@ACC[3],@ACC[3] + mov sp,$toutptr @ alloca + veor @ACC[4],@ACC[4],@ACC[4] + add $toutptr,$toutptr,#256 + veor @ACC[5],@ACC[5],@ACC[5] + sub $inner,$num,#8 + veor @ACC[6],@ACC[6],@ACC[6] + veor @ACC[7],@ACC[7],@ACC[7] + +.LNEON_8n_init: + vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]! subs $inner,$inner,#8 - vmlal.u32 $A2xB,$Ni,${N1}[0] - vmlal.u32 $A3xB,$Ni,${N1}[1] - - vmlal.u32 $A4xB,$Ni,${N2}[0] - vld1.32 {$N0-$N1}, [$nptr]! - vmlal.u32 $A5xB,$Ni,${N2}[1] - vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! - vmlal.u32 $A6xB,$Ni,${N3}[0] - vmlal.u32 $A7xB,$Ni,${N3}[1] - vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! - - vmull.u32 $A0xB,$Bi,${A0}[0] - vld1.32 {$N2-$N3}, [$nptr]! - vmull.u32 $A1xB,$Bi,${A0}[1] - vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! - vmull.u32 $A2xB,$Bi,${A1}[0] - vmull.u32 $A3xB,$Bi,${A1}[1] - vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! - - vmull.u32 $A4xB,$Bi,${A2}[0] - vmull.u32 $A5xB,$Bi,${A2}[1] - vmull.u32 $A6xB,$Bi,${A3}[0] - vmull.u32 $A7xB,$Bi,${A3}[1] - - bne .LNEON_1st - - vmlal.u32 $A0xB,$Ni,${N0}[0] - add $tinptr,sp,#16 - vmlal.u32 $A1xB,$Ni,${N0}[1] - sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr - vmlal.u32 $A2xB,$Ni,${N1}[0] - vld1.64 {$Temp}, [sp,:128] - vmlal.u32 $A3xB,$Ni,${N1}[1] - sub $outer,$num,#1 - - vmlal.u32 $A4xB,$Ni,${N2}[0] - vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! - vmlal.u32 $A5xB,$Ni,${N2}[1] - vshr.u64 $temp,$temp,#16 - vld1.64 {$A0xB}, [$tinptr, :128]! - vmlal.u32 $A6xB,$Ni,${N3}[0] - vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! - vmlal.u32 $A7xB,$Ni,${N3}[1] - - vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! - vadd.u64 $temp,$temp,`&Dhi("$Temp")` - veor $Z,$Z,$Z - vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! - vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! - vst1.64 {$Z}, [$toutptr,:128] - vshr.u64 $temp,$temp,#16 - - b .LNEON_outer + vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]! + vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]! + vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]! + bne .LNEON_8n_init + + add $tinptr,sp,#256 + vld1.32 {$A0-$A3},[$aptr]! + add $bnptr,sp,#8 + vld1.32 {${M0}[0]},[$n0,:32] + mov $outer,$num + b .LNEON_8n_outer .align 4 -.LNEON_outer: - vld1.32 {${Bi}[0]}, [$bptr,:32]! - sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr - vld1.32 {$A0-$A3}, [$aptr]! +.LNEON_8n_outer: + vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++ veor $zero,$zero,$zero - mov $toutptr,sp vzip.16 $Bi,$zero + add $toutptr,sp,#128 + vld1.32 {$N0-$N3},[$nptr]! + + vmlal.u32 @ACC[0],$Bi,${A0}[0] + vmlal.u32 @ACC[1],$Bi,${A0}[1] + veor $zero,$zero,$zero + vmlal.u32 @ACC[2],$Bi,${A1}[0] + vshl.i64 $Ni,@ACC[0]#hi,#16 + vmlal.u32 @ACC[3],$Bi,${A1}[1] + vadd.u64 $Ni,$Ni,@ACC[0]#lo + vmlal.u32 @ACC[4],$Bi,${A2}[0] + vmul.u32 $Ni,$Ni,$M0 + vmlal.u32 @ACC[5],$Bi,${A2}[1] + vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0] + vmlal.u32 @ACC[6],$Bi,${A3}[0] + vzip.16 $Ni,$zero + vmlal.u32 @ACC[7],$Bi,${A3}[1] +___ +for ($i=0; $i<7;) { +$code.=<<___; + vld1.32 {${Bi}[0]},[$bptr,:32]! 
@ *b++ + vmlal.u32 @ACC[0],$Ni,${N0}[0] + veor $temp,$temp,$temp + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vzip.16 $Bi,$temp + vmlal.u32 @ACC[2],$Ni,${N1}[0] + vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 + vmlal.u32 @ACC[3],$Ni,${N1}[1] + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vmlal.u32 @ACC[7],$Ni,${N3}[1] + vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo + vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i] +___ + push(@ACC,shift(@ACC)); $i++; +$code.=<<___; + vmlal.u32 @ACC[0],$Bi,${A0}[0] + vld1.64 {@ACC[7]},[$tinptr,:128]! + vmlal.u32 @ACC[1],$Bi,${A0}[1] + veor $zero,$zero,$zero + vmlal.u32 @ACC[2],$Bi,${A1}[0] + vshl.i64 $Ni,@ACC[0]#hi,#16 + vmlal.u32 @ACC[3],$Bi,${A1}[1] + vadd.u64 $Ni,$Ni,@ACC[0]#lo + vmlal.u32 @ACC[4],$Bi,${A2}[0] + vmul.u32 $Ni,$Ni,$M0 + vmlal.u32 @ACC[5],$Bi,${A2}[1] + vst1.32 {$Bi},[$bnptr,:64]! @ put aside smashed b[8*i+$i] + vmlal.u32 @ACC[6],$Bi,${A3}[0] + vzip.16 $Ni,$zero + vmlal.u32 @ACC[7],$Bi,${A3}[1] +___ +} +$code.=<<___; + vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0] + vmlal.u32 @ACC[0],$Ni,${N0}[0] + vld1.32 {$A0-$A3},[$aptr]! + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vmlal.u32 @ACC[2],$Ni,${N1}[0] + vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 + vmlal.u32 @ACC[3],$Ni,${N1}[1] + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16 + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vmlal.u32 @ACC[7],$Ni,${N3}[1] + vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo + vst1.32 {$Ni},[$bnptr,:64] @ put aside smashed m[8*i+$i] + add $bnptr,sp,#8 @ rewind +___ + push(@ACC,shift(@ACC)); +$code.=<<___; sub $inner,$num,#8 - vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp - - vmlal.u32 $A0xB,$Bi,${A0}[0] - vld1.64 {$A3xB-$A4xB},[$tinptr,:256]! - vmlal.u32 $A1xB,$Bi,${A0}[1] - vmlal.u32 $A2xB,$Bi,${A1}[0] - vld1.64 {$A5xB-$A6xB},[$tinptr,:256]! - vmlal.u32 $A3xB,$Bi,${A1}[1] - - vshl.i64 $temp,`&Dhi("$A0xB")`,#16 - veor $zero,$zero,$zero - vadd.u64 $temp,$temp,`&Dlo("$A0xB")` - vld1.64 {$A7xB},[$tinptr,:128]! - vmul.u32 $Ni,$temp,$M0 - - vmlal.u32 $A4xB,$Bi,${A2}[0] - vld1.32 {$N0-$N3}, [$nptr]! - vmlal.u32 $A5xB,$Bi,${A2}[1] - vmlal.u32 $A6xB,$Bi,${A3}[0] - vzip.16 $Ni,$zero - vmlal.u32 $A7xB,$Bi,${A3}[1] - -.LNEON_inner: - vmlal.u32 $A0xB,$Ni,${N0}[0] - vld1.32 {$A0-$A3}, [$aptr]! - vmlal.u32 $A1xB,$Ni,${N0}[1] - subs $inner,$inner,#8 - vmlal.u32 $A2xB,$Ni,${N1}[0] - vmlal.u32 $A3xB,$Ni,${N1}[1] - vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! - - vmlal.u32 $A4xB,$Ni,${N2}[0] - vld1.64 {$A0xB}, [$tinptr, :128]! - vmlal.u32 $A5xB,$Ni,${N2}[1] - vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! - vmlal.u32 $A6xB,$Ni,${N3}[0] - vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! - vmlal.u32 $A7xB,$Ni,${N3}[1] - vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! - - vmlal.u32 $A0xB,$Bi,${A0}[0] - vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]! - vmlal.u32 $A1xB,$Bi,${A0}[1] - vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! - vmlal.u32 $A2xB,$Bi,${A1}[0] - vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]! - vmlal.u32 $A3xB,$Bi,${A1}[1] - vld1.32 {$N0-$N3}, [$nptr]! - - vmlal.u32 $A4xB,$Bi,${A2}[0] - vld1.64 {$A7xB}, [$tinptr, :128]! 
- vmlal.u32 $A5xB,$Bi,${A2}[1] - vmlal.u32 $A6xB,$Bi,${A3}[0] - vmlal.u32 $A7xB,$Bi,${A3}[1] - - bne .LNEON_inner - - vmlal.u32 $A0xB,$Ni,${N0}[0] - add $tinptr,sp,#16 - vmlal.u32 $A1xB,$Ni,${N0}[1] - sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr - vmlal.u32 $A2xB,$Ni,${N1}[0] - vld1.64 {$Temp}, [sp,:128] - vmlal.u32 $A3xB,$Ni,${N1}[1] - subs $outer,$outer,#1 + b .LNEON_8n_inner - vmlal.u32 $A4xB,$Ni,${N2}[0] - vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]! - vmlal.u32 $A5xB,$Ni,${N2}[1] - vld1.64 {$A0xB}, [$tinptr, :128]! - vshr.u64 $temp,$temp,#16 - vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]! - vmlal.u32 $A6xB,$Ni,${N3}[0] - vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! - vmlal.u32 $A7xB,$Ni,${N3}[1] - - vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]! - vadd.u64 $temp,$temp,`&Dhi("$Temp")` - vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]! - vshr.u64 $temp,$temp,#16 - - bne .LNEON_outer +.align 4 +.LNEON_8n_inner: + subs $inner,$inner,#8 + vmlal.u32 @ACC[0],$Bi,${A0}[0] + vld1.64 {@ACC[7]},[$tinptr,:128] + vmlal.u32 @ACC[1],$Bi,${A0}[1] + vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+0] + vmlal.u32 @ACC[2],$Bi,${A1}[0] + vld1.32 {$N0-$N3},[$nptr]! + vmlal.u32 @ACC[3],$Bi,${A1}[1] + it ne + addne $tinptr,$tinptr,#16 @ don't advance in last iteration + vmlal.u32 @ACC[4],$Bi,${A2}[0] + vmlal.u32 @ACC[5],$Bi,${A2}[1] + vmlal.u32 @ACC[6],$Bi,${A3}[0] + vmlal.u32 @ACC[7],$Bi,${A3}[1] +___ +for ($i=1; $i<8; $i++) { +$code.=<<___; + vld1.32 {$Bi},[$bnptr,:64]! @ pull smashed b[8*i+$i] + vmlal.u32 @ACC[0],$Ni,${N0}[0] + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vmlal.u32 @ACC[2],$Ni,${N1}[0] + vmlal.u32 @ACC[3],$Ni,${N1}[1] + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vmlal.u32 @ACC[7],$Ni,${N3}[1] + vst1.64 {@ACC[0]},[$toutptr,:128]! +___ + push(@ACC,shift(@ACC)); +$code.=<<___; + vmlal.u32 @ACC[0],$Bi,${A0}[0] + vld1.64 {@ACC[7]},[$tinptr,:128] + vmlal.u32 @ACC[1],$Bi,${A0}[1] + vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+$i] + vmlal.u32 @ACC[2],$Bi,${A1}[0] + it ne + addne $tinptr,$tinptr,#16 @ don't advance in last iteration + vmlal.u32 @ACC[3],$Bi,${A1}[1] + vmlal.u32 @ACC[4],$Bi,${A2}[0] + vmlal.u32 @ACC[5],$Bi,${A2}[1] + vmlal.u32 @ACC[6],$Bi,${A3}[0] + vmlal.u32 @ACC[7],$Bi,${A3}[1] +___ +} +$code.=<<___; + it eq + subeq $aptr,$aptr,$num,lsl#2 @ rewind + vmlal.u32 @ACC[0],$Ni,${N0}[0] + vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0] + vmlal.u32 @ACC[1],$Ni,${N0}[1] + vld1.32 {$A0-$A3},[$aptr]! + vmlal.u32 @ACC[2],$Ni,${N1}[0] + add $bnptr,sp,#8 @ rewind + vmlal.u32 @ACC[3],$Ni,${N1}[1] + vmlal.u32 @ACC[4],$Ni,${N2}[0] + vmlal.u32 @ACC[5],$Ni,${N2}[1] + vmlal.u32 @ACC[6],$Ni,${N3}[0] + vst1.64 {@ACC[0]},[$toutptr,:128]! + vmlal.u32 @ACC[7],$Ni,${N3}[1] + + bne .LNEON_8n_inner +___ + push(@ACC,shift(@ACC)); +$code.=<<___; + add $tinptr,sp,#128 + vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]! + veor q2,q2,q2 @ $N0-$N1 + vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]! + veor q3,q3,q3 @ $N2-$N3 + vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]! + vst1.64 {@ACC[6]},[$toutptr,:128] + + subs $outer,$outer,#8 + vld1.64 {@ACC[0]-@ACC[1]},[$tinptr,:256]! + vld1.64 {@ACC[2]-@ACC[3]},[$tinptr,:256]! + vld1.64 {@ACC[4]-@ACC[5]},[$tinptr,:256]! + vld1.64 {@ACC[6]-@ACC[7]},[$tinptr,:256]! + + itt ne + subne $nptr,$nptr,$num,lsl#2 @ rewind + bne .LNEON_8n_outer + + add $toutptr,sp,#128 + vst1.64 {q2-q3}, [sp,:256]! @ start wiping stack frame + vshr.u64 $temp,@ACC[0]#lo,#16 + vst1.64 {q2-q3},[sp,:256]! + vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp + vst1.64 {q2-q3}, [sp,:256]! 
+ vshr.u64 $temp,@ACC[0]#hi,#16 + vst1.64 {q2-q3}, [sp,:256]! + vzip.16 @ACC[0]#lo,@ACC[0]#hi - mov $toutptr,sp mov $inner,$num + b .LNEON_tail_entry +.align 4 .LNEON_tail: - vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp - vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]! - vshr.u64 $temp,`&Dlo("$A0xB")`,#16 - vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp - vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]! - vshr.u64 $temp,`&Dhi("$A0xB")`,#16 - vld1.64 {$A7xB}, [$tinptr, :128]! - vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")` - -.LNEON_tail2: - vadd.u64 `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp - vst1.32 {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]! - vshr.u64 $temp,`&Dlo("$A1xB")`,#16 - vadd.u64 `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp - vshr.u64 $temp,`&Dhi("$A1xB")`,#16 - vzip.16 `&Dlo("$A1xB")`,`&Dhi("$A1xB")` - - vadd.u64 `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp - vst1.32 {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]! - vshr.u64 $temp,`&Dlo("$A2xB")`,#16 - vadd.u64 `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp - vshr.u64 $temp,`&Dhi("$A2xB")`,#16 - vzip.16 `&Dlo("$A2xB")`,`&Dhi("$A2xB")` - - vadd.u64 `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp - vst1.32 {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]! - vshr.u64 $temp,`&Dlo("$A3xB")`,#16 - vadd.u64 `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp - vshr.u64 $temp,`&Dhi("$A3xB")`,#16 - vzip.16 `&Dlo("$A3xB")`,`&Dhi("$A3xB")` - - vadd.u64 `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp - vst1.32 {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]! - vshr.u64 $temp,`&Dlo("$A4xB")`,#16 - vadd.u64 `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp - vshr.u64 $temp,`&Dhi("$A4xB")`,#16 - vzip.16 `&Dlo("$A4xB")`,`&Dhi("$A4xB")` - - vadd.u64 `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp - vst1.32 {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]! - vshr.u64 $temp,`&Dlo("$A5xB")`,#16 - vadd.u64 `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp - vshr.u64 $temp,`&Dhi("$A5xB")`,#16 - vzip.16 `&Dlo("$A5xB")`,`&Dhi("$A5xB")` - - vadd.u64 `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp - vst1.32 {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]! - vshr.u64 $temp,`&Dlo("$A6xB")`,#16 - vadd.u64 `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp - vld1.64 {$A0xB}, [$tinptr, :128]! - vshr.u64 $temp,`&Dhi("$A6xB")`,#16 - vzip.16 `&Dlo("$A6xB")`,`&Dhi("$A6xB")` - - vadd.u64 `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp - vst1.32 {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]! - vshr.u64 $temp,`&Dlo("$A7xB")`,#16 - vadd.u64 `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp - vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]! - vshr.u64 $temp,`&Dhi("$A7xB")`,#16 - vzip.16 `&Dlo("$A7xB")`,`&Dhi("$A7xB")` + vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp + vshr.u64 $temp,@ACC[0]#lo,#16 + vld1.64 {@ACC[2]-@ACC[3]}, [$tinptr, :256]! + vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp + vld1.64 {@ACC[4]-@ACC[5]}, [$tinptr, :256]! + vshr.u64 $temp,@ACC[0]#hi,#16 + vld1.64 {@ACC[6]-@ACC[7]}, [$tinptr, :256]! + vzip.16 @ACC[0]#lo,@ACC[0]#hi + +.LNEON_tail_entry: +___ +for ($i=1; $i<8; $i++) { +$code.=<<___; + vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp + vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]! + vshr.u64 $temp,@ACC[1]#lo,#16 + vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp + vshr.u64 $temp,@ACC[1]#hi,#16 + vzip.16 @ACC[1]#lo,@ACC[1]#hi +___ + push(@ACC,shift(@ACC)); +} + push(@ACC,shift(@ACC)); +$code.=<<___; + vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]! subs $inner,$inner,#8 - vst1.32 {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]! - + vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]! 
bne .LNEON_tail vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit @@ -622,8 +690,9 @@ bn_mul8x_mont_neon: bne .LNEON_sub ldr r10, [$aptr] @ load top-most bit + mov r11,sp veor q0,q0,q0 - sub r11,$bptr,sp @ this is num*4 + sub r11,$bptr,r11 @ this is num*4 veor q1,q1,q1 mov $aptr,sp sub $rptr,$rptr,r11 @ rewind $rptr @@ -633,27 +702,33 @@ bn_mul8x_mont_neon: .LNEON_copy_n_zap: ldmia $aptr!, {r4-r7} ldmia $rptr, {r8-r11} + it cc movcc r8, r4 vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + itt cc movcc r9, r5 movcc r10,r6 vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + it cc movcc r11,r7 ldmia $aptr, {r4-r7} stmia $rptr!, {r8-r11} sub $aptr,$aptr,#16 ldmia $rptr, {r8-r11} + it cc movcc r8, r4 vst1.64 {q0-q1}, [$aptr,:256]! @ wipe + itt cc movcc r9, r5 movcc r10,r6 vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + it cc movcc r11,r7 teq $aptr,$bptr @ preserves carry stmia $rptr!, {r8-r11} bne .LNEON_copy_n_zap - sub sp,ip,#96 + mov sp,ip vldmia sp!,{d8-d15} ldmia sp!,{r4-r11} ret @ bx lr @@ -669,8 +744,14 @@ $code.=<<___; #endif ___ -$code =~ s/\`([^\`]*)\`/eval $1/gem; -$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 -$code =~ s/\bret\b/bx lr/gm; -print $code; +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge or + s/\bret\b/bx lr/g or + s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} + close STDOUT; diff --git a/crypto/bn/asm/armv8-mont.pl b/crypto/bn/asm/armv8-mont.pl new file mode 100755 index 000000000000..5d5af1b6be25 --- /dev/null +++ b/crypto/bn/asm/armv8-mont.pl @@ -0,0 +1,1510 @@ +#! /usr/bin/env perl +# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# March 2015 +# +# "Teaser" Montgomery multiplication module for ARMv8. Needs more +# work. While it does improve RSA sign performance by 20-30% (less for +# longer keys) on most processors, for some reason RSA2048 is not +# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication +# instruction issue rate is limited on processor in question, meaning +# that dedicated squaring procedure is a must. Well, actually all +# contemporary AArch64 processors seem to have limited multiplication +# issue rate, i.e. they can't issue multiplication every cycle, which +# explains moderate improvement coefficients in comparison to +# compiler-generated code. Recall that compiler is instructed to use +# umulh and therefore uses same amount of multiplication instructions +# to do the job. Assembly's edge is to minimize number of "collateral" +# instructions and of course instruction scheduling. +# +# April 2015 +# +# Squaring procedure that handles lengths divisible by 8 improves +# RSA/DSA performance by 25-40-60% depending on processor and key +# length. 
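A note on the NEON path whose tail finishes just above: vmlal.u32 gives no carry flag, so bn_mul8x_mont_neon works in a redundant radix-2^16 representation. vzip.16 $Bi,$zero splits each 32-bit word of b into two 16-bit digits sitting in 32-bit lanes, every vmull.u32/vmlal.u32 partial product is therefore at most 48 bits, and a 64-bit accumulator lane can absorb on the order of 2^16 of them before overflow. Carries are propagated only once, in the .LNEON_tail loop, 16 bits per step; roughly, in C (toy model, names hypothetical):

    #include <stddef.h>
    #include <stdint.h>

    /* acc[i]: 64-bit lane holding the redundant sum of all partial
     * products for radix-2^16 digit position i.  One pass normalizes it
     * to 16-bit digits; this is the vshr.u64/vadd.u64/vzip.16 sequence. */
    static uint64_t propagate(const uint64_t *acc, uint16_t *dig, size_t n)
    {
        uint64_t carry = 0;
        for (size_t i = 0; i < n; i++) {
            uint64_t v = acc[i] + carry;
            dig[i] = (uint16_t)v;       /* keep the low 16 bits */
            carry  = v >> 16;           /* defer the rest upward */
        }
        return carry;                   /* the "top-most bit" word the
                                           tail stores last */
    }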
Overall improvement coefficients are always positive in +# comparison to compiler-generated code. On Cortex-A57 improvement +# is still modest on longest key lengths, while others exhibit e.g. +# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster +# on Cortex-A57 and ~60-100% faster on others. + +$flavour = shift; +$output = shift; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +($lo0,$hi0,$aj,$m0,$alo,$ahi, + $lo1,$hi1,$nj,$m1,$nlo,$nhi, + $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24); + +# int bn_mul_mont( +$rp="x0"; # BN_ULONG *rp, +$ap="x1"; # const BN_ULONG *ap, +$bp="x2"; # const BN_ULONG *bp, +$np="x3"; # const BN_ULONG *np, +$n0="x4"; # const BN_ULONG *n0, +$num="x5"; # int num); + +$code.=<<___; +.text + +.globl bn_mul_mont +.type bn_mul_mont,%function +.align 5 +bn_mul_mont: + tst $num,#7 + b.eq __bn_sqr8x_mont + tst $num,#3 + b.eq __bn_mul4x_mont +.Lmul_mont: + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldr $m0,[$bp],#8 // bp[0] + sub $tp,sp,$num,lsl#3 + ldp $hi0,$aj,[$ap],#16 // ap[0..1] + lsl $num,$num,#3 + ldr $n0,[$n0] // *n0 + and $tp,$tp,#-16 // ABI says so + ldp $hi1,$nj,[$np],#16 // np[0..1] + + mul $lo0,$hi0,$m0 // ap[0]*bp[0] + sub $j,$num,#16 // j=num-2 + umulh $hi0,$hi0,$m0 + mul $alo,$aj,$m0 // ap[1]*bp[0] + umulh $ahi,$aj,$m0 + + mul $m1,$lo0,$n0 // "tp[0]"*n0 + mov sp,$tp // alloca + + // (*) mul $lo1,$hi1,$m1 // np[0]*m1 + umulh $hi1,$hi1,$m1 + mul $nlo,$nj,$m1 // np[1]*m1 + // (*) adds $lo1,$lo1,$lo0 // discarded + // (*) As for removal of first multiplication and addition + // instructions. The outcome of first addition is + // guaranteed to be zero, which leaves two computationally + // significant outcomes: it either carries or not. Then + // question is when does it carry? Is there alternative + // way to deduce it? If you follow operations, you can + // observe that condition for carry is quite simple: + // $lo0 being non-zero. So that carry can be calculated + // by adding -1 to $lo0. That's what next instruction does. 
+ subs xzr,$lo0,#1 // (*) + umulh $nhi,$nj,$m1 + adc $hi1,$hi1,xzr + cbz $j,.L1st_skip + +.L1st: + ldr $aj,[$ap],#8 + adds $lo0,$alo,$hi0 + sub $j,$j,#8 // j-- + adc $hi0,$ahi,xzr + + ldr $nj,[$np],#8 + adds $lo1,$nlo,$hi1 + mul $alo,$aj,$m0 // ap[j]*bp[0] + adc $hi1,$nhi,xzr + umulh $ahi,$aj,$m0 + + adds $lo1,$lo1,$lo0 + mul $nlo,$nj,$m1 // np[j]*m1 + adc $hi1,$hi1,xzr + umulh $nhi,$nj,$m1 + str $lo1,[$tp],#8 // tp[j-1] + cbnz $j,.L1st + +.L1st_skip: + adds $lo0,$alo,$hi0 + sub $ap,$ap,$num // rewind $ap + adc $hi0,$ahi,xzr + + adds $lo1,$nlo,$hi1 + sub $np,$np,$num // rewind $np + adc $hi1,$nhi,xzr + + adds $lo1,$lo1,$lo0 + sub $i,$num,#8 // i=num-1 + adcs $hi1,$hi1,$hi0 + + adc $ovf,xzr,xzr // upmost overflow bit + stp $lo1,$hi1,[$tp] + +.Louter: + ldr $m0,[$bp],#8 // bp[i] + ldp $hi0,$aj,[$ap],#16 + ldr $tj,[sp] // tp[0] + add $tp,sp,#8 + + mul $lo0,$hi0,$m0 // ap[0]*bp[i] + sub $j,$num,#16 // j=num-2 + umulh $hi0,$hi0,$m0 + ldp $hi1,$nj,[$np],#16 + mul $alo,$aj,$m0 // ap[1]*bp[i] + adds $lo0,$lo0,$tj + umulh $ahi,$aj,$m0 + adc $hi0,$hi0,xzr + + mul $m1,$lo0,$n0 + sub $i,$i,#8 // i-- + + // (*) mul $lo1,$hi1,$m1 // np[0]*m1 + umulh $hi1,$hi1,$m1 + mul $nlo,$nj,$m1 // np[1]*m1 + // (*) adds $lo1,$lo1,$lo0 + subs xzr,$lo0,#1 // (*) + umulh $nhi,$nj,$m1 + cbz $j,.Linner_skip + +.Linner: + ldr $aj,[$ap],#8 + adc $hi1,$hi1,xzr + ldr $tj,[$tp],#8 // tp[j] + adds $lo0,$alo,$hi0 + sub $j,$j,#8 // j-- + adc $hi0,$ahi,xzr + + adds $lo1,$nlo,$hi1 + ldr $nj,[$np],#8 + adc $hi1,$nhi,xzr + + mul $alo,$aj,$m0 // ap[j]*bp[i] + adds $lo0,$lo0,$tj + umulh $ahi,$aj,$m0 + adc $hi0,$hi0,xzr + + mul $nlo,$nj,$m1 // np[j]*m1 + adds $lo1,$lo1,$lo0 + umulh $nhi,$nj,$m1 + str $lo1,[$tp,#-16] // tp[j-1] + cbnz $j,.Linner + +.Linner_skip: + ldr $tj,[$tp],#8 // tp[j] + adc $hi1,$hi1,xzr + adds $lo0,$alo,$hi0 + sub $ap,$ap,$num // rewind $ap + adc $hi0,$ahi,xzr + + adds $lo1,$nlo,$hi1 + sub $np,$np,$num // rewind $np + adcs $hi1,$nhi,$ovf + adc $ovf,xzr,xzr + + adds $lo0,$lo0,$tj + adc $hi0,$hi0,xzr + + adds $lo1,$lo1,$lo0 + adcs $hi1,$hi1,$hi0 + adc $ovf,$ovf,xzr // upmost overflow bit + stp $lo1,$hi1,[$tp,#-16] + + cbnz $i,.Louter + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + ldr $tj,[sp] // tp[0] + add $tp,sp,#8 + ldr $nj,[$np],#8 // np[0] + subs $j,$num,#8 // j=num-1 and clear borrow + mov $ap,$rp +.Lsub: + sbcs $aj,$tj,$nj // tp[j]-np[j] + ldr $tj,[$tp],#8 + sub $j,$j,#8 // j-- + ldr $nj,[$np],#8 + str $aj,[$ap],#8 // rp[j]=tp[j]-np[j] + cbnz $j,.Lsub + + sbcs $aj,$tj,$nj + sbcs $ovf,$ovf,xzr // did it borrow? + str $aj,[$ap],#8 // rp[num-1] + + ldr $tj,[sp] // tp[0] + add $tp,sp,#8 + ldr $aj,[$rp],#8 // rp[0] + sub $num,$num,#8 // num-- + nop +.Lcond_copy: + sub $num,$num,#8 // num-- + csel $nj,$tj,$aj,lo // did it borrow? + ldr $tj,[$tp],#8 + ldr $aj,[$rp],#8 + str xzr,[$tp,#-16] // wipe tp + str $nj,[$rp,#-16] + cbnz $num,.Lcond_copy + + csel $nj,$tj,$aj,lo + str xzr,[$tp,#-8] // wipe tp + str $nj,[$rp,#-8] + + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret +.size bn_mul_mont,.-bn_mul_mont +___ +{ +######################################################################## +# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module. 
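__bn_sqr8x_mont leans on the standard squaring identity: the "multiply everything but a[i]*a[i]" triangle below computes each cross product a[i]*a[j], i < j, exactly once; the .Lsqr4x_shift_n_add pass further down then doubles the whole partial result with a one-bit shift and folds in the diagonal squares. That almost halves the multiply count, which is where the quoted 25-60% RSA/DSA gains come from. A schoolbook sketch (unsigned __int128 assumed; the real code uses the blocked 8x layout, not this):

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* r[] receives 2n limbs. */
    static void sqr_words(uint64_t *r, const uint64_t *a, int n)
    {
        int i, j;
        uint64_t carry;
        u128 c;

        for (i = 0; i < 2 * n; i++) r[i] = 0;

        for (i = 0; i < n; i++) {          /* cross products, each once */
            carry = 0;
            for (j = i + 1; j < n; j++) {
                c = (u128)a[i] * a[j] + r[i + j] + carry;
                r[i + j] = (uint64_t)c;
                carry = (uint64_t)(c >> 64);
            }
            r[i + n] = carry;
        }

        carry = 0;                         /* double: shift left one bit */
        for (i = 0; i < 2 * n; i++) {
            uint64_t v = r[i];
            r[i] = (v << 1) | carry;
            carry = v >> 63;
        }

        c = 0;                             /* add the a[i]^2 diagonal */
        for (i = 0; i < n; i++) {
            c += (u128)a[i] * a[i] + r[2 * i];
            r[2 * i] = (uint64_t)c;
            c >>= 64;
            c += r[2 * i + 1];
            r[2 * i + 1] = (uint64_t)c;
            c >>= 64;
        }
    }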
+ +my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13)); +my ($t0,$t1,$t2,$t3)=map("x$_",(14..17)); +my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26)); +my ($cnt,$carry,$topmost)=("x27","x28","x30"); +my ($tp,$ap_end,$na0)=($bp,$np,$carry); + +$code.=<<___; +.type __bn_sqr8x_mont,%function +.align 5 +__bn_sqr8x_mont: + cmp $ap,$bp + b.ne __bn_mul4x_mont +.Lsqr8x_mont: + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp $rp,$np,[sp,#96] // offload rp and np + + ldp $a0,$a1,[$ap,#8*0] + ldp $a2,$a3,[$ap,#8*2] + ldp $a4,$a5,[$ap,#8*4] + ldp $a6,$a7,[$ap,#8*6] + + sub $tp,sp,$num,lsl#4 + lsl $num,$num,#3 + ldr $n0,[$n0] // *n0 + mov sp,$tp // alloca + sub $cnt,$num,#8*8 + b .Lsqr8x_zero_start + +.Lsqr8x_zero: + sub $cnt,$cnt,#8*8 + stp xzr,xzr,[$tp,#8*0] + stp xzr,xzr,[$tp,#8*2] + stp xzr,xzr,[$tp,#8*4] + stp xzr,xzr,[$tp,#8*6] +.Lsqr8x_zero_start: + stp xzr,xzr,[$tp,#8*8] + stp xzr,xzr,[$tp,#8*10] + stp xzr,xzr,[$tp,#8*12] + stp xzr,xzr,[$tp,#8*14] + add $tp,$tp,#8*16 + cbnz $cnt,.Lsqr8x_zero + + add $ap_end,$ap,$num + add $ap,$ap,#8*8 + mov $acc0,xzr + mov $acc1,xzr + mov $acc2,xzr + mov $acc3,xzr + mov $acc4,xzr + mov $acc5,xzr + mov $acc6,xzr + mov $acc7,xzr + mov $tp,sp + str $n0,[x29,#112] // offload n0 + + // Multiply everything but a[i]*a[i] +.align 4 +.Lsqr8x_outer_loop: + // a[1]a[0] (i) + // a[2]a[0] + // a[3]a[0] + // a[4]a[0] + // a[5]a[0] + // a[6]a[0] + // a[7]a[0] + // a[2]a[1] (ii) + // a[3]a[1] + // a[4]a[1] + // a[5]a[1] + // a[6]a[1] + // a[7]a[1] + // a[3]a[2] (iii) + // a[4]a[2] + // a[5]a[2] + // a[6]a[2] + // a[7]a[2] + // a[4]a[3] (iv) + // a[5]a[3] + // a[6]a[3] + // a[7]a[3] + // a[5]a[4] (v) + // a[6]a[4] + // a[7]a[4] + // a[6]a[5] (vi) + // a[7]a[5] + // a[7]a[6] (vii) + + mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i) + mul $t1,$a2,$a0 + mul $t2,$a3,$a0 + mul $t3,$a4,$a0 + adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0]) + mul $t0,$a5,$a0 + adcs $acc2,$acc2,$t1 + mul $t1,$a6,$a0 + adcs $acc3,$acc3,$t2 + mul $t2,$a7,$a0 + adcs $acc4,$acc4,$t3 + umulh $t3,$a1,$a0 // hi(a[1..7]*a[0]) + adcs $acc5,$acc5,$t0 + umulh $t0,$a2,$a0 + adcs $acc6,$acc6,$t1 + umulh $t1,$a3,$a0 + adcs $acc7,$acc7,$t2 + umulh $t2,$a4,$a0 + stp $acc0,$acc1,[$tp],#8*2 // t[0..1] + adc $acc0,xzr,xzr // t[8] + adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0]) + umulh $t3,$a5,$a0 + adcs $acc3,$acc3,$t0 + umulh $t0,$a6,$a0 + adcs $acc4,$acc4,$t1 + umulh $t1,$a7,$a0 + adcs $acc5,$acc5,$t2 + mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii) + adcs $acc6,$acc6,$t3 + mul $t3,$a3,$a1 + adcs $acc7,$acc7,$t0 + mul $t0,$a4,$a1 + adc $acc0,$acc0,$t1 + + mul $t1,$a5,$a1 + adds $acc3,$acc3,$t2 + mul $t2,$a6,$a1 + adcs $acc4,$acc4,$t3 + mul $t3,$a7,$a1 + adcs $acc5,$acc5,$t0 + umulh $t0,$a2,$a1 // hi(a[2..7]*a[1]) + adcs $acc6,$acc6,$t1 + umulh $t1,$a3,$a1 + adcs $acc7,$acc7,$t2 + umulh $t2,$a4,$a1 + adcs $acc0,$acc0,$t3 + umulh $t3,$a5,$a1 + stp $acc2,$acc3,[$tp],#8*2 // t[2..3] + adc $acc1,xzr,xzr // t[9] + adds $acc4,$acc4,$t0 + umulh $t0,$a6,$a1 + adcs $acc5,$acc5,$t1 + umulh $t1,$a7,$a1 + adcs $acc6,$acc6,$t2 + mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii) + adcs $acc7,$acc7,$t3 + mul $t3,$a4,$a2 + adcs $acc0,$acc0,$t0 + mul $t0,$a5,$a2 + adc $acc1,$acc1,$t1 + + mul $t1,$a6,$a2 + adds $acc5,$acc5,$t2 + mul $t2,$a7,$a2 + adcs $acc6,$acc6,$t3 + umulh $t3,$a3,$a2 // hi(a[3..7]*a[2]) + adcs $acc7,$acc7,$t0 + umulh $t0,$a4,$a2 + adcs $acc0,$acc0,$t1 + umulh $t1,$a5,$a2 + adcs $acc1,$acc1,$t2 + umulh 
$t2,$a6,$a2 + stp $acc4,$acc5,[$tp],#8*2 // t[4..5] + adc $acc2,xzr,xzr // t[10] + adds $acc6,$acc6,$t3 + umulh $t3,$a7,$a2 + adcs $acc7,$acc7,$t0 + mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv) + adcs $acc0,$acc0,$t1 + mul $t1,$a5,$a3 + adcs $acc1,$acc1,$t2 + mul $t2,$a6,$a3 + adc $acc2,$acc2,$t3 + + mul $t3,$a7,$a3 + adds $acc7,$acc7,$t0 + umulh $t0,$a4,$a3 // hi(a[4..7]*a[3]) + adcs $acc0,$acc0,$t1 + umulh $t1,$a5,$a3 + adcs $acc1,$acc1,$t2 + umulh $t2,$a6,$a3 + adcs $acc2,$acc2,$t3 + umulh $t3,$a7,$a3 + stp $acc6,$acc7,[$tp],#8*2 // t[6..7] + adc $acc3,xzr,xzr // t[11] + adds $acc0,$acc0,$t0 + mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v) + adcs $acc1,$acc1,$t1 + mul $t1,$a6,$a4 + adcs $acc2,$acc2,$t2 + mul $t2,$a7,$a4 + adc $acc3,$acc3,$t3 + + umulh $t3,$a5,$a4 // hi(a[5..7]*a[4]) + adds $acc1,$acc1,$t0 + umulh $t0,$a6,$a4 + adcs $acc2,$acc2,$t1 + umulh $t1,$a7,$a4 + adcs $acc3,$acc3,$t2 + mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi) + adc $acc4,xzr,xzr // t[12] + adds $acc2,$acc2,$t3 + mul $t3,$a7,$a5 + adcs $acc3,$acc3,$t0 + umulh $t0,$a6,$a5 // hi(a[6..7]*a[5]) + adc $acc4,$acc4,$t1 + + umulh $t1,$a7,$a5 + adds $acc3,$acc3,$t2 + mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii) + adcs $acc4,$acc4,$t3 + umulh $t3,$a7,$a6 // hi(a[7]*a[6]) + adc $acc5,xzr,xzr // t[13] + adds $acc4,$acc4,$t0 + sub $cnt,$ap_end,$ap // done yet? + adc $acc5,$acc5,$t1 + + adds $acc5,$acc5,$t2 + sub $t0,$ap_end,$num // rewinded ap + adc $acc6,xzr,xzr // t[14] + add $acc6,$acc6,$t3 + + cbz $cnt,.Lsqr8x_outer_break + + mov $n0,$a0 + ldp $a0,$a1,[$tp,#8*0] + ldp $a2,$a3,[$tp,#8*2] + ldp $a4,$a5,[$tp,#8*4] + ldp $a6,$a7,[$tp,#8*6] + adds $acc0,$acc0,$a0 + adcs $acc1,$acc1,$a1 + ldp $a0,$a1,[$ap,#8*0] + adcs $acc2,$acc2,$a2 + adcs $acc3,$acc3,$a3 + ldp $a2,$a3,[$ap,#8*2] + adcs $acc4,$acc4,$a4 + adcs $acc5,$acc5,$a5 + ldp $a4,$a5,[$ap,#8*4] + adcs $acc6,$acc6,$a6 + mov $rp,$ap + adcs $acc7,xzr,$a7 + ldp $a6,$a7,[$ap,#8*6] + add $ap,$ap,#8*8 + //adc $carry,xzr,xzr // moved below + mov $cnt,#-8*8 + + // a[8]a[0] + // a[9]a[0] + // a[a]a[0] + // a[b]a[0] + // a[c]a[0] + // a[d]a[0] + // a[e]a[0] + // a[f]a[0] + // a[8]a[1] + // a[f]a[1]........................ + // a[8]a[2] + // a[f]a[2]........................ + // a[8]a[3] + // a[f]a[3]........................ + // a[8]a[4] + // a[f]a[4]........................ + // a[8]a[5] + // a[f]a[5]........................ + // a[8]a[6] + // a[f]a[6]........................ + // a[8]a[7] + // a[f]a[7]........................ +.Lsqr8x_mul: + mul $t0,$a0,$n0 + adc $carry,xzr,xzr // carry bit, modulo-scheduled + mul $t1,$a1,$n0 + add $cnt,$cnt,#8 + mul $t2,$a2,$n0 + mul $t3,$a3,$n0 + adds $acc0,$acc0,$t0 + mul $t0,$a4,$n0 + adcs $acc1,$acc1,$t1 + mul $t1,$a5,$n0 + adcs $acc2,$acc2,$t2 + mul $t2,$a6,$n0 + adcs $acc3,$acc3,$t3 + mul $t3,$a7,$n0 + adcs $acc4,$acc4,$t0 + umulh $t0,$a0,$n0 + adcs $acc5,$acc5,$t1 + umulh $t1,$a1,$n0 + adcs $acc6,$acc6,$t2 + umulh $t2,$a2,$n0 + adcs $acc7,$acc7,$t3 + umulh $t3,$a3,$n0 + adc $carry,$carry,xzr + str $acc0,[$tp],#8 + adds $acc0,$acc1,$t0 + umulh $t0,$a4,$n0 + adcs $acc1,$acc2,$t1 + umulh $t1,$a5,$n0 + adcs $acc2,$acc3,$t2 + umulh $t2,$a6,$n0 + adcs $acc3,$acc4,$t3 + umulh $t3,$a7,$n0 + ldr $n0,[$rp,$cnt] + adcs $acc4,$acc5,$t0 + adcs $acc5,$acc6,$t1 + adcs $acc6,$acc7,$t2 + adcs $acc7,$carry,$t3 + //adc $carry,xzr,xzr // moved above + cbnz $cnt,.Lsqr8x_mul + // note that carry flag is guaranteed + // to be zero at this point + cmp $ap,$ap_end // done yet? 
+ b.eq .Lsqr8x_break + + ldp $a0,$a1,[$tp,#8*0] + ldp $a2,$a3,[$tp,#8*2] + ldp $a4,$a5,[$tp,#8*4] + ldp $a6,$a7,[$tp,#8*6] + adds $acc0,$acc0,$a0 + ldr $n0,[$rp,#-8*8] + adcs $acc1,$acc1,$a1 + ldp $a0,$a1,[$ap,#8*0] + adcs $acc2,$acc2,$a2 + adcs $acc3,$acc3,$a3 + ldp $a2,$a3,[$ap,#8*2] + adcs $acc4,$acc4,$a4 + adcs $acc5,$acc5,$a5 + ldp $a4,$a5,[$ap,#8*4] + adcs $acc6,$acc6,$a6 + mov $cnt,#-8*8 + adcs $acc7,$acc7,$a7 + ldp $a6,$a7,[$ap,#8*6] + add $ap,$ap,#8*8 + //adc $carry,xzr,xzr // moved above + b .Lsqr8x_mul + +.align 4 +.Lsqr8x_break: + ldp $a0,$a1,[$rp,#8*0] + add $ap,$rp,#8*8 + ldp $a2,$a3,[$rp,#8*2] + sub $t0,$ap_end,$ap // is it last iteration? + ldp $a4,$a5,[$rp,#8*4] + sub $t1,$tp,$t0 + ldp $a6,$a7,[$rp,#8*6] + cbz $t0,.Lsqr8x_outer_loop + + stp $acc0,$acc1,[$tp,#8*0] + ldp $acc0,$acc1,[$t1,#8*0] + stp $acc2,$acc3,[$tp,#8*2] + ldp $acc2,$acc3,[$t1,#8*2] + stp $acc4,$acc5,[$tp,#8*4] + ldp $acc4,$acc5,[$t1,#8*4] + stp $acc6,$acc7,[$tp,#8*6] + mov $tp,$t1 + ldp $acc6,$acc7,[$t1,#8*6] + b .Lsqr8x_outer_loop + +.align 4 +.Lsqr8x_outer_break: + // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] + ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0] + ldp $t1,$t2,[sp,#8*1] + ldp $a5,$a7,[$t0,#8*2] + add $ap,$t0,#8*4 + ldp $t3,$t0,[sp,#8*3] + + stp $acc0,$acc1,[$tp,#8*0] + mul $acc0,$a1,$a1 + stp $acc2,$acc3,[$tp,#8*2] + umulh $a1,$a1,$a1 + stp $acc4,$acc5,[$tp,#8*4] + mul $a2,$a3,$a3 + stp $acc6,$acc7,[$tp,#8*6] + mov $tp,sp + umulh $a3,$a3,$a3 + adds $acc1,$a1,$t1,lsl#1 + extr $t1,$t2,$t1,#63 + sub $cnt,$num,#8*4 + +.Lsqr4x_shift_n_add: + adcs $acc2,$a2,$t1 + extr $t2,$t3,$t2,#63 + sub $cnt,$cnt,#8*4 + adcs $acc3,$a3,$t2 + ldp $t1,$t2,[$tp,#8*5] + mul $a4,$a5,$a5 + ldp $a1,$a3,[$ap],#8*2 + umulh $a5,$a5,$a5 + mul $a6,$a7,$a7 + umulh $a7,$a7,$a7 + extr $t3,$t0,$t3,#63 + stp $acc0,$acc1,[$tp,#8*0] + adcs $acc4,$a4,$t3 + extr $t0,$t1,$t0,#63 + stp $acc2,$acc3,[$tp,#8*2] + adcs $acc5,$a5,$t0 + ldp $t3,$t0,[$tp,#8*7] + extr $t1,$t2,$t1,#63 + adcs $acc6,$a6,$t1 + extr $t2,$t3,$t2,#63 + adcs $acc7,$a7,$t2 + ldp $t1,$t2,[$tp,#8*9] + mul $a0,$a1,$a1 + ldp $a5,$a7,[$ap],#8*2 + umulh $a1,$a1,$a1 + mul $a2,$a3,$a3 + umulh $a3,$a3,$a3 + stp $acc4,$acc5,[$tp,#8*4] + extr $t3,$t0,$t3,#63 + stp $acc6,$acc7,[$tp,#8*6] + add $tp,$tp,#8*8 + adcs $acc0,$a0,$t3 + extr $t0,$t1,$t0,#63 + adcs $acc1,$a1,$t0 + ldp $t3,$t0,[$tp,#8*3] + extr $t1,$t2,$t1,#63 + cbnz $cnt,.Lsqr4x_shift_n_add +___ +my ($np,$np_end)=($ap,$ap_end); +$code.=<<___; + ldp $np,$n0,[x29,#104] // pull np and n0 + + adcs $acc2,$a2,$t1 + extr $t2,$t3,$t2,#63 + adcs $acc3,$a3,$t2 + ldp $t1,$t2,[$tp,#8*5] + mul $a4,$a5,$a5 + umulh $a5,$a5,$a5 + stp $acc0,$acc1,[$tp,#8*0] + mul $a6,$a7,$a7 + umulh $a7,$a7,$a7 + stp $acc2,$acc3,[$tp,#8*2] + extr $t3,$t0,$t3,#63 + adcs $acc4,$a4,$t3 + extr $t0,$t1,$t0,#63 + ldp $acc0,$acc1,[sp,#8*0] + adcs $acc5,$a5,$t0 + extr $t1,$t2,$t1,#63 + ldp $a0,$a1,[$np,#8*0] + adcs $acc6,$a6,$t1 + extr $t2,xzr,$t2,#63 + ldp $a2,$a3,[$np,#8*2] + adc $acc7,$a7,$t2 + ldp $a4,$a5,[$np,#8*4] + + // Reduce by 512 bits per iteration + mul $na0,$n0,$acc0 // t[0]*n0 + ldp $a6,$a7,[$np,#8*6] + add $np_end,$np,$num + ldp $acc2,$acc3,[sp,#8*2] + stp $acc4,$acc5,[$tp,#8*4] + ldp $acc4,$acc5,[sp,#8*4] + stp $acc6,$acc7,[$tp,#8*6] + ldp $acc6,$acc7,[sp,#8*6] + add $np,$np,#8*8 + mov $topmost,xzr // initial top-most carry + mov $tp,sp + mov $cnt,#8 + +.Lsqr8x_reduction: + // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0) + mul $t1,$a1,$na0 + sub $cnt,$cnt,#1 + mul $t2,$a2,$na0 + str $na0,[$tp],#8 // put aside t[0]*n0 
for tail processing + mul $t3,$a3,$na0 + // (*) adds xzr,$acc0,$t0 + subs xzr,$acc0,#1 // (*) + mul $t0,$a4,$na0 + adcs $acc0,$acc1,$t1 + mul $t1,$a5,$na0 + adcs $acc1,$acc2,$t2 + mul $t2,$a6,$na0 + adcs $acc2,$acc3,$t3 + mul $t3,$a7,$na0 + adcs $acc3,$acc4,$t0 + umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0) + adcs $acc4,$acc5,$t1 + umulh $t1,$a1,$na0 + adcs $acc5,$acc6,$t2 + umulh $t2,$a2,$na0 + adcs $acc6,$acc7,$t3 + umulh $t3,$a3,$na0 + adc $acc7,xzr,xzr + adds $acc0,$acc0,$t0 + umulh $t0,$a4,$na0 + adcs $acc1,$acc1,$t1 + umulh $t1,$a5,$na0 + adcs $acc2,$acc2,$t2 + umulh $t2,$a6,$na0 + adcs $acc3,$acc3,$t3 + umulh $t3,$a7,$na0 + mul $na0,$n0,$acc0 // next t[0]*n0 + adcs $acc4,$acc4,$t0 + adcs $acc5,$acc5,$t1 + adcs $acc6,$acc6,$t2 + adc $acc7,$acc7,$t3 + cbnz $cnt,.Lsqr8x_reduction + + ldp $t0,$t1,[$tp,#8*0] + ldp $t2,$t3,[$tp,#8*2] + mov $rp,$tp + sub $cnt,$np_end,$np // done yet? + adds $acc0,$acc0,$t0 + adcs $acc1,$acc1,$t1 + ldp $t0,$t1,[$tp,#8*4] + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + ldp $t2,$t3,[$tp,#8*6] + adcs $acc4,$acc4,$t0 + adcs $acc5,$acc5,$t1 + adcs $acc6,$acc6,$t2 + adcs $acc7,$acc7,$t3 + //adc $carry,xzr,xzr // moved below + cbz $cnt,.Lsqr8x8_post_condition + + ldr $n0,[$tp,#-8*8] + ldp $a0,$a1,[$np,#8*0] + ldp $a2,$a3,[$np,#8*2] + ldp $a4,$a5,[$np,#8*4] + mov $cnt,#-8*8 + ldp $a6,$a7,[$np,#8*6] + add $np,$np,#8*8 + +.Lsqr8x_tail: + mul $t0,$a0,$n0 + adc $carry,xzr,xzr // carry bit, modulo-scheduled + mul $t1,$a1,$n0 + add $cnt,$cnt,#8 + mul $t2,$a2,$n0 + mul $t3,$a3,$n0 + adds $acc0,$acc0,$t0 + mul $t0,$a4,$n0 + adcs $acc1,$acc1,$t1 + mul $t1,$a5,$n0 + adcs $acc2,$acc2,$t2 + mul $t2,$a6,$n0 + adcs $acc3,$acc3,$t3 + mul $t3,$a7,$n0 + adcs $acc4,$acc4,$t0 + umulh $t0,$a0,$n0 + adcs $acc5,$acc5,$t1 + umulh $t1,$a1,$n0 + adcs $acc6,$acc6,$t2 + umulh $t2,$a2,$n0 + adcs $acc7,$acc7,$t3 + umulh $t3,$a3,$n0 + adc $carry,$carry,xzr + str $acc0,[$tp],#8 + adds $acc0,$acc1,$t0 + umulh $t0,$a4,$n0 + adcs $acc1,$acc2,$t1 + umulh $t1,$a5,$n0 + adcs $acc2,$acc3,$t2 + umulh $t2,$a6,$n0 + adcs $acc3,$acc4,$t3 + umulh $t3,$a7,$n0 + ldr $n0,[$rp,$cnt] + adcs $acc4,$acc5,$t0 + adcs $acc5,$acc6,$t1 + adcs $acc6,$acc7,$t2 + adcs $acc7,$carry,$t3 + //adc $carry,xzr,xzr // moved above + cbnz $cnt,.Lsqr8x_tail + // note that carry flag is guaranteed + // to be zero at this point + ldp $a0,$a1,[$tp,#8*0] + sub $cnt,$np_end,$np // done yet? 
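The "(*)" lines in the reduction above (and in .Lmul_mont earlier) are the trick described in the long comment in .Lmul_mont: m is chosen so that the low limb of t0 + m*n[0] is guaranteed zero, so the skipped mul/adds pair has exactly one useful output, a carry equal to (t0 != 0), and subs xzr,t0,#1 produces that carry directly. Checking the claim in C (unsigned __int128 assumed, helper name illustrative):

    #include <assert.h>
    #include <stdint.h>

    /* n_0 is the low limb of the (odd) modulus, n0inv = -n_0^-1 mod 2^64,
     * m = t0*n0inv as in the reduction. */
    static uint64_t star_trick_carry(uint64_t t0, uint64_t n_0,
                                     uint64_t n0inv)
    {
        uint64_t m  = t0 * n0inv;
        uint64_t lo = (uint64_t)((unsigned __int128)m * n_0);

        assert((uint64_t)(t0 + lo) == 0);   /* the discarded sum */
        return t0 != 0;                     /* carry out of t0 + lo, i.e.
                                               what subs xzr,t0,#1 sets */
    }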
+ sub $t2,$np_end,$num // rewinded np + ldp $a2,$a3,[$tp,#8*2] + ldp $a4,$a5,[$tp,#8*4] + ldp $a6,$a7,[$tp,#8*6] + cbz $cnt,.Lsqr8x_tail_break + + ldr $n0,[$rp,#-8*8] + adds $acc0,$acc0,$a0 + adcs $acc1,$acc1,$a1 + ldp $a0,$a1,[$np,#8*0] + adcs $acc2,$acc2,$a2 + adcs $acc3,$acc3,$a3 + ldp $a2,$a3,[$np,#8*2] + adcs $acc4,$acc4,$a4 + adcs $acc5,$acc5,$a5 + ldp $a4,$a5,[$np,#8*4] + adcs $acc6,$acc6,$a6 + mov $cnt,#-8*8 + adcs $acc7,$acc7,$a7 + ldp $a6,$a7,[$np,#8*6] + add $np,$np,#8*8 + //adc $carry,xzr,xzr // moved above + b .Lsqr8x_tail + +.align 4 +.Lsqr8x_tail_break: + ldr $n0,[x29,#112] // pull n0 + add $cnt,$tp,#8*8 // end of current t[num] window + + subs xzr,$topmost,#1 // "move" top-most carry to carry bit + adcs $t0,$acc0,$a0 + adcs $t1,$acc1,$a1 + ldp $acc0,$acc1,[$rp,#8*0] + adcs $acc2,$acc2,$a2 + ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0] + adcs $acc3,$acc3,$a3 + ldp $a2,$a3,[$t2,#8*2] + adcs $acc4,$acc4,$a4 + adcs $acc5,$acc5,$a5 + ldp $a4,$a5,[$t2,#8*4] + adcs $acc6,$acc6,$a6 + adcs $acc7,$acc7,$a7 + ldp $a6,$a7,[$t2,#8*6] + add $np,$t2,#8*8 + adc $topmost,xzr,xzr // top-most carry + mul $na0,$n0,$acc0 + stp $t0,$t1,[$tp,#8*0] + stp $acc2,$acc3,[$tp,#8*2] + ldp $acc2,$acc3,[$rp,#8*2] + stp $acc4,$acc5,[$tp,#8*4] + ldp $acc4,$acc5,[$rp,#8*4] + cmp $cnt,x29 // did we hit the bottom? + stp $acc6,$acc7,[$tp,#8*6] + mov $tp,$rp // slide the window + ldp $acc6,$acc7,[$rp,#8*6] + mov $cnt,#8 + b.ne .Lsqr8x_reduction + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + ldr $rp,[x29,#96] // pull rp + add $tp,$tp,#8*8 + subs $t0,$acc0,$a0 + sbcs $t1,$acc1,$a1 + sub $cnt,$num,#8*8 + mov $ap_end,$rp // $rp copy + +.Lsqr8x_sub: + sbcs $t2,$acc2,$a2 + ldp $a0,$a1,[$np,#8*0] + sbcs $t3,$acc3,$a3 + stp $t0,$t1,[$rp,#8*0] + sbcs $t0,$acc4,$a4 + ldp $a2,$a3,[$np,#8*2] + sbcs $t1,$acc5,$a5 + stp $t2,$t3,[$rp,#8*2] + sbcs $t2,$acc6,$a6 + ldp $a4,$a5,[$np,#8*4] + sbcs $t3,$acc7,$a7 + ldp $a6,$a7,[$np,#8*6] + add $np,$np,#8*8 + ldp $acc0,$acc1,[$tp,#8*0] + sub $cnt,$cnt,#8*8 + ldp $acc2,$acc3,[$tp,#8*2] + ldp $acc4,$acc5,[$tp,#8*4] + ldp $acc6,$acc7,[$tp,#8*6] + add $tp,$tp,#8*8 + stp $t0,$t1,[$rp,#8*4] + sbcs $t0,$acc0,$a0 + stp $t2,$t3,[$rp,#8*6] + add $rp,$rp,#8*8 + sbcs $t1,$acc1,$a1 + cbnz $cnt,.Lsqr8x_sub + + sbcs $t2,$acc2,$a2 + mov $tp,sp + add $ap,sp,$num + ldp $a0,$a1,[$ap_end,#8*0] + sbcs $t3,$acc3,$a3 + stp $t0,$t1,[$rp,#8*0] + sbcs $t0,$acc4,$a4 + ldp $a2,$a3,[$ap_end,#8*2] + sbcs $t1,$acc5,$a5 + stp $t2,$t3,[$rp,#8*2] + sbcs $t2,$acc6,$a6 + ldp $acc0,$acc1,[$ap,#8*0] + sbcs $t3,$acc7,$a7 + ldp $acc2,$acc3,[$ap,#8*2] + sbcs xzr,$topmost,xzr // did it borrow? 
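The "Final step" comment above is the common ending of every routine in this diff: comparing the result against the modulus already requires a subtraction, so the code subtracts n unconditionally, folds the top-most carry limb in with the closing sbcs xzr,$topmost,xzr, and lets the surviving borrow drive the csel/movcc selection. The borrow bookkeeping in C (illustrative helper):

    #include <stddef.h>
    #include <stdint.h>

    /* r = t - n over num limbs; returns 1 when t (including its top-most
     * carry word) is smaller than n, i.e. when the original t must be
     * kept by the conditional copy. */
    static int sub_and_check(uint64_t *r, const uint64_t *t,
                             const uint64_t *n, size_t num, uint64_t topmost)
    {
        uint64_t borrow = 0;
        for (size_t i = 0; i < num; i++) {
            uint64_t d  = t[i] - n[i];      /* the sbcs chain */
            uint64_t b1 = t[i] < n[i];
            r[i]   = d - borrow;
            borrow = b1 | (d < borrow);
        }
        /* sbcs xzr,topmost,xzr: a borrow survives only if topmost == 0 */
        return borrow > topmost;
    }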
+ ldr x30,[x29,#8] // pull return address + stp $t0,$t1,[$rp,#8*4] + stp $t2,$t3,[$rp,#8*6] + + sub $cnt,$num,#8*4 +.Lsqr4x_cond_copy: + sub $cnt,$cnt,#8*4 + csel $t0,$acc0,$a0,lo + stp xzr,xzr,[$tp,#8*0] + csel $t1,$acc1,$a1,lo + ldp $a0,$a1,[$ap_end,#8*4] + ldp $acc0,$acc1,[$ap,#8*4] + csel $t2,$acc2,$a2,lo + stp xzr,xzr,[$tp,#8*2] + add $tp,$tp,#8*4 + csel $t3,$acc3,$a3,lo + ldp $a2,$a3,[$ap_end,#8*6] + ldp $acc2,$acc3,[$ap,#8*6] + add $ap,$ap,#8*4 + stp $t0,$t1,[$ap_end,#8*0] + stp $t2,$t3,[$ap_end,#8*2] + add $ap_end,$ap_end,#8*4 + stp xzr,xzr,[$ap,#8*0] + stp xzr,xzr,[$ap,#8*2] + cbnz $cnt,.Lsqr4x_cond_copy + + csel $t0,$acc0,$a0,lo + stp xzr,xzr,[$tp,#8*0] + csel $t1,$acc1,$a1,lo + stp xzr,xzr,[$tp,#8*2] + csel $t2,$acc2,$a2,lo + csel $t3,$acc3,$a3,lo + stp $t0,$t1,[$ap_end,#8*0] + stp $t2,$t3,[$ap_end,#8*2] + + b .Lsqr8x_done + +.align 4 +.Lsqr8x8_post_condition: + adc $carry,xzr,xzr + ldr x30,[x29,#8] // pull return address + // $acc0-7,$carry hold result, $a0-7 hold modulus + subs $a0,$acc0,$a0 + ldr $ap,[x29,#96] // pull rp + sbcs $a1,$acc1,$a1 + stp xzr,xzr,[sp,#8*0] + sbcs $a2,$acc2,$a2 + stp xzr,xzr,[sp,#8*2] + sbcs $a3,$acc3,$a3 + stp xzr,xzr,[sp,#8*4] + sbcs $a4,$acc4,$a4 + stp xzr,xzr,[sp,#8*6] + sbcs $a5,$acc5,$a5 + stp xzr,xzr,[sp,#8*8] + sbcs $a6,$acc6,$a6 + stp xzr,xzr,[sp,#8*10] + sbcs $a7,$acc7,$a7 + stp xzr,xzr,[sp,#8*12] + sbcs $carry,$carry,xzr // did it borrow? + stp xzr,xzr,[sp,#8*14] + + // $a0-7 hold result-modulus + csel $a0,$acc0,$a0,lo + csel $a1,$acc1,$a1,lo + csel $a2,$acc2,$a2,lo + csel $a3,$acc3,$a3,lo + stp $a0,$a1,[$ap,#8*0] + csel $a4,$acc4,$a4,lo + csel $a5,$acc5,$a5,lo + stp $a2,$a3,[$ap,#8*2] + csel $a6,$acc6,$a6,lo + csel $a7,$acc7,$a7,lo + stp $a4,$a5,[$ap,#8*4] + stp $a6,$a7,[$ap,#8*6] + +.Lsqr8x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + ret +.size __bn_sqr8x_mont,.-__bn_sqr8x_mont +___ +} + +{ +######################################################################## +# Even though this might look as ARMv8 adaptation of mulx4x_mont from +# x86_64-mont5 module, it's different in sense that it performs +# reduction 256 bits at a time. + +my ($a0,$a1,$a2,$a3, + $t0,$t1,$t2,$t3, + $m0,$m1,$m2,$m3, + $acc0,$acc1,$acc2,$acc3,$acc4, + $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28)); +my $bp_end=$rp; +my ($carry,$topmost) = ($rp,"x30"); + +$code.=<<___; +.type __bn_mul4x_mont,%function +.align 5 +__bn_mul4x_mont: + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + sub $tp,sp,$num,lsl#3 + lsl $num,$num,#3 + ldr $n0,[$n0] // *n0 + sub sp,$tp,#8*4 // alloca + + add $t0,$bp,$num + add $ap_end,$ap,$num + stp $rp,$t0,[x29,#96] // offload rp and &b[num] + + ldr $bi,[$bp,#8*0] // b[0] + ldp $a0,$a1,[$ap,#8*0] // a[0..3] + ldp $a2,$a3,[$ap,#8*2] + add $ap,$ap,#8*4 + mov $acc0,xzr + mov $acc1,xzr + mov $acc2,xzr + mov $acc3,xzr + ldp $m0,$m1,[$np,#8*0] // n[0..3] + ldp $m2,$m3,[$np,#8*2] + adds $np,$np,#8*4 // clear carry bit + mov $carry,xzr + mov $cnt,#0 + mov $tp,sp + +.Loop_mul4x_1st_reduction: + mul $t0,$a0,$bi // lo(a[0..3]*b[0]) + adc $carry,$carry,xzr // modulo-scheduled + mul $t1,$a1,$bi + add $cnt,$cnt,#8 + mul $t2,$a2,$bi + and $cnt,$cnt,#31 + mul $t3,$a3,$bi + adds $acc0,$acc0,$t0 + umulh $t0,$a0,$bi // hi(a[0..3]*b[0]) + adcs $acc1,$acc1,$t1 + mul $mi,$acc0,$n0 // t[0]*n0 + adcs $acc2,$acc2,$t2 + umulh $t1,$a1,$bi + adcs $acc3,$acc3,$t3 + umulh $t2,$a2,$bi + adc $acc4,xzr,xzr + umulh $t3,$a3,$bi + ldr $bi,[$bp,$cnt] // next b[i] (or b[0]) + adds $acc1,$acc1,$t0 + // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0) + str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing + adcs $acc2,$acc2,$t1 + mul $t1,$m1,$mi + adcs $acc3,$acc3,$t2 + mul $t2,$m2,$mi + adc $acc4,$acc4,$t3 // can't overflow + mul $t3,$m3,$mi + // (*) adds xzr,$acc0,$t0 + subs xzr,$acc0,#1 // (*) + umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0) + adcs $acc0,$acc1,$t1 + umulh $t1,$m1,$mi + adcs $acc1,$acc2,$t2 + umulh $t2,$m2,$mi + adcs $acc2,$acc3,$t3 + umulh $t3,$m3,$mi + adcs $acc3,$acc4,$carry + adc $carry,xzr,xzr + adds $acc0,$acc0,$t0 + sub $t0,$ap_end,$ap + adcs $acc1,$acc1,$t1 + adcs $acc2,$acc2,$t2 + adcs $acc3,$acc3,$t3 + //adc $carry,$carry,xzr + cbnz $cnt,.Loop_mul4x_1st_reduction + + cbz $t0,.Lmul4x4_post_condition + + ldp $a0,$a1,[$ap,#8*0] // a[4..7] + ldp $a2,$a3,[$ap,#8*2] + add $ap,$ap,#8*4 + ldr $mi,[sp] // a[0]*n0 + ldp $m0,$m1,[$np,#8*0] // n[4..7] + ldp $m2,$m3,[$np,#8*2] + add $np,$np,#8*4 + +.Loop_mul4x_1st_tail: + mul $t0,$a0,$bi // lo(a[4..7]*b[i]) + adc $carry,$carry,xzr // modulo-scheduled + mul $t1,$a1,$bi + add $cnt,$cnt,#8 + mul $t2,$a2,$bi + and $cnt,$cnt,#31 + mul $t3,$a3,$bi + adds $acc0,$acc0,$t0 + umulh $t0,$a0,$bi // hi(a[4..7]*b[i]) + adcs $acc1,$acc1,$t1 + umulh $t1,$a1,$bi + adcs $acc2,$acc2,$t2 + umulh $t2,$a2,$bi + adcs $acc3,$acc3,$t3 + umulh $t3,$a3,$bi + adc $acc4,xzr,xzr + ldr $bi,[$bp,$cnt] // next b[i] (or b[0]) + adds $acc1,$acc1,$t0 + mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0) + adcs $acc2,$acc2,$t1 + mul $t1,$m1,$mi + adcs $acc3,$acc3,$t2 + mul $t2,$m2,$mi + adc $acc4,$acc4,$t3 // can't overflow + mul $t3,$m3,$mi + adds $acc0,$acc0,$t0 + umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0) + adcs $acc1,$acc1,$t1 + umulh $t1,$m1,$mi + adcs $acc2,$acc2,$t2 + umulh $t2,$m2,$mi + adcs $acc3,$acc3,$t3 + adcs $acc4,$acc4,$carry + umulh $t3,$m3,$mi + adc $carry,xzr,xzr + ldr $mi,[sp,$cnt] // next t[0]*n0 + str $acc0,[$tp],#8 // result!!! + adds $acc0,$acc1,$t0 + sub $t0,$ap_end,$ap // done yet? + adcs $acc1,$acc2,$t1 + adcs $acc2,$acc3,$t2 + adcs $acc3,$acc4,$t3 + //adc $carry,$carry,xzr + cbnz $cnt,.Loop_mul4x_1st_tail + + sub $t1,$ap_end,$num // rewinded $ap + cbz $t0,.Lmul4x_proceed + + ldp $a0,$a1,[$ap,#8*0] + ldp $a2,$a3,[$ap,#8*2] + add $ap,$ap,#8*4 + ldp $m0,$m1,[$np,#8*0] + ldp $m2,$m3,[$np,#8*2] + add $np,$np,#8*4 + b .Loop_mul4x_1st_tail + +.align 5 +.Lmul4x_proceed: + ldr $bi,[$bp,#8*4]! 
// *++b
+	adc	$topmost,$carry,xzr
+	ldp	$a0,$a1,[$t1,#8*0]	// a[0..3]
+	sub	$np,$np,$num		// rewind np
+	ldp	$a2,$a3,[$t1,#8*2]
+	add	$ap,$t1,#8*4
+
+	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
+	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
+	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
+	ldp	$acc2,$acc3,[sp,#8*6]
+
+	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
+	mov	$tp,sp
+	ldp	$m2,$m3,[$np,#8*2]
+	adds	$np,$np,#8*4		// clear carry bit
+	mov	$carry,xzr
+
+.align	4
+.Loop_mul4x_reduction:
+	mul	$t0,$a0,$bi		// lo(a[0..3]*b[4])
+	adc	$carry,$carry,xzr	// modulo-scheduled
+	mul	$t1,$a1,$bi
+	add	$cnt,$cnt,#8
+	mul	$t2,$a2,$bi
+	and	$cnt,$cnt,#31
+	mul	$t3,$a3,$bi
+	adds	$acc0,$acc0,$t0
+	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[4])
+	adcs	$acc1,$acc1,$t1
+	mul	$mi,$acc0,$n0		// t[0]*n0
+	adcs	$acc2,$acc2,$t2
+	umulh	$t1,$a1,$bi
+	adcs	$acc3,$acc3,$t3
+	umulh	$t2,$a2,$bi
+	adc	$acc4,xzr,xzr
+	umulh	$t3,$a3,$bi
+	ldr	$bi,[$bp,$cnt]		// next b[i]
+	adds	$acc1,$acc1,$t0
+	// (*)	mul	$t0,$m0,$mi
+	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
+	adcs	$acc2,$acc2,$t1
+	mul	$t1,$m1,$mi		// lo(n[0..3]*t[0]*n0)
+	adcs	$acc3,$acc3,$t2
+	mul	$t2,$m2,$mi
+	adc	$acc4,$acc4,$t3		// can't overflow
+	mul	$t3,$m3,$mi
+	// (*)	adds	xzr,$acc0,$t0
+	subs	xzr,$acc0,#1		// (*)
+	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
+	adcs	$acc0,$acc1,$t1
+	umulh	$t1,$m1,$mi
+	adcs	$acc1,$acc2,$t2
+	umulh	$t2,$m2,$mi
+	adcs	$acc2,$acc3,$t3
+	umulh	$t3,$m3,$mi
+	adcs	$acc3,$acc4,$carry
+	adc	$carry,xzr,xzr
+	adds	$acc0,$acc0,$t0
+	adcs	$acc1,$acc1,$t1
+	adcs	$acc2,$acc2,$t2
+	adcs	$acc3,$acc3,$t3
+	//adc	$carry,$carry,xzr
+	cbnz	$cnt,.Loop_mul4x_reduction
+
+	adc	$carry,$carry,xzr
+	ldp	$t0,$t1,[$tp,#8*4]	// t[4..7]
+	ldp	$t2,$t3,[$tp,#8*6]
+	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
+	ldp	$a2,$a3,[$ap,#8*2]
+	add	$ap,$ap,#8*4
+	adds	$acc0,$acc0,$t0
+	adcs	$acc1,$acc1,$t1
+	adcs	$acc2,$acc2,$t2
+	adcs	$acc3,$acc3,$t3
+	//adc	$carry,$carry,xzr
+
+	ldr	$mi,[sp]		// t[0]*n0
+	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
+	ldp	$m2,$m3,[$np,#8*2]
+	add	$np,$np,#8*4
+
+.align	4
+.Loop_mul4x_tail:
+	mul	$t0,$a0,$bi		// lo(a[4..7]*b[4])
+	adc	$carry,$carry,xzr	// modulo-scheduled
+	mul	$t1,$a1,$bi
+	add	$cnt,$cnt,#8
+	mul	$t2,$a2,$bi
+	and	$cnt,$cnt,#31
+	mul	$t3,$a3,$bi
+	adds	$acc0,$acc0,$t0
+	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[4])
+	adcs	$acc1,$acc1,$t1
+	umulh	$t1,$a1,$bi
+	adcs	$acc2,$acc2,$t2
+	umulh	$t2,$a2,$bi
+	adcs	$acc3,$acc3,$t3
+	umulh	$t3,$a3,$bi
+	adc	$acc4,xzr,xzr
+	ldr	$bi,[$bp,$cnt]		// next b[i]
+	adds	$acc1,$acc1,$t0
+	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
+	adcs	$acc2,$acc2,$t1
+	mul	$t1,$m1,$mi
+	adcs	$acc3,$acc3,$t2
+	mul	$t2,$m2,$mi
+	adc	$acc4,$acc4,$t3		// can't overflow
+	mul	$t3,$m3,$mi
+	adds	$acc0,$acc0,$t0
+	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
+	adcs	$acc1,$acc1,$t1
+	umulh	$t1,$m1,$mi
+	adcs	$acc2,$acc2,$t2
+	umulh	$t2,$m2,$mi
+	adcs	$acc3,$acc3,$t3
+	umulh	$t3,$m3,$mi
+	adcs	$acc4,$acc4,$carry
+	ldr	$mi,[sp,$cnt]		// next a[0]*n0
+	adc	$carry,xzr,xzr
+	str	$acc0,[$tp],#8		// result!!!
+	adds	$acc0,$acc1,$t0
+	sub	$t0,$ap_end,$ap		// done yet?
+	adcs	$acc1,$acc2,$t1
+	adcs	$acc2,$acc3,$t2
+	adcs	$acc3,$acc4,$t3
+	//adc	$carry,$carry,xzr
+	cbnz	$cnt,.Loop_mul4x_tail
+
+	sub	$t1,$np,$num		// rewinded np?
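A note on the "(*)" lines in the reduction loops above: with m = acc0*n0 and the usual Montgomery setup n0 = -np[0]^(-1) mod 2^64, the low word of m*np[0] equals -acc0 mod 2^64, so acc0 + lo(m*np[0]) is always zero and its carry-out is 1 exactly when acc0 is nonzero. That is why the multiply and add can be dropped in favour of "subs xzr,$acc0,#1", which sets the AArch64 carry flag iff acc0 >= 1. A small illustrative check in C (assuming the n0*np0 == -1 mod 2^64 precondition holds):

    #include <assert.h>
    #include <stdint.h>

    static void star_trick_check(uint64_t acc0, uint64_t np0, uint64_t n0)
    {
        /* precondition: n0 * np0 == -1 (mod 2^64) */
        uint64_t m  = acc0 * n0;      /* the asm's "$mi"                     */
        uint64_t lo = m * np0;        /* lo(n[0]*t[0]*n0), never computed    */
        assert(lo + acc0 == 0);                   /* low word vanishes       */
        assert((lo + acc0 < lo) == (acc0 >= 1));  /* carry == (acc0 != 0)    */
    }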
+	adc	$carry,$carry,xzr
+	cbz	$t0,.Loop_mul4x_break
+
+	ldp	$t0,$t1,[$tp,#8*4]
+	ldp	$t2,$t3,[$tp,#8*6]
+	ldp	$a0,$a1,[$ap,#8*0]
+	ldp	$a2,$a3,[$ap,#8*2]
+	add	$ap,$ap,#8*4
+	adds	$acc0,$acc0,$t0
+	adcs	$acc1,$acc1,$t1
+	adcs	$acc2,$acc2,$t2
+	adcs	$acc3,$acc3,$t3
+	//adc	$carry,$carry,xzr
+	ldp	$m0,$m1,[$np,#8*0]
+	ldp	$m2,$m3,[$np,#8*2]
+	add	$np,$np,#8*4
+	b	.Loop_mul4x_tail
+
+.align	4
+.Loop_mul4x_break:
+	ldp	$t2,$t3,[x29,#96]	// pull rp and &b[num]
+	adds	$acc0,$acc0,$topmost
+	add	$bp,$bp,#8*4		// bp++
+	adcs	$acc1,$acc1,xzr
+	sub	$ap,$ap,$num		// rewind ap
+	adcs	$acc2,$acc2,xzr
+	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
+	adcs	$acc3,$acc3,xzr
+	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
+	adc	$topmost,$carry,xzr
+	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
+	cmp	$bp,$t3			// done yet?
+	ldp	$acc2,$acc3,[sp,#8*6]
+	ldp	$m0,$m1,[$t1,#8*0]	// n[0..3]
+	ldp	$m2,$m3,[$t1,#8*2]
+	add	$np,$t1,#8*4
+	b.eq	.Lmul4x_post
+
+	ldr	$bi,[$bp]
+	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
+	ldp	$a2,$a3,[$ap,#8*2]
+	adds	$ap,$ap,#8*4		// clear carry bit
+	mov	$carry,xzr
+	mov	$tp,sp
+	b	.Loop_mul4x_reduction
+
+.align	4
+.Lmul4x_post:
+	// Final step. We see if result is larger than modulus, and
+	// if it is, subtract the modulus. But comparison implies
+	// subtraction. So we subtract modulus, see if it borrowed,
+	// and conditionally copy original value.
+	mov	$rp,$t2
+	mov	$ap_end,$t2		// $rp copy
+	subs	$t0,$acc0,$m0
+	add	$tp,sp,#8*8
+	sbcs	$t1,$acc1,$m1
+	sub	$cnt,$num,#8*4
+
+.Lmul4x_sub:
+	sbcs	$t2,$acc2,$m2
+	ldp	$m0,$m1,[$np,#8*0]
+	sub	$cnt,$cnt,#8*4
+	ldp	$acc0,$acc1,[$tp,#8*0]
+	sbcs	$t3,$acc3,$m3
+	ldp	$m2,$m3,[$np,#8*2]
+	add	$np,$np,#8*4
+	ldp	$acc2,$acc3,[$tp,#8*2]
+	add	$tp,$tp,#8*4
+	stp	$t0,$t1,[$rp,#8*0]
+	sbcs	$t0,$acc0,$m0
+	stp	$t2,$t3,[$rp,#8*2]
+	add	$rp,$rp,#8*4
+	sbcs	$t1,$acc1,$m1
+	cbnz	$cnt,.Lmul4x_sub
+
+	sbcs	$t2,$acc2,$m2
+	mov	$tp,sp
+	add	$ap,sp,#8*4
+	ldp	$a0,$a1,[$ap_end,#8*0]
+	sbcs	$t3,$acc3,$m3
+	stp	$t0,$t1,[$rp,#8*0]
+	ldp	$a2,$a3,[$ap_end,#8*2]
+	stp	$t2,$t3,[$rp,#8*2]
+	ldp	$acc0,$acc1,[$ap,#8*0]
+	ldp	$acc2,$acc3,[$ap,#8*2]
+	sbcs	xzr,$topmost,xzr	// did it borrow?
+	ldr	x30,[x29,#8]		// pull return address
+
+	sub	$cnt,$num,#8*4
+.Lmul4x_cond_copy:
+	sub	$cnt,$cnt,#8*4
+	csel	$t0,$acc0,$a0,lo
+	stp	xzr,xzr,[$tp,#8*0]
+	csel	$t1,$acc1,$a1,lo
+	ldp	$a0,$a1,[$ap_end,#8*4]
+	ldp	$acc0,$acc1,[$ap,#8*4]
+	csel	$t2,$acc2,$a2,lo
+	stp	xzr,xzr,[$tp,#8*2]
+	add	$tp,$tp,#8*4
+	csel	$t3,$acc3,$a3,lo
+	ldp	$a2,$a3,[$ap_end,#8*6]
+	ldp	$acc2,$acc3,[$ap,#8*6]
+	add	$ap,$ap,#8*4
+	stp	$t0,$t1,[$ap_end,#8*0]
+	stp	$t2,$t3,[$ap_end,#8*2]
+	add	$ap_end,$ap_end,#8*4
+	cbnz	$cnt,.Lmul4x_cond_copy
+
+	csel	$t0,$acc0,$a0,lo
+	stp	xzr,xzr,[$tp,#8*0]
+	csel	$t1,$acc1,$a1,lo
+	stp	xzr,xzr,[$tp,#8*2]
+	csel	$t2,$acc2,$a2,lo
+	stp	xzr,xzr,[$tp,#8*3]
+	csel	$t3,$acc3,$a3,lo
+	stp	xzr,xzr,[$tp,#8*4]
+	stp	$t0,$t1,[$ap_end,#8*0]
+	stp	$t2,$t3,[$ap_end,#8*2]
+
+	b	.Lmul4x_done
+
+.align	4
+.Lmul4x4_post_condition:
+	adc	$carry,$carry,xzr
+	ldr	$ap,[x29,#96]		// pull rp
+	// $acc0-3,$carry hold result, $m0-3 hold modulus
+	subs	$a0,$acc0,$m0
+	ldr	x30,[x29,#8]		// pull return address
+	sbcs	$a1,$acc1,$m1
+	stp	xzr,xzr,[sp,#8*0]
+	sbcs	$a2,$acc2,$m2
+	stp	xzr,xzr,[sp,#8*2]
+	sbcs	$a3,$acc3,$m3
+	stp	xzr,xzr,[sp,#8*4]
+	sbcs	xzr,$carry,xzr		// did it borrow?
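The .Lmul4x_sub/.Lmul4x_cond_copy pattern above (and its sqr8x twin earlier) is the standard branch-free final step: subtract the modulus unconditionally, then let the final borrow decide which copy survives, exactly what the csel ...,lo instructions do. A hedged C model, where "top" stands for the extra result bit ($topmost) and all names are illustrative:

    #include <stdint.h>

    typedef unsigned __int128 u128;

    static void mont_final_sub_sketch(uint64_t *rp, const uint64_t *tp,
                                      uint64_t top, const uint64_t *np,
                                      int num)
    {
        uint64_t borrow = 0, keep;
        int j;

        for (j = 0; j < num; j++) {           /* rp = tp - np, track borrow */
            u128 d = (u128)tp[j] - np[j] - borrow;
            rp[j] = (uint64_t)d;
            borrow = (uint64_t)(d >> 64) & 1; /* 1 iff this limb borrowed  */
        }
        /* "sbcs xzr,$topmost,xzr": the difference is valid iff the top
         * bit absorbs the borrow, i.e. the result was >= the modulus.  */
        keep = (uint64_t)0 - (uint64_t)(top >= borrow);  /* all-ones or 0 */
        for (j = 0; j < num; j++)
            rp[j] = (rp[j] & keep) | (tp[j] & ~keep);
    }

Selecting through a mask (or csel) rather than branching keeps the routine constant-time with respect to whether the reduction was needed.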
+ stp xzr,xzr,[sp,#8*6] + + // $a0-3 hold result-modulus + csel $a0,$acc0,$a0,lo + csel $a1,$acc1,$a1,lo + csel $a2,$acc2,$a2,lo + csel $a3,$acc3,$a3,lo + stp $a0,$a1,[$ap,#8*0] + stp $a2,$a3,[$ap,#8*2] + +.Lmul4x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + ret +.size __bn_mul4x_mont,.-__bn_mul4x_mont +___ +} +$code.=<<___; +.asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align 4 +___ + +print $code; + +close STDOUT; diff --git a/crypto/bn/asm/bn-586.pl b/crypto/bn/asm/bn-586.pl index 332ef3e91d62..58effc8808dd 100644 --- a/crypto/bn/asm/bn-586.pl +++ b/crypto/bn/asm/bn-586.pl @@ -1,10 +1,20 @@ -#!/usr/local/bin/perl +#! /usr/bin/env perl +# Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; -&asm_init($ARGV[0],$0); +$output = pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0]); $sse2=0; for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } @@ -21,6 +31,8 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &asm_finish(); +close STDOUT; + sub bn_mul_add_words { local($name)=@_; @@ -42,7 +54,7 @@ sub bn_mul_add_words &movd("mm0",&wparam(3)); # mm0 = w &pxor("mm1","mm1"); # mm1 = carry_in &jmp(&label("maw_sse2_entry")); - + &set_label("maw_sse2_unrolled",16); &movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0] &paddq("mm1","mm3"); # mm1 = carry_in + r[0] @@ -663,20 +675,20 @@ sub bn_sub_part_words &adc($c,0); &mov(&DWP($i*4,$r,"",0),$tmp1); # *r } - + &comment(""); &add($b,32); &add($r,32); &sub($num,8); &jnz(&label("pw_neg_loop")); - + &set_label("pw_neg_finish",0); &mov($tmp2,&wparam(4)); # get dl &mov($num,0); &sub($num,$tmp2); &and($num,7); &jz(&label("pw_end")); - + for ($i=0; $i<7; $i++) { &comment("dl<0 Tail Round $i"); @@ -693,9 +705,9 @@ sub bn_sub_part_words } &jmp(&label("pw_end")); - + &set_label("pw_pos",0); - + &and($num,0xfffffff8); # num / 8 &jz(&label("pw_pos_finish")); @@ -710,18 +722,18 @@ sub bn_sub_part_words &mov(&DWP($i*4,$r,"",0),$tmp1); # *r &jnc(&label("pw_nc".$i)); } - + &comment(""); &add($a,32); &add($r,32); &sub($num,8); &jnz(&label("pw_pos_loop")); - + &set_label("pw_pos_finish",0); &mov($num,&wparam(4)); # get dl &and($num,7); &jz(&label("pw_end")); - + for ($i=0; $i<7; $i++) { &comment("dl>0 Tail Round $i"); @@ -742,17 +754,17 @@ sub bn_sub_part_words &mov(&DWP($i*4,$r,"",0),$tmp1); # *r &set_label("pw_nc".$i,0); } - + &comment(""); &add($a,32); &add($r,32); &sub($num,8); &jnz(&label("pw_nc_loop")); - + &mov($num,&wparam(4)); # get dl &and($num,7); &jz(&label("pw_nc_end")); - + for ($i=0; $i<7; $i++) { &mov($tmp1,&DWP($i*4,$a,"",0)); # *a @@ -771,4 +783,3 @@ sub bn_sub_part_words &function_end($name); } - diff --git a/crypto/bn/asm/bn-c64xplus.asm b/crypto/bn/asm/bn-c64xplus.asm new file mode 100644 index 000000000000..de6d37728fba --- /dev/null +++ b/crypto/bn/asm/bn-c64xplus.asm @@ -0,0 +1,382 @@ +;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +;; +;; Licensed under the OpenSSL license (the "License"). You may not use +;; this file except in compliance with the License. 
You can obtain a copy
+;; in the file LICENSE in the source distribution or at
+;; https://www.openssl.org/source/license.html
+;;
+;;====================================================================
+;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+;; project.
+;;
+;; Rights for redistribution and usage in source and binary forms are
+;; granted according to the OpenSSL license. Warranty of any kind is
+;; disclaimed.
+;;====================================================================
+;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
+;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
+;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
+;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
+;;====================================================================
+	.text
+
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.asg	bn_mul_add_words,_bn_mul_add_words
+	.asg	bn_mul_words,_bn_mul_words
+	.asg	bn_sqr_words,_bn_sqr_words
+	.asg	bn_add_words,_bn_add_words
+	.asg	bn_sub_words,_bn_sub_words
+	.asg	bn_div_words,_bn_div_words
+	.asg	bn_sqr_comba8,_bn_sqr_comba8
+	.asg	bn_mul_comba8,_bn_mul_comba8
+	.asg	bn_sqr_comba4,_bn_sqr_comba4
+	.asg	bn_mul_comba4,_bn_mul_comba4
+	.endif
+
+	.asg	B3,RA
+	.asg	A4,ARG0
+	.asg	B4,ARG1
+	.asg	A6,ARG2
+	.asg	B6,ARG3
+	.asg	A8,ARG4
+	.asg	B8,ARG5
+	.asg	A4,RET
+	.asg	A15,FP
+	.asg	B14,DP
+	.asg	B15,SP
+
+	.global	_bn_mul_add_words
+_bn_mul_add_words:
+	.asmfunc
+	MV	ARG2,B0
+  [!B0]	BNOP	RA
+||[!B0]	MVK	0,RET
+   [B0]	MVC	B0,ILC
+   [B0]	ZERO	A19		; high part of accumulator
+|| [B0]	MV	ARG0,A2
+|| [B0]	MV	ARG3,A3
+	NOP	3
+
+	SPLOOP	2		; 2*n+10
+;;====================================================================
+	LDW	*ARG1++,B7	; ap[i]
+	NOP	3
+	LDW	*ARG0++,A7	; rp[i]
+	MPY32U	B7,A3,A17:A16
+	NOP	3		; [2,0] in epilogue
+	ADDU	A16,A7,A21:A20
+	ADDU	A19,A21:A20,A19:A18
+||	MV.S	A17,A23
+	SPKERNEL 2,1		; leave slot for "return value"
+||	STW	A18,*A2++	; rp[i]
+||	ADD	A19,A23,A19
+;;====================================================================
+	BNOP	RA,4
+	MV	A19,RET		; return value
+	.endasmfunc
+
+	.global	_bn_mul_words
+_bn_mul_words:
+	.asmfunc
+	MV	ARG2,B0
+  [!B0]	BNOP	RA
+||[!B0]	MVK	0,RET
+   [B0]	MVC	B0,ILC
+   [B0]	ZERO	A19		; high part of accumulator
+	NOP	3
+
+	SPLOOP	2		; 2*n+10
+;;====================================================================
+	LDW	*ARG1++,A7	; ap[i]
+	NOP	4
+	MPY32U	A7,ARG3,A17:A16
+	NOP	4		; [2,0] in epilogue
+	ADDU	A19,A16,A19:A18
+||	MV.S	A17,A21
+	SPKERNEL 2,1		; leave slot for "return value"
+||	STW	A18,*ARG0++	; rp[i]
+||	ADD.L	A19,A21,A19
+;;====================================================================
+	BNOP	RA,4
+	MV	A19,RET		; return value
+	.endasmfunc
+
+	.global	_bn_sqr_words
+_bn_sqr_words:
+	.asmfunc
+	MV	ARG2,B0
+  [!B0]	BNOP	RA
+||[!B0]	MVK	0,RET
+   [B0]	MVC	B0,ILC
+   [B0]	MV	ARG0,B2
+|| [B0]	ADD	4,ARG0,ARG0
+	NOP	3
+
+	SPLOOP	2		; 2*n+10
+;;====================================================================
+	LDW	*ARG1++,B7	; ap[i]
+	NOP	4
+	MPY32U	B7,B7,B1:B0
+	NOP	3		; [2,0] in epilogue
+	STW	B0,*B2++(8)	; rp[2*i]
+	MV	B1,A1
+	SPKERNEL 2,0		; fully overlap BNOP RA,5
+||	STW	A1,*ARG0++(8)	; rp[2*i+1]
+;;====================================================================
+	BNOP	RA,5
+	.endasmfunc
+
+	.global	_bn_add_words
+_bn_add_words:
+	.asmfunc
+	MV	ARG3,B0
+  [!B0]	BNOP	RA
+||[!B0]	MVK	0,RET
+   [B0]	MVC	B0,ILC
+   [B0]	ZERO	A1		; carry flag
+|| [B0]	MV	ARG0,A3
+	NOP	3
+
+	SPLOOP	2		; 2*n+6
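For readers following these SPLOOPs, the reference semantics are those of crypto/bn/bn_asm.c. A C sketch of the two workhorses, assuming the 32-bit BN_ULONG a C64x+ build uses (MPY32U yields the full 64-bit product; the 40-bit ADDU results in A21:A20 etc. carry the ripple that this sketch models with a uint64_t):

    #include <stdint.h>

    /* rp[i] += ap[i]*w with carry propagation; returns the final carry */
    static uint32_t bn_mul_add_words_ref(uint32_t *rp, const uint32_t *ap,
                                         int num, uint32_t w)
    {
        uint32_t carry = 0;
        while (num-- > 0) {
            uint64_t t = (uint64_t)(*ap++) * w + *rp + carry;
            *rp++ = (uint32_t)t;
            carry  = (uint32_t)(t >> 32);
        }
        return carry;
    }

    /* rp = ap + bp; returns the final carry (the A1 "carry flag" above) */
    static uint32_t bn_add_words_ref(uint32_t *rp, const uint32_t *ap,
                                     const uint32_t *bp, int num)
    {
        uint32_t carry = 0;
        while (num-- > 0) {
            uint64_t t = (uint64_t)(*ap++) + *bp++ + carry;
            *rp++ = (uint32_t)t;
            carry  = (uint32_t)(t >> 32);
        }
        return carry;
    }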
+;;==================================================================== + LDW *ARG2++,A7 ; bp[i] +|| LDW *ARG1++,B7 ; ap[i] + NOP 4 + ADDU A7,B7,A9:A8 + ADDU A1,A9:A8,A1:A0 + SPKERNEL 0,0 ; fully overlap BNOP RA,5 +|| STW A0,*A3++ ; write result +|| MV A1,RET ; keep carry flag in RET +;;==================================================================== + BNOP RA,5 + .endasmfunc + + .global _bn_sub_words +_bn_sub_words: + .asmfunc + MV ARG3,B0 + [!B0] BNOP RA +||[!B0] MVK 0,RET + [B0] MVC B0,ILC + [B0] ZERO A2 ; borrow flag +|| [B0] MV ARG0,A3 + NOP 3 + + SPLOOP 2 ; 2*n+6 +;;==================================================================== + LDW *ARG2++,A7 ; bp[i] +|| LDW *ARG1++,B7 ; ap[i] + NOP 4 + SUBU B7,A7,A1:A0 + [A2] SUB A1:A0,1,A1:A0 + SPKERNEL 0,1 ; leave slot for "return borrow flag" +|| STW A0,*A3++ ; write result +|| AND 1,A1,A2 ; pass on borrow flag +;;==================================================================== + BNOP RA,4 + AND 1,A1,RET ; return borrow flag + .endasmfunc + + .global _bn_div_words +_bn_div_words: + .asmfunc + LMBD 1,A6,A0 ; leading zero bits in dv + LMBD 1,A4,A1 ; leading zero bits in hi +|| MVK 32,B0 + CMPLTU A1,A0,A2 +|| ADD A0,B0,B0 + [ A2] BNOP RA +||[ A2] MVK -1,A4 ; return overflow +||[!A2] MV A4,A3 ; reassign hi + [!A2] MV B4,A4 ; reassign lo, will be quotient +||[!A2] MVC B0,ILC + [!A2] SHL A6,A0,A6 ; normalize dv +|| MVK 1,A1 + + [!A2] CMPLTU A3,A6,A1 ; hi<dv? +||[!A2] SHL A4,1,A5:A4 ; lo<<1 + [!A1] SUB A3,A6,A3 ; hi-=dv +||[!A1] OR 1,A4,A4 + [!A2] SHRU A3,31,A1 ; upper bit +||[!A2] ADDAH A5,A3,A3 ; hi<<1|lo>>31 + + SPLOOP 3 + [!A1] CMPLTU A3,A6,A1 ; hi<dv? +||[ A1] ZERO A1 +|| SHL A4,1,A5:A4 ; lo<<1 + [!A1] SUB A3,A6,A3 ; hi-=dv +||[!A1] OR 1,A4,A4 ; quotient + SHRU A3,31,A1 ; upper bit +|| ADDAH A5,A3,A3 ; hi<<1|lo>>31 + SPKERNEL + + BNOP RA,5 + .endasmfunc + +;;==================================================================== +;; Not really Comba algorithm, just straightforward NxM... Dedicated +;; fully unrolled real Comba implementations are asymptotically 2x +;; faster, but naturally larger undertaking. Purpose of this exercise +;; was rather to learn to master nested SPLOOPs... +;;==================================================================== + .global _bn_sqr_comba8 + .global _bn_mul_comba8 +_bn_sqr_comba8: + MV ARG1,ARG2 +_bn_mul_comba8: + .asmfunc + MVK 8,B0 ; N, RILC +|| MVK 8,A0 ; M, outer loop counter +|| MV ARG1,A5 ; copy ap +|| MV ARG0,B4 ; copy rp +|| ZERO B19 ; high part of accumulator + MVC B0,RILC +|| SUB B0,2,B1 ; N-2, initial ILC +|| SUB B0,1,B2 ; const B2=N-1 +|| LDW *A5++,B6 ; ap[0] +|| MV A0,A3 ; const A3=M +sploopNxM?: ; for best performance arrange M<=N + [A0] SPLOOPD 2 ; 2*n+10 +|| MVC B1,ILC +|| ADDAW B4,B0,B5 +|| ZERO B7 +|| LDW *A5++,A9 ; pre-fetch ap[1] +|| ZERO A1 +|| SUB A0,1,A0 +;;==================================================================== +;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files. +;; This is because of Advisory 15 from TI publication SPRZ247I. + LDW *ARG2++,A7 ; bp[i] + NOP 3 + [A1] LDW *B5++,B7 ; rp[i] + MPY32U A7,B6,B17:B16 + NOP 3 + ADDU B16,B7,B21:B20 + ADDU B19,B21:B20,B19:B18 +|| MV.S B17,B23 + SPKERNEL +|| STW B18,*B4++ ; rp[i] +|| ADD.S B19,B23,B19 +;;==================================================================== +outer?: ; m*2*(n+1)+10 + SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0] + SPMASKR +|| CMPGT A0,1,A2 ; done pre-fetching ap[i+1]? 
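Stepping back to _bn_div_words above: the LMBD/SPLOOP sequence is a restoring shift-and-subtract divider, one quotient bit per iteration of (hi:lo)/dv with hi < dv on entry. The assembly normalizes dv and tracks an explicit "upper bit" flag; the sketch below tracks the shifted-out bit directly instead, and is illustrative only:

    #include <stdint.h>

    static uint32_t bn_div_words_sketch(uint32_t hi, uint32_t lo, uint32_t dv)
    {
        uint32_t q = 0;
        int i;

        if (hi >= dv)
            return 0xffffffff;          /* overflow: the asm returns -1 too */
        for (i = 0; i < 32; i++) {
            uint32_t msb = hi >> 31;    /* bit shifted out of hi            */
            hi = (hi << 1) | (lo >> 31);
            lo <<= 1;
            q <<= 1;
            if (msb | (hi >= dv)) {     /* remainder >= dv: subtract once   */
                hi -= dv;
                q |= 1;
            }
        }
        return q;
    }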
+	MVD	A9,B6		; move through .M unit(*)
+   [A2]	LDW	*A5++,A9	; pre-fetch ap[i+1]
+	SUBAW	B5,B2,B5	; rewind rp to rp[1]
+	MVK	1,A1
+   [A0]	BNOP.S1	outer?,4
+|| [A0]	SUB.L	A0,1,A0
+	STW	B19,*B4--[B2]	; rewind rp to rp[1]
+||	ZERO.S	B19		; high part of accumulator
+;; end of outer?
+	BNOP	RA,5		; return
+	.endasmfunc
+;; (*) It should be noted that B6 is used as input to MPY32U in
+;;     chronologically next cycle in *preceding* SPLOOP iteration.
+;;     Normally such arrangement would require DINT, but at this
+;;     point SPLOOP is draining and interrupts are disabled
+;;     implicitly.
+
+	.global	_bn_sqr_comba4
+	.global	_bn_mul_comba4
+_bn_sqr_comba4:
+	MV	ARG1,ARG2
+_bn_mul_comba4:
+	.asmfunc
+	.if	0
+	BNOP	sploopNxM?,3
+	;; Above mentioned m*2*(n+1)+10 does not apply in the n=m=4 case,
+	;; because of low-counter effect, when prologue phase finishes
+	;; before SPKERNEL instruction is reached. As a result it's 25%
+	;; slower than expected...
+	MVK	4,B0		; N, RILC
+||	MVK	4,A0		; M, outer loop counter
+||	MV	ARG1,A5		; copy ap
+||	MV	ARG0,B4		; copy rp
+||	ZERO	B19		; high part of accumulator
+	MVC	B0,RILC
+||	SUB	B0,2,B1		; first ILC
+||	SUB	B0,1,B2		; const B2=N-1
+||	LDW	*A5++,B6	; ap[0]
+||	MV	A0,A3		; const A3=M
+	.else
+	;; This alternative is an exercise in fully unrolled Comba
+	;; algorithm implementation that operates at n*(n+1)+12, or
+	;; as little as 32 cycles...
+	LDW	*ARG1[0],B16	; a[0]
+||	LDW	*ARG2[0],A16	; b[0]
+	LDW	*ARG1[1],B17	; a[1]
+||	LDW	*ARG2[1],A17	; b[1]
+	LDW	*ARG1[2],B18	; a[2]
+||	LDW	*ARG2[2],A18	; b[2]
+	LDW	*ARG1[3],B19	; a[3]
+||	LDW	*ARG2[3],A19	; b[3]
+	NOP
+	MPY32U	A16,B16,A1:A0	; a[0]*b[0]
+	MPY32U	A17,B16,A23:A22	; a[0]*b[1]
+	MPY32U	A16,B17,A25:A24	; a[1]*b[0]
+	MPY32U	A16,B18,A27:A26	; a[2]*b[0]
+	STW	A0,*ARG0[0]
+||	MPY32U	A17,B17,A29:A28	; a[1]*b[1]
+	MPY32U	A18,B16,A31:A30	; a[0]*b[2]
+||	ADDU	A22,A1,A1:A0
+	MV	A23,B0
+||	MPY32U	A19,B16,A21:A20	; a[3]*b[0]
+||	ADDU	A24,A1:A0,A1:A0
+	ADDU	A25,B0,B1:B0
+||	STW	A0,*ARG0[1]
+||	MPY32U	A18,B17,A23:A22	; a[2]*b[1]
+||	ADDU	A26,A1,A9:A8
+	ADDU	A27,B1,B9:B8
+||	MPY32U	A17,B18,A25:A24	; a[1]*b[2]
+||	ADDU	A28,A9:A8,A9:A8
+	ADDU	A29,B9:B8,B9:B8
+||	MPY32U	A16,B19,A27:A26	; a[0]*b[3]
+||	ADDU	A30,A9:A8,A9:A8
+	ADDU	A31,B9:B8,B9:B8
+||	ADDU	B0,A9:A8,A9:A8
+	STW	A8,*ARG0[2]
+||	ADDU	A20,A9,A1:A0
+	ADDU	A21,B9,B1:B0
+||	MPY32U	A19,B17,A21:A20	; a[3]*b[1]
+||	ADDU	A22,A1:A0,A1:A0
+	ADDU	A23,B1:B0,B1:B0
+||	MPY32U	A18,B18,A23:A22	; a[2]*b[2]
+||	ADDU	A24,A1:A0,A1:A0
+	ADDU	A25,B1:B0,B1:B0
+||	MPY32U	A17,B19,A25:A24	; a[1]*b[3]
+||	ADDU	A26,A1:A0,A1:A0
+	ADDU	A27,B1:B0,B1:B0
+||	ADDU	B8,A1:A0,A1:A0
+	STW	A0,*ARG0[3]
+||	MPY32U	A19,B18,A27:A26	; a[3]*b[2]
+||	ADDU	A20,A1,A9:A8
+	ADDU	A21,B1,B9:B8
+||	MPY32U	A18,B19,A29:A28	; a[2]*b[3]
+||	ADDU	A22,A9:A8,A9:A8
+	ADDU	A23,B9:B8,B9:B8
+||	MPY32U	A19,B19,A31:A30	; a[3]*b[3]
+||	ADDU	A24,A9:A8,A9:A8
+	ADDU	A25,B9:B8,B9:B8
+||	ADDU	B0,A9:A8,A9:A8
+	STW	A8,*ARG0[4]
+||	ADDU	A26,A9,A1:A0
+	ADDU	A27,B9,B1:B0
+||	ADDU	A28,A1:A0,A1:A0
+	ADDU	A29,B1:B0,B1:B0
+||	BNOP	RA
+||	ADDU	B8,A1:A0,A1:A0
+	STW	A0,*ARG0[5]
+||	ADDU	A30,A1,A9:A8
+	ADD	A31,B1,B8
+	ADDU	B0,A9:A8,A9:A8	; removed || to avoid cross-path stall below
+	ADD	B8,A9,A9
+||	STW	A8,*ARG0[6]
+	STW	A9,*ARG0[7]
+	.endif
+	.endasmfunc
diff --git a/crypto/bn/asm/c64xplus-gf2m.pl b/crypto/bn/asm/c64xplus-gf2m.pl
new file mode 100755
index 000000000000..9c46da3af8d1
--- /dev/null
+++ b/crypto/bn/asm/c64xplus-gf2m.pl
@@ -0,0 +1,160 @@
+#! /usr/bin/env perl
+# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# February 2012 +# +# The module implements bn_GF2m_mul_2x2 polynomial multiplication +# used in bn_gf2m.c. It's kind of low-hanging mechanical port from +# C for the time being... The subroutine runs in 37 cycles, which is +# 4.5x faster than compiler-generated code. Though comparison is +# totally unfair, because this module utilizes Galois Field Multiply +# instruction. + +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8"); # argument vector + +($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20)); +($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20)); +($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7"); +($A,$B)=($Alo,$B_1); +$xFF="B1"; + +sub mul_1x1_upper { +my ($A,$B)=@_; +$code.=<<___; + EXTU $B,8,24,$B_2 ; smash $B to 4 bytes +|| AND $B,$xFF,$B_0 +|| SHRU $B,24,$B_3 + SHRU $A,16, $Ahi ; smash $A to two halfwords +|| EXTU $A,16,16,$Alo + + XORMPY $Alo,$B_2,$Alox2 ; 16x8 bits multiplication +|| XORMPY $Ahi,$B_2,$Ahix2 +|| EXTU $B,16,24,$B_1 + XORMPY $Alo,$B_0,$Alox0 +|| XORMPY $Ahi,$B_0,$Ahix0 + XORMPY $Alo,$B_3,$Alox3 +|| XORMPY $Ahi,$B_3,$Ahix3 + XORMPY $Alo,$B_1,$Alox1 +|| XORMPY $Ahi,$B_1,$Ahix1 +___ +} +sub mul_1x1_merged { +my ($OUTlo,$OUThi,$A,$B)=@_; +$code.=<<___; + EXTU $B,8,24,$B_2 ; smash $B to 4 bytes +|| AND $B,$xFF,$B_0 +|| SHRU $B,24,$B_3 + SHRU $A,16, $Ahi ; smash $A to two halfwords +|| EXTU $A,16,16,$Alo + + XOR $Ahix0,$Alox2,$Ahix0 +|| MV $Ahix2,$OUThi +|| XORMPY $Alo,$B_2,$Alox2 + XORMPY $Ahi,$B_2,$Ahix2 +|| EXTU $B,16,24,$B_1 +|| XORMPY $Alo,$B_0,A1 ; $Alox0 + XOR $Ahix1,$Alox3,$Ahix1 +|| SHL $Ahix0,16,$OUTlo +|| SHRU $Ahix0,16,$Ahix0 + XOR $Alox0,$OUTlo,$OUTlo +|| XOR $Ahix0,$OUThi,$OUThi +|| XORMPY $Ahi,$B_0,$Ahix0 +|| XORMPY $Alo,$B_3,$Alox3 +|| SHL $Alox1,8,$Alox1 +|| SHL $Ahix3,8,$Ahix3 + XOR $Alox1,$OUTlo,$OUTlo +|| XOR $Ahix3,$OUThi,$OUThi +|| XORMPY $Ahi,$B_3,$Ahix3 +|| SHL $Ahix1,24,$Alox1 +|| SHRU $Ahix1,8, $Ahix1 + XOR $Alox1,$OUTlo,$OUTlo +|| XOR $Ahix1,$OUThi,$OUThi +|| XORMPY $Alo,$B_1,$Alox1 +|| XORMPY $Ahi,$B_1,$Ahix1 +|| MV A1,$Alox0 +___ +} +sub mul_1x1_lower { +my ($OUTlo,$OUThi)=@_; +$code.=<<___; + ;NOP + XOR $Ahix0,$Alox2,$Ahix0 +|| MV $Ahix2,$OUThi + NOP + XOR $Ahix1,$Alox3,$Ahix1 +|| SHL $Ahix0,16,$OUTlo +|| SHRU $Ahix0,16,$Ahix0 + XOR $Alox0,$OUTlo,$OUTlo +|| XOR $Ahix0,$OUThi,$OUThi +|| SHL $Alox1,8,$Alox1 +|| SHL $Ahix3,8,$Ahix3 + XOR $Alox1,$OUTlo,$OUTlo +|| XOR $Ahix3,$OUThi,$OUThi +|| SHL $Ahix1,24,$Alox1 +|| SHRU $Ahix1,8, $Ahix1 + XOR $Alox1,$OUTlo,$OUTlo +|| XOR $Ahix1,$OUThi,$OUThi +___ +} +$code.=<<___; + .text + + .if .ASSEMBLER_VERSION<7000000 + .asg 0,__TI_EABI__ + .endif + .if __TI_EABI__ + .asg bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2 + .endif + + .global _bn_GF2m_mul_2x2 +_bn_GF2m_mul_2x2: + .asmfunc + MVK 0xFF,$xFF +___ + &mul_1x1_upper($a0,$b0); # a0·b0 +$code.=<<___; +|| MV $b1,$B + MV $a1,$A 
+___ + &mul_1x1_merged("A28","B28",$A,$B); # a0·b0/a1·b1 +$code.=<<___; +|| XOR $b0,$b1,$B + XOR $a0,$a1,$A +___ + &mul_1x1_merged("A31","B31",$A,$B); # a1·b1/(a0+a1)·(b0+b1) +$code.=<<___; + XOR A28,A31,A29 +|| XOR B28,B31,B29 ; a0·b0+a1·b1 +___ + &mul_1x1_lower("A30","B30"); # (a0+a1)·(b0+b1) +$code.=<<___; +|| BNOP B3 + XOR A29,A30,A30 +|| XOR B29,B30,B30 ; (a0+a1)·(b0+b1)-a0·b0-a1·b1 + XOR B28,A30,A30 +|| STW A28,*${rp}[0] + XOR B30,A31,A31 +|| STW A30,*${rp}[1] + STW A31,*${rp}[2] + STW B31,*${rp}[3] + .endasmfunc +___ + +print $code; +close STDOUT; diff --git a/crypto/bn/asm/co-586.pl b/crypto/bn/asm/co-586.pl index 57101a6bd775..97f5e3a19fc4 100644 --- a/crypto/bn/asm/co-586.pl +++ b/crypto/bn/asm/co-586.pl @@ -1,10 +1,19 @@ -#!/usr/local/bin/perl +#! /usr/bin/env perl +# Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; -&asm_init($ARGV[0],$0); +$output = pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0]); &bn_mul_comba("bn_mul_comba8",8); &bn_mul_comba("bn_mul_comba4",4); @@ -13,6 +22,8 @@ require "x86asm.pl"; &asm_finish(); +close STDOUT; + sub mul_add_c { local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_; @@ -36,7 +47,7 @@ sub mul_add_c &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b ### &adc($c2,0); - # is pos > 1, it means it is the last loop + # is pos > 1, it means it is the last loop &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[]; &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a } @@ -65,7 +76,7 @@ sub sqr_add_c &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb); ### &adc($c2,0); - # is pos > 1, it means it is the last loop + # is pos > 1, it means it is the last loop &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[]; &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b } @@ -116,7 +127,7 @@ sub bn_mul_comba $c2="ebp"; $a="esi"; $b="edi"; - + $as=0; $ae=0; $bs=0; @@ -131,9 +142,9 @@ sub bn_mul_comba &push("ebx"); &xor($c0,$c0); - &mov("eax",&DWP(0,$a,"",0)); # load the first word + &mov("eax",&DWP(0,$a,"",0)); # load the first word &xor($c1,$c1); - &mov("edx",&DWP(0,$b,"",0)); # load the first second + &mov("edx",&DWP(0,$b,"",0)); # load the first second for ($i=0; $i<$tot; $i++) { @@ -141,7 +152,7 @@ sub bn_mul_comba $bi=$bs; $end=$be+1; - &comment("################## Calculate word $i"); + &comment("################## Calculate word $i"); for ($j=$bs; $j<$end; $j++) { diff --git a/crypto/bn/asm/ia64-mont.pl b/crypto/bn/asm/ia64-mont.pl index e258658428a3..ec486f77792b 100755 --- a/crypto/bn/asm/ia64-mont.pl +++ b/crypto/bn/asm/ia64-mont.pl @@ -1,7 +1,14 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. 
The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -60,6 +67,8 @@ # hereafter less for longer keys, while verify - by 74-13%. # DSA performance improves by 115-30%. +$output=pop; + if ($^O eq "hpux") { $ADDP="addp4"; for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } @@ -71,7 +80,7 @@ $code=<<___; // int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap, // const BN_ULONG *bp,const BN_ULONG *np, -// const BN_ULONG *n0p,int num); +// const BN_ULONG *n0p,int num); .align 64 .global bn_mul_mont# .proc bn_mul_mont# @@ -194,7 +203,7 @@ bn_mul_mont_general: { .mmi; .pred.rel "mutex",p39,p41 (p39) add topbit=r0,r0 (p41) add topbit=r0,r0,1 - nop.i 0 } + nop.i 0 } { .mmi; st8 [tp_1]=n[0] add tptr=16,sp add tp_1=8,sp };; @@ -332,19 +341,19 @@ bn_mul_mont_general: { .mmb; sub rptr=rptr,len // rewind sub tptr=tptr,len clrrrb.pr };; -{ .mmi; and aptr=tptr,topbit - andcm bptr=rptr,topbit +{ .mmi; mov aptr=rptr + mov bptr=tptr mov pr.rot=1<<16 };; -{ .mii; or nptr=aptr,bptr +{ .mii; cmp.eq p0,p6=topbit,r0 mov ar.lc=lc - mov ar.ec=3 };; + mov ar.ec=2 };; .Lcopy_ctop: -{ .mmb; (p16) ld8 n[0]=[nptr],8 - (p18) st8 [tptr]=r0,8 - (p16) nop.b 0 } -{ .mmb; (p16) nop.m 0 - (p18) st8 [rptr]=n[2],8 +{ .mmi; (p16) ld8 a[0]=[aptr],8 + (p16) ld8 t[0]=[bptr],8 + (p6) mov a[1]=t[1] };; // (p17) +{ .mmb; (p17) st8 [rptr]=a[1],8 + (p17) st8 [tptr]=r0,8 br.ctop.sptk .Lcopy_ctop };; .Lcopy_cend: @@ -846,6 +855,6 @@ copyright: stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>" ___ -$output=shift and open STDOUT,">$output"; +open STDOUT,">$output" if $output; print $code; close STDOUT; diff --git a/crypto/bn/asm/ia64.S b/crypto/bn/asm/ia64.S index a9a42abfc302..d235c45e2d63 100644 --- a/crypto/bn/asm/ia64.S +++ b/crypto/bn/asm/ia64.S @@ -1,11 +1,18 @@ .explicit .text .ident "ia64.S, Version 2.1" -.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" +.ident "IA-64 ISA artwork by Andy Polyakov <appro@openssl.org>" + +// Copyright 2001-2018 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html // // ==================================================================== -// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL // project. // // Rights for redistribution and usage in source and binary forms are @@ -13,7 +20,7 @@ // disclaimed. // ==================================================================== // -// Version 2.x is Itanium2 re-tune. Few words about how Itanum2 is +// Version 2.x is Itanium2 re-tune. Few words about how Itanium2 is // different from Itanium to this module viewpoint. Most notably, is it // "wider" than Itanium? Can you experience loop scalability as // discussed in commentary sections? Not really:-( Itanium2 has 6 @@ -22,7 +29,7 @@ // ports is the same, i.e. 2, while I need 4. In other words, to this // module Itanium2 remains effectively as "wide" as Itanium. Yet it's // essentially different in respect to this module, and a re-tune was -// required. Well, because some intruction latencies has changed. Most +// required. Well, because some instruction latencies has changed. 
Most // noticeably those intensively used: // // Itanium Itanium2 @@ -134,7 +141,7 @@ // User Mask I want to excuse the kernel from preserving upper // (f32-f128) FP register bank over process context switch, thus // minimizing bus bandwidth consumption during the switch (i.e. -// after PKI opration completes and the program is off doing +// after PKI operation completes and the program is off doing // something else like bulk symmetric encryption). Having said // this, I also want to point out that it might be good idea // to compile the whole toolkit (as well as majority of the @@ -150,12 +157,15 @@ #else #define ADDP add #endif +#ifdef __VMS +.alias abort, "decc$abort" +#endif #if 1 // // bn_[add|sub]_words routines. // -// Loops are spinning in 2*(n+5) ticks on Itanuim (provided that the +// Loops are spinning in 2*(n+5) ticks on Itanium (provided that the // data reside in L1 cache, i.e. 2 ticks away). It's possible to // compress the epilogue and get down to 2*n+6, but at the cost of // scalability (the neat feature of this implementation is that it @@ -363,7 +373,7 @@ bn_mul_words: // The loop therefore spins at the latency of xma minus 1, or in other // words at 6*(n+4) ticks:-( Compare to the "production" loop above // that runs in 2*(n+11) where the low latency problem is worked around -// by moving the dependency to one-tick latent interger ALU. Note that +// by moving the dependency to one-tick latent integer ALU. Note that // "distance" between ldf8 and xma is not latency of ldf8, but the // *difference* between xma and ldf8 latencies. .L_bn_mul_words_ctop: @@ -425,7 +435,7 @@ bn_mul_add_words: // version was performing *all* additions in IALU and was starving // for those even on Itanium 2. In this version one addition is // moved to FPU and is folded with multiplication. This is at cost -// of propogating the result from previous call to this subroutine +// of propagating the result from previous call to this subroutine // to L2 cache... In other words negligible even for shorter keys. // *Overall* performance improvement [over previous version] varies // from 11 to 22 percent depending on key length. @@ -493,9 +503,9 @@ bn_sqr_words: // possible to compress the epilogue (I'm getting tired to write this // comment over and over) and get down to 2*n+16 at the cost of // scalability. The decision will very likely be reconsidered after the -// benchmark program is profiled. I.e. if perfomance gain on Itanium +// benchmark program is profiled. I.e. if performance gain on Itanium // will appear larger than loss on "wider" IA-64, then the loop should -// be explicitely split and the epilogue compressed. +// be explicitly split and the epilogue compressed. .L_bn_sqr_words_ctop: { .mfi; (p16) ldf8 f32=[r33],8 (p25) xmpy.lu f42=f41,f41 @@ -929,7 +939,7 @@ bn_mul_comba8: xma.hu f118=f39,f127,f117 } { .mfi; xma.lu f117=f39,f127,f117 };;// //-------------------------------------------------// -// Leaving muliplier's heaven... Quite a ride, huh? +// Leaving multiplier's heaven... Quite a ride, huh? 
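The bn_mul_add_words commentary above describes moving one addition from the integer ALU into the FPU, where xma.lu/xma.hu compute a*b+c in a single fused operation. A C model of what that folding buys per limb (illustrative only, assuming unsigned __int128; not the OpenSSL API):

    #include <stdint.h>

    typedef unsigned __int128 u128;

    static uint64_t mul_add_limb(uint64_t *r, uint64_t a, uint64_t w,
                                 uint64_t carry)
    {
        u128 t = (u128)a * w + *r;           /* folded multiply-add (xma) */
        uint64_t lo = (uint64_t)t + carry;   /* remaining add in the IALU */
        uint64_t hi = (uint64_t)(t >> 64) + (lo < carry);
        *r = lo;
        return hi;                           /* a*w+r+carry always fits   */
    }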
{ .mii; getf.sig r31=f47 add r25=r25,r24 @@ -1421,6 +1431,7 @@ bn_div_words: mov ar.ec=0 // don't rotate at exit mov pr.rot=0 } { .mii; mov L=r33 // save l + mov r25=r0 // needed if abort is called on VMS mov r36=r0 };; .L_divw_shift: // -vv- note signed comparison @@ -1522,9 +1533,8 @@ bn_div_words: // output: f8 = (int)(a/b) // clobbered: f8,f9,f10,f11,pred pred=p15 -// One can argue that this snippet is copyrighted to Intel -// Corporation, as it's essentially identical to one of those -// found in "Divide, Square Root and Remainder" section at +// This snippet is based on text found in the "Divide, Square +// Root and Remainder" section at // http://www.intel.com/software/products/opensource/libraries/num.htm. // Yes, I admit that the referred code was used as template, // but after I realized that there hardly is any other instruction diff --git a/crypto/bn/asm/mips-mont.pl b/crypto/bn/asm/mips-mont.pl index a33cdf411121..fbe5d04f716c 100755 --- a/crypto/bn/asm/mips-mont.pl +++ b/crypto/bn/asm/mips-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -49,14 +56,14 @@ $flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 if ($flavour =~ /64|n32/i) { - $PTR_ADD="dadd"; # incidentally works even on n32 - $PTR_SUB="dsub"; # incidentally works even on n32 + $PTR_ADD="daddu"; # incidentally works even on n32 + $PTR_SUB="dsubu"; # incidentally works even on n32 $REG_S="sd"; $REG_L="ld"; $SZREG=8; } else { - $PTR_ADD="add"; - $PTR_SUB="sub"; + $PTR_ADD="addu"; + $PTR_SUB="subu"; $REG_S="sw"; $REG_L="lw"; $SZREG=4; @@ -67,7 +74,7 @@ $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 
0x00fff000 : 0x00ff0000; # ###################################################################### -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; if ($flavour =~ /64|n32/i) { @@ -114,6 +121,8 @@ $m1=$s11; $FRAMESIZE=14; $code=<<___; +#include "mips_arch.h" + .text .set noat @@ -176,27 +185,27 @@ $code.=<<___; $PTR_SUB $sp,$num and $sp,$at - $MULTU $aj,$bi - $LD $alo,$BNSZ($ap) - $LD $nlo,$BNSZ($np) - mflo $lo0 - mfhi $hi0 - $MULTU $lo0,$n0 - mflo $m1 - - $MULTU $alo,$bi - mflo $alo - mfhi $ahi - - $MULTU $nj,$m1 - mflo $lo1 - mfhi $hi1 - $MULTU $nlo,$m1 + $MULTU ($aj,$bi) + $LD $ahi,$BNSZ($ap) + $LD $nhi,$BNSZ($np) + mflo ($lo0,$aj,$bi) + mfhi ($hi0,$aj,$bi) + $MULTU ($lo0,$n0) + mflo ($m1,$lo0,$n0) + + $MULTU ($ahi,$bi) + mflo ($alo,$ahi,$bi) + mfhi ($ahi,$ahi,$bi) + + $MULTU ($nj,$m1) + mflo ($lo1,$nj,$m1) + mfhi ($hi1,$nj,$m1) + $MULTU ($nhi,$m1) $ADDU $lo1,$lo0 sltu $at,$lo1,$lo0 $ADDU $hi1,$at - mflo $nlo - mfhi $nhi + mflo ($nlo,$nhi,$m1) + mfhi ($nhi,$nhi,$m1) move $tp,$sp li $j,2*$BNSZ @@ -208,25 +217,25 @@ $code.=<<___; $LD $aj,($aj) $LD $nj,($nj) - $MULTU $aj,$bi + $MULTU ($aj,$bi) $ADDU $lo0,$alo,$hi0 $ADDU $lo1,$nlo,$hi1 sltu $at,$lo0,$hi0 sltu $t0,$lo1,$hi1 $ADDU $hi0,$ahi,$at $ADDU $hi1,$nhi,$t0 - mflo $alo - mfhi $ahi + mflo ($alo,$aj,$bi) + mfhi ($ahi,$aj,$bi) $ADDU $lo1,$lo0 sltu $at,$lo1,$lo0 - $MULTU $nj,$m1 + $MULTU ($nj,$m1) $ADDU $hi1,$at addu $j,$BNSZ $ST $lo1,($tp) sltu $t0,$j,$num - mflo $nlo - mfhi $nhi + mflo ($nlo,$nj,$m1) + mfhi ($nhi,$nj,$m1) bnez $t0,.L1st $PTR_ADD $tp,$BNSZ @@ -256,34 +265,34 @@ $code.=<<___; $PTR_ADD $bi,$bp,$i $LD $bi,($bi) $LD $aj,($ap) - $LD $alo,$BNSZ($ap) + $LD $ahi,$BNSZ($ap) $LD $tj,($sp) - $MULTU $aj,$bi + $MULTU ($aj,$bi) $LD $nj,($np) - $LD $nlo,$BNSZ($np) - mflo $lo0 - mfhi $hi0 + $LD $nhi,$BNSZ($np) + mflo ($lo0,$aj,$bi) + mfhi ($hi0,$aj,$bi) $ADDU $lo0,$tj - $MULTU $lo0,$n0 + $MULTU ($lo0,$n0) sltu $at,$lo0,$tj $ADDU $hi0,$at - mflo $m1 + mflo ($m1,$lo0,$n0) - $MULTU $alo,$bi - mflo $alo - mfhi $ahi + $MULTU ($ahi,$bi) + mflo ($alo,$ahi,$bi) + mfhi ($ahi,$ahi,$bi) - $MULTU $nj,$m1 - mflo $lo1 - mfhi $hi1 + $MULTU ($nj,$m1) + mflo ($lo1,$nj,$m1) + mfhi ($hi1,$nj,$m1) - $MULTU $nlo,$m1 + $MULTU ($nhi,$m1) $ADDU $lo1,$lo0 sltu $at,$lo1,$lo0 $ADDU $hi1,$at - mflo $nlo - mfhi $nhi + mflo ($nlo,$nhi,$m1) + mfhi ($nhi,$nhi,$m1) move $tp,$sp li $j,2*$BNSZ @@ -296,19 +305,19 @@ $code.=<<___; $LD $aj,($aj) $LD $nj,($nj) - $MULTU $aj,$bi + $MULTU ($aj,$bi) $ADDU $lo0,$alo,$hi0 $ADDU $lo1,$nlo,$hi1 sltu $at,$lo0,$hi0 sltu $t0,$lo1,$hi1 $ADDU $hi0,$ahi,$at $ADDU $hi1,$nhi,$t0 - mflo $alo - mfhi $ahi + mflo ($alo,$aj,$bi) + mfhi ($ahi,$aj,$bi) $ADDU $lo0,$tj addu $j,$BNSZ - $MULTU $nj,$m1 + $MULTU ($nj,$m1) sltu $at,$lo0,$tj $ADDU $lo1,$lo0 $ADDU $hi0,$at @@ -316,8 +325,8 @@ $code.=<<___; $LD $tj,2*$BNSZ($tp) $ADDU $hi1,$t0 sltu $at,$j,$num - mflo $nlo - mfhi $nhi + mflo ($nlo,$nj,$m1) + mfhi ($nhi,$nj,$m1) $ST $lo1,($tp) bnez $at,.Linner $PTR_ADD $tp,$BNSZ @@ -377,15 +386,13 @@ $code.=<<___; $PTR_SUB $rp,$num # restore rp not $hi1,$hi0 - and $ap,$hi0,$sp - and $bp,$hi1,$rp - or $ap,$ap,$bp # ap=borrow?tp:rp - -.align 4 -.Lcopy: $LD $aj,($ap) - $PTR_ADD $ap,$BNSZ +.Lcopy: $LD $nj,($tp) # conditional move + $LD $aj,($rp) $ST $zero,($tp) $PTR_ADD $tp,$BNSZ + and $nj,$hi0 + and $aj,$hi1 + or $aj,$nj sltu $at,$tp,$tj $ST $aj,($rp) bnez $at,.Lcopy diff --git a/crypto/bn/asm/mips.pl b/crypto/bn/asm/mips.pl index acafde5e5685..da35ec1b30ce 100755 --- 
a/crypto/bn/asm/mips.pl +++ b/crypto/bn/asm/mips.pl @@ -1,7 +1,14 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. # # Rights for redistribution and usage in source and binary forms are @@ -15,7 +22,7 @@ # This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c. # # The module is designed to work with either of the "new" MIPS ABI(5), -# namely N32 or N64, offered by IRIX 6.x. It's not ment to work under +# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under # IRIX 5.x not only because it doesn't support new ABIs but also # because 5.x kernels put R4x00 CPU into 32-bit mode and all those # 64-bit instructions (daddu, dmultu, etc.) found below gonna only @@ -35,7 +42,7 @@ # Performance improvement is astonishing! 'apps/openssl speed rsa dsa' # goes way over 3 times faster! # -# <appro@fy.chalmers.se> +# <appro@openssl.org> # October 2010 # @@ -49,7 +56,7 @@ # key length, more for longer keys. $flavour = shift || "o32"; -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; if ($flavour =~ /64|n32/i) { @@ -102,6 +109,22 @@ $gp=$v1 if ($flavour =~ /nubi/i); $minus4=$v1; $code.=<<___; +#include "mips_arch.h" + +#if defined(_MIPS_ARCH_MIPS64R6) +# define ddivu(rs,rt) +# define mfqt(rd,rs,rt) ddivu rd,rs,rt +# define mfrm(rd,rs,rt) dmodu rd,rs,rt +#elif defined(_MIPS_ARCH_MIPS32R6) +# define divu(rs,rt) +# define mfqt(rd,rs,rt) divu rd,rs,rt +# define mfrm(rd,rs,rt) modu rd,rs,rt +#else +# define $DIVU(rs,rt) $DIVU $zero,rs,rt +# define mfqt(rd,rs,rt) mflo rd +# define mfrm(rd,rs,rt) mfhi rd +#endif + .rdata .asciiz "mips3.s, Version 1.2" .asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>" @@ -144,7 +167,7 @@ $code.=<<___; .L_bn_mul_add_words_loop: $LD $t0,0($a1) - $MULTU $t0,$a3 + $MULTU ($t0,$a3) $LD $t1,0($a0) $LD $t2,$BNSZ($a1) $LD $t3,$BNSZ($a0) @@ -154,11 +177,11 @@ $code.=<<___; sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit # values", but it seems to work fine # even on 64-bit registers. 
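Two things are worth keeping in mind while reading the long mechanical hunks that follow. First, the parenthesized mflo()/mfhi()/mfqt()/mfrm() forms added above let one source emit either the classic HI/LO moves or the three-operand MIPS R6 instructions, since R6 drops the HI/LO registers. Second, MIPS has no carry flag, so every carry is recovered with sltu, as the comment above notes. A C model of one bn_mul_add_words limb, with the 32-bit limbs of the o32 flavour (illustrative):

    #include <stdint.h>

    static uint32_t mul_add_limb(uint32_t *r, uint32_t a, uint32_t w,
                                 uint32_t carry)
    {
        uint64_t p  = (uint64_t)a * w;       /* multu; mflo/mfhi          */
        uint32_t lo = (uint32_t)p;
        uint32_t hi = (uint32_t)(p >> 32);
        uint32_t t  = *r + carry;
        uint32_t c  = (t < carry);           /* sltu: carry out of add    */
        t += lo;
        c += (t < lo);                       /* second sltu               */
        *r = t;
        return hi + c;                       /* fits: a*w+r+carry < 2^64  */
    }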
- mflo $at - mfhi $t0 + mflo ($at,$t0,$a3) + mfhi ($t0,$t0,$a3) $ADDU $t1,$at $ADDU $v0,$t0 - $MULTU $t2,$a3 + $MULTU ($t2,$a3) sltu $at,$t1,$at $ST $t1,0($a0) $ADDU $v0,$at @@ -167,11 +190,11 @@ $code.=<<___; $LD $ta3,3*$BNSZ($a0) $ADDU $t3,$v0 sltu $v0,$t3,$v0 - mflo $at - mfhi $t2 + mflo ($at,$t2,$a3) + mfhi ($t2,$t2,$a3) $ADDU $t3,$at $ADDU $v0,$t2 - $MULTU $ta0,$a3 + $MULTU ($ta0,$a3) sltu $at,$t3,$at $ST $t3,$BNSZ($a0) $ADDU $v0,$at @@ -181,11 +204,11 @@ $code.=<<___; $PTR_ADD $a1,4*$BNSZ $ADDU $ta1,$v0 sltu $v0,$ta1,$v0 - mflo $at - mfhi $ta0 + mflo ($at,$ta0,$a3) + mfhi ($ta0,$ta0,$a3) $ADDU $ta1,$at $ADDU $v0,$ta0 - $MULTU $ta2,$a3 + $MULTU ($ta2,$a3) sltu $at,$ta1,$at $ST $ta1,-2*$BNSZ($a0) $ADDU $v0,$at @@ -194,8 +217,8 @@ $code.=<<___; and $ta0,$a2,$minus4 $ADDU $ta3,$v0 sltu $v0,$ta3,$v0 - mflo $at - mfhi $ta2 + mflo ($at,$ta2,$a3) + mfhi ($ta2,$ta2,$a3) $ADDU $ta3,$at $ADDU $v0,$ta2 sltu $at,$ta3,$at @@ -210,13 +233,13 @@ $code.=<<___; .L_bn_mul_add_words_tail: .set reorder $LD $t0,0($a1) - $MULTU $t0,$a3 + $MULTU ($t0,$a3) $LD $t1,0($a0) subu $a2,1 $ADDU $t1,$v0 sltu $v0,$t1,$v0 - mflo $at - mfhi $t0 + mflo ($at,$t0,$a3) + mfhi ($t0,$t0,$a3) $ADDU $t1,$at $ADDU $v0,$t0 sltu $at,$t1,$at @@ -225,13 +248,13 @@ $code.=<<___; beqz $a2,.L_bn_mul_add_words_return $LD $t0,$BNSZ($a1) - $MULTU $t0,$a3 + $MULTU ($t0,$a3) $LD $t1,$BNSZ($a0) subu $a2,1 $ADDU $t1,$v0 sltu $v0,$t1,$v0 - mflo $at - mfhi $t0 + mflo ($at,$t0,$a3) + mfhi ($t0,$t0,$a3) $ADDU $t1,$at $ADDU $v0,$t0 sltu $at,$t1,$at @@ -240,12 +263,12 @@ $code.=<<___; beqz $a2,.L_bn_mul_add_words_return $LD $t0,2*$BNSZ($a1) - $MULTU $t0,$a3 + $MULTU ($t0,$a3) $LD $t1,2*$BNSZ($a0) $ADDU $t1,$v0 sltu $v0,$t1,$v0 - mflo $at - mfhi $t0 + mflo ($at,$t0,$a3) + mfhi ($t0,$t0,$a3) $ADDU $t1,$at $ADDU $v0,$t0 sltu $at,$t1,$at @@ -303,40 +326,40 @@ $code.=<<___; .L_bn_mul_words_loop: $LD $t0,0($a1) - $MULTU $t0,$a3 + $MULTU ($t0,$a3) $LD $t2,$BNSZ($a1) $LD $ta0,2*$BNSZ($a1) $LD $ta2,3*$BNSZ($a1) - mflo $at - mfhi $t0 + mflo ($at,$t0,$a3) + mfhi ($t0,$t0,$a3) $ADDU $v0,$at sltu $t1,$v0,$at - $MULTU $t2,$a3 + $MULTU ($t2,$a3) $ST $v0,0($a0) $ADDU $v0,$t1,$t0 subu $a2,4 $PTR_ADD $a0,4*$BNSZ $PTR_ADD $a1,4*$BNSZ - mflo $at - mfhi $t2 + mflo ($at,$t2,$a3) + mfhi ($t2,$t2,$a3) $ADDU $v0,$at sltu $t3,$v0,$at - $MULTU $ta0,$a3 + $MULTU ($ta0,$a3) $ST $v0,-3*$BNSZ($a0) $ADDU $v0,$t3,$t2 - mflo $at - mfhi $ta0 + mflo ($at,$ta0,$a3) + mfhi ($ta0,$ta0,$a3) $ADDU $v0,$at sltu $ta1,$v0,$at - $MULTU $ta2,$a3 + $MULTU ($ta2,$a3) $ST $v0,-2*$BNSZ($a0) $ADDU $v0,$ta1,$ta0 and $ta0,$a2,$minus4 - mflo $at - mfhi $ta2 + mflo ($at,$ta2,$a3) + mfhi ($ta2,$ta2,$a3) $ADDU $v0,$at sltu $ta3,$v0,$at $ST $v0,-$BNSZ($a0) @@ -350,10 +373,10 @@ $code.=<<___; .L_bn_mul_words_tail: .set reorder $LD $t0,0($a1) - $MULTU $t0,$a3 + $MULTU ($t0,$a3) subu $a2,1 - mflo $at - mfhi $t0 + mflo ($at,$t0,$a3) + mfhi ($t0,$t0,$a3) $ADDU $v0,$at sltu $t1,$v0,$at $ST $v0,0($a0) @@ -361,10 +384,10 @@ $code.=<<___; beqz $a2,.L_bn_mul_words_return $LD $t0,$BNSZ($a1) - $MULTU $t0,$a3 + $MULTU ($t0,$a3) subu $a2,1 - mflo $at - mfhi $t0 + mflo ($at,$t0,$a3) + mfhi ($t0,$t0,$a3) $ADDU $v0,$at sltu $t1,$v0,$at $ST $v0,$BNSZ($a0) @@ -372,9 +395,9 @@ $code.=<<___; beqz $a2,.L_bn_mul_words_return $LD $t0,2*$BNSZ($a1) - $MULTU $t0,$a3 - mflo $at - mfhi $t0 + $MULTU ($t0,$a3) + mflo ($at,$t0,$a3) + mfhi ($t0,$t0,$a3) $ADDU $v0,$at sltu $t1,$v0,$at $ST $v0,2*$BNSZ($a0) @@ -431,35 +454,35 @@ $code.=<<___; .L_bn_sqr_words_loop: $LD $t0,0($a1) - $MULTU $t0,$t0 + $MULTU ($t0,$t0) $LD $t2,$BNSZ($a1) $LD 
$ta0,2*$BNSZ($a1) $LD $ta2,3*$BNSZ($a1) - mflo $t1 - mfhi $t0 + mflo ($t1,$t0,$t0) + mfhi ($t0,$t0,$t0) $ST $t1,0($a0) $ST $t0,$BNSZ($a0) - $MULTU $t2,$t2 + $MULTU ($t2,$t2) subu $a2,4 $PTR_ADD $a0,8*$BNSZ $PTR_ADD $a1,4*$BNSZ - mflo $t3 - mfhi $t2 + mflo ($t3,$t2,$t2) + mfhi ($t2,$t2,$t2) $ST $t3,-6*$BNSZ($a0) $ST $t2,-5*$BNSZ($a0) - $MULTU $ta0,$ta0 - mflo $ta1 - mfhi $ta0 + $MULTU ($ta0,$ta0) + mflo ($ta1,$ta0,$ta0) + mfhi ($ta0,$ta0,$ta0) $ST $ta1,-4*$BNSZ($a0) $ST $ta0,-3*$BNSZ($a0) - $MULTU $ta2,$ta2 + $MULTU ($ta2,$ta2) and $ta0,$a2,$minus4 - mflo $ta3 - mfhi $ta2 + mflo ($ta3,$ta2,$ta2) + mfhi ($ta2,$ta2,$ta2) $ST $ta3,-2*$BNSZ($a0) .set noreorder @@ -472,27 +495,27 @@ $code.=<<___; .L_bn_sqr_words_tail: .set reorder $LD $t0,0($a1) - $MULTU $t0,$t0 + $MULTU ($t0,$t0) subu $a2,1 - mflo $t1 - mfhi $t0 + mflo ($t1,$t0,$t0) + mfhi ($t0,$t0,$t0) $ST $t1,0($a0) $ST $t0,$BNSZ($a0) beqz $a2,.L_bn_sqr_words_return $LD $t0,$BNSZ($a1) - $MULTU $t0,$t0 + $MULTU ($t0,$t0) subu $a2,1 - mflo $t1 - mfhi $t0 + mflo ($t1,$t0,$t0) + mfhi ($t0,$t0,$t0) $ST $t1,2*$BNSZ($a0) $ST $t0,3*$BNSZ($a0) beqz $a2,.L_bn_sqr_words_return $LD $t0,2*$BNSZ($a1) - $MULTU $t0,$t0 - mflo $t1 - mfhi $t0 + $MULTU ($t0,$t0) + mflo ($t1,$t0,$t0) + mfhi ($t0,$t0,$t0) $ST $t1,4*$BNSZ($a0) $ST $t0,5*$BNSZ($a0) @@ -580,13 +603,13 @@ $code.=<<___; sltu $v0,$t2,$ta2 $ST $t2,-2*$BNSZ($a0) $ADDU $v0,$t8 - + $ADDU $ta3,$t3 sltu $t9,$ta3,$t3 $ADDU $t3,$ta3,$v0 sltu $v0,$t3,$ta3 $ST $t3,-$BNSZ($a0) - + .set noreorder bgtz $at,.L_bn_add_words_loop $ADDU $v0,$t9 @@ -785,7 +808,7 @@ bn_div_3_words: # so that we can save two arguments # and return address in registers # instead of stack:-) - + $LD $a0,($a3) move $ta2,$a1 bne $a0,$a2,bn_div_3_words_internal @@ -816,11 +839,11 @@ $code.=<<___; move $ta3,$ra bal bn_div_words_internal move $ra,$ta3 - $MULTU $ta2,$v0 + $MULTU ($ta2,$v0) $LD $t2,-2*$BNSZ($a3) move $ta0,$zero - mfhi $t1 - mflo $t0 + mfhi ($t1,$ta2,$v0) + mflo ($t0,$ta2,$v0) sltu $t8,$t1,$a1 .L_bn_div_3_words_inner_loop: bnez $t8,.L_bn_div_3_words_inner_loop_done @@ -923,15 +946,15 @@ $code.=<<___; $SRL $HH,$a0,4*$BNSZ # bits $SRL $QT,4*$BNSZ # q=0xffffffff beq $DH,$HH,.L_bn_div_words_skip_div1 - $DIVU $zero,$a0,$DH - mflo $QT + $DIVU ($a0,$DH) + mfqt ($QT,$a0,$DH) .L_bn_div_words_skip_div1: - $MULTU $a2,$QT + $MULTU ($a2,$QT) $SLL $t3,$a0,4*$BNSZ # bits $SRL $at,$a1,4*$BNSZ # bits or $t3,$at - mflo $t0 - mfhi $t1 + mflo ($t0,$a2,$QT) + mfhi ($t1,$a2,$QT) .L_bn_div_words_inner_loop1: sltu $t2,$t3,$t0 seq $t8,$HH,$t1 @@ -956,15 +979,15 @@ $code.=<<___; $SRL $HH,$a0,4*$BNSZ # bits $SRL $QT,4*$BNSZ # q=0xffffffff beq $DH,$HH,.L_bn_div_words_skip_div2 - $DIVU $zero,$a0,$DH - mflo $QT + $DIVU ($a0,$DH) + mfqt ($QT,$a0,$DH) .L_bn_div_words_skip_div2: - $MULTU $a2,$QT + $MULTU ($a2,$QT) $SLL $t3,$a0,4*$BNSZ # bits $SRL $at,$a1,4*$BNSZ # bits or $t3,$at - mflo $t0 - mfhi $t1 + mflo ($t0,$a2,$QT) + mfhi ($t1,$a2,$QT) .L_bn_div_words_inner_loop2: sltu $t2,$t3,$t0 seq $t8,$HH,$t1 @@ -1063,592 +1086,592 @@ $code.=<<___; $LD $b_0,0($a2) $LD $a_1,$BNSZ($a1) $LD $a_2,2*$BNSZ($a1) - $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); + $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3); $LD $a_3,3*$BNSZ($a1) $LD $b_1,$BNSZ($a2) $LD $b_2,2*$BNSZ($a2) $LD $b_3,3*$BNSZ($a2) - mflo $c_1 - mfhi $c_2 + mflo ($c_1,$a_0,$b_0) + mfhi ($c_2,$a_0,$b_0) $LD $a_4,4*$BNSZ($a1) $LD $a_5,5*$BNSZ($a1) - $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); + $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1); $LD $a_6,6*$BNSZ($a1) $LD $a_7,7*$BNSZ($a1) $LD 
$b_4,4*$BNSZ($a2) $LD $b_5,5*$BNSZ($a2) - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_0,$b_1) + mfhi ($t_2,$a_0,$b_1) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); + $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1); $ADDU $c_3,$t_2,$at $LD $b_6,6*$BNSZ($a2) $LD $b_7,7*$BNSZ($a2) $ST $c_1,0($a0) # r[0]=c1; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_0) + mfhi ($t_2,$a_1,$b_0) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); + $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $c_1,$c_3,$t_2 $ST $c_2,$BNSZ($a0) # r[1]=c2; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_0) + mfhi ($t_2,$a_2,$b_0) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); + $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_1) + mfhi ($t_2,$a_1,$b_1) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); + $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $c_2,$c_1,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_0,$b_2) + mfhi ($t_2,$a_0,$b_2) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); + $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,2*$BNSZ($a0) # r[2]=c3; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_0,$b_3) + mfhi ($t_2,$a_0,$b_3) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); + $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $c_3,$c_2,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_2) + mfhi ($t_2,$a_1,$b_2) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); + $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_1) + mfhi ($t_2,$a_2,$b_1) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); + $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_0) + mfhi ($t_2,$a_3,$b_0) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1); + $MULTU ($a_4,$b_0) # mul_add_c(a[4],b[0],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at $ST $c_1,3*$BNSZ($a0) # r[3]=c1; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_4,$b_0) + mfhi ($t_2,$a_4,$b_0) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); + $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $c_1,$c_3,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_1) + mfhi ($t_2,$a_3,$b_1) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); + $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_2) + mfhi ($t_2,$a_2,$b_2) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); + $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_3) + mfhi ($t_2,$a_1,$b_3) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_0,$b_4 # 
mul_add_c(a[0],b[4],c2,c3,c1); + $MULTU ($a_0,$b_4) # mul_add_c(a[0],b[4],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_0,$b_4) + mfhi ($t_2,$a_0,$b_4) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2); + $MULTU ($a_0,$b_5) # mul_add_c(a[0],b[5],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,4*$BNSZ($a0) # r[4]=c2; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_0,$b_5) + mfhi ($t_2,$a_0,$b_5) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2); + $MULTU ($a_1,$b_4) # mul_add_c(a[1],b[4],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $c_2,$c_1,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_4) + mfhi ($t_2,$a_1,$b_4) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); + $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_3) + mfhi ($t_2,$a_2,$b_3) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); + $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_2) + mfhi ($t_2,$a_3,$b_2) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2); + $MULTU ($a_4,$b_1) # mul_add_c(a[4],b[1],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_4,$b_1) + mfhi ($t_2,$a_4,$b_1) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2); + $MULTU ($a_5,$b_0) # mul_add_c(a[5],b[0],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_5,$b_0) + mfhi ($t_2,$a_5,$b_0) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3); + $MULTU ($a_6,$b_0) # mul_add_c(a[6],b[0],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,5*$BNSZ($a0) # r[5]=c3; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_6,$b_0) + mfhi ($t_2,$a_6,$b_0) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3); + $MULTU ($a_5,$b_1) # mul_add_c(a[5],b[1],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $c_3,$c_2,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_5,$b_1) + mfhi ($t_2,$a_5,$b_1) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3); + $MULTU ($a_4,$b_2) # mul_add_c(a[4],b[2],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_4,$b_2) + mfhi ($t_2,$a_4,$b_2) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); + $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_3) + mfhi ($t_2,$a_3,$b_3) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3); + $MULTU ($a_2,$b_4) # mul_add_c(a[2],b[4],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_4) + mfhi ($t_2,$a_2,$b_4) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3); + $MULTU ($a_1,$b_5) # mul_add_c(a[1],b[5],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo 
($t_1,$a_1,$b_5) + mfhi ($t_2,$a_1,$b_5) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3); + $MULTU ($a_0,$b_6) # mul_add_c(a[0],b[6],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_0,$b_6) + mfhi ($t_2,$a_0,$b_6) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1); + $MULTU ($a_0,$b_7) # mul_add_c(a[0],b[7],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at $ST $c_1,6*$BNSZ($a0) # r[6]=c1; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_0,$b_7) + mfhi ($t_2,$a_0,$b_7) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1); + $MULTU ($a_1,$b_6) # mul_add_c(a[1],b[6],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $c_1,$c_3,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_6) + mfhi ($t_2,$a_1,$b_6) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1); + $MULTU ($a_2,$b_5) # mul_add_c(a[2],b[5],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_5) + mfhi ($t_2,$a_2,$b_5) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1); + $MULTU ($a_3,$b_4) # mul_add_c(a[3],b[4],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_4) + mfhi ($t_2,$a_3,$b_4) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1); + $MULTU ($a_4,$b_3) # mul_add_c(a[4],b[3],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_4,$b_3) + mfhi ($t_2,$a_4,$b_3) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1); + $MULTU ($a_5,$b_2) # mul_add_c(a[5],b[2],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_5,$b_2) + mfhi ($t_2,$a_5,$b_2) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1); + $MULTU ($a_6,$b_1) # mul_add_c(a[6],b[1],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_6,$b_1) + mfhi ($t_2,$a_6,$b_1) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1); + $MULTU ($a_7,$b_0) # mul_add_c(a[7],b[0],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_7,$b_0) + mfhi ($t_2,$a_7,$b_0) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2); + $MULTU ($a_7,$b_1) # mul_add_c(a[7],b[1],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,7*$BNSZ($a0) # r[7]=c2; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_7,$b_1) + mfhi ($t_2,$a_7,$b_1) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2); + $MULTU ($a_6,$b_2) # mul_add_c(a[6],b[2],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $c_2,$c_1,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_6,$b_2) + mfhi ($t_2,$a_6,$b_2) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2); + $MULTU ($a_5,$b_3) # mul_add_c(a[5],b[3],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_5,$b_3) + mfhi ($t_2,$a_5,$b_3) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2); + $MULTU ($a_4,$b_4) # 
mul_add_c(a[4],b[4],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_4,$b_4) + mfhi ($t_2,$a_4,$b_4) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2); + $MULTU ($a_3,$b_5) # mul_add_c(a[3],b[5],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_5) + mfhi ($t_2,$a_3,$b_5) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2); + $MULTU ($a_2,$b_6) # mul_add_c(a[2],b[6],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_6) + mfhi ($t_2,$a_2,$b_6) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2); + $MULTU ($a_1,$b_7) # mul_add_c(a[1],b[7],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_7) + mfhi ($t_2,$a_1,$b_7) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3); + $MULTU ($a_2,$b_7) # mul_add_c(a[2],b[7],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,8*$BNSZ($a0) # r[8]=c3; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_7) + mfhi ($t_2,$a_2,$b_7) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3); + $MULTU ($a_3,$b_6) # mul_add_c(a[3],b[6],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $c_3,$c_2,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_6) + mfhi ($t_2,$a_3,$b_6) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3); + $MULTU ($a_4,$b_5) # mul_add_c(a[4],b[5],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_4,$b_5) + mfhi ($t_2,$a_4,$b_5) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3); + $MULTU ($a_5,$b_4) # mul_add_c(a[5],b[4],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_5,$b_4) + mfhi ($t_2,$a_5,$b_4) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3); + $MULTU ($a_6,$b_3) # mul_add_c(a[6],b[3],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_6,$b_3) + mfhi ($t_2,$a_6,$b_3) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3); + $MULTU ($a_7,$b_2) # mul_add_c(a[7],b[2],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_7,$b_2) + mfhi ($t_2,$a_7,$b_2) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1); + $MULTU ($a_7,$b_3) # mul_add_c(a[7],b[3],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at $ST $c_1,9*$BNSZ($a0) # r[9]=c1; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_7,$b_3) + mfhi ($t_2,$a_7,$b_3) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1); + $MULTU ($a_6,$b_4) # mul_add_c(a[6],b[4],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $c_1,$c_3,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_6,$b_4) + mfhi ($t_2,$a_6,$b_4) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1); + $MULTU ($a_5,$b_5) # mul_add_c(a[5],b[5],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_5,$b_5) + mfhi ($t_2,$a_5,$b_5) $ADDU 
$c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1); + $MULTU ($a_4,$b_6) # mul_add_c(a[4],b[6],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_4,$b_6) + mfhi ($t_2,$a_4,$b_6) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1); + $MULTU ($a_3,$b_7) # mul_add_c(a[3],b[7],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_7) + mfhi ($t_2,$a_3,$b_7) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2); + $MULTU ($a_4,$b_7) # mul_add_c(a[4],b[7],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,10*$BNSZ($a0) # r[10]=c2; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_4,$b_7) + mfhi ($t_2,$a_4,$b_7) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2); + $MULTU ($a_5,$b_6) # mul_add_c(a[5],b[6],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $c_2,$c_1,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_5,$b_6) + mfhi ($t_2,$a_5,$b_6) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2); + $MULTU ($a_6,$b_5) # mul_add_c(a[6],b[5],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_6,$b_5) + mfhi ($t_2,$a_6,$b_5) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2); + $MULTU ($a_7,$b_4) # mul_add_c(a[7],b[4],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_7,$b_4) + mfhi ($t_2,$a_7,$b_4) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3); + $MULTU ($a_7,$b_5) # mul_add_c(a[7],b[5],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,11*$BNSZ($a0) # r[11]=c3; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_7,$b_5) + mfhi ($t_2,$a_7,$b_5) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3); + $MULTU ($a_6,$b_6) # mul_add_c(a[6],b[6],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $c_3,$c_2,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_6,$b_6) + mfhi ($t_2,$a_6,$b_6) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3); + $MULTU ($a_5,$b_7) # mul_add_c(a[5],b[7],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_5,$b_7) + mfhi ($t_2,$a_5,$b_7) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1); + $MULTU ($a_6,$b_7) # mul_add_c(a[6],b[7],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at $ST $c_1,12*$BNSZ($a0) # r[12]=c1; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_6,$b_7) + mfhi ($t_2,$a_6,$b_7) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1); + $MULTU ($a_7,$b_6) # mul_add_c(a[7],b[6],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $c_1,$c_3,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_7,$b_6) + mfhi ($t_2,$a_7,$b_6) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2); + $MULTU ($a_7,$b_7) # mul_add_c(a[7],b[7],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,13*$BNSZ($a0) # r[13]=c2; - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_7,$b_7) + mfhi ($t_2,$a_7,$b_7) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at @@ -1709,144 +1732,144 @@ $code.=<<___; $LD $b_0,0($a2) $LD $a_1,$BNSZ($a1) 
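Every hunk in this file follows the same template: each `# mul_add_c(a[i],b[j],cX,cY,cZ)` comment names the C primitive that the surrounding $MULTU/mflo/mfhi/$ADDU/sltu group implements -- multiply two words and add the double-word product into a three-word carry accumulator. As a reading aid, here is a minimal C sketch of that primitive, assuming the 32-bit build where BN_ULONG is a 32-bit word ($BNSZ=4); the unsigned compares stand in for the sltu instructions, since MIPS has no carry flag:

    #include <stdint.h>

    typedef uint32_t BN_ULONG;              /* 32-bit build assumed */

    /* mul_add_c(a,b,c0,c1,c2): (c2,c1,c0) += a*b.  Each ADDU in the
     * assembly is followed by an sltu that recovers the carry, exactly
     * like the unsigned compares below. */
    static void mul_add_c(BN_ULONG a, BN_ULONG b,
                          BN_ULONG *c0, BN_ULONG *c1, BN_ULONG *c2)
    {
        uint64_t t  = (uint64_t)a * b;
        BN_ULONG lo = (BN_ULONG)t;          /* mflo */
        BN_ULONG hi = (BN_ULONG)(t >> 32);  /* mfhi */

        *c0 += lo;
        hi  += (*c0 < lo);   /* cannot wrap: hi <= 0xfffffffe here */
        *c1 += hi;
        *c2 += (*c1 < hi);
    }

The interleaving in the assembly -- the next $MULTU is issued before the previous product's carries are folded in -- keeps the multiplier busy while the ALU does the carry bookkeeping.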
$LD $a_2,2*$BNSZ($a1) - $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); + $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3); $LD $a_3,3*$BNSZ($a1) $LD $b_1,$BNSZ($a2) $LD $b_2,2*$BNSZ($a2) $LD $b_3,3*$BNSZ($a2) - mflo $c_1 - mfhi $c_2 + mflo ($c_1,$a_0,$b_0) + mfhi ($c_2,$a_0,$b_0) $ST $c_1,0($a0) - $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); - mflo $t_1 - mfhi $t_2 + $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1); + mflo ($t_1,$a_0,$b_1) + mfhi ($t_2,$a_0,$b_1) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); + $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1); $ADDU $c_3,$t_2,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_0) + mfhi ($t_2,$a_1,$b_0) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); + $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $c_1,$c_3,$t_2 $ST $c_2,$BNSZ($a0) - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_0) + mfhi ($t_2,$a_2,$b_0) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); + $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_1) + mfhi ($t_2,$a_1,$b_1) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); + $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $c_2,$c_1,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_0,$b_2) + mfhi ($t_2,$a_0,$b_2) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); + $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,2*$BNSZ($a0) - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_0,$b_3) + mfhi ($t_2,$a_0,$b_3) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); + $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $c_3,$c_2,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_2) + mfhi ($t_2,$a_1,$b_2) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); + $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_1) + mfhi ($t_2,$a_2,$b_1) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); + $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_0) + mfhi ($t_2,$a_3,$b_0) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); + $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at $ST $c_1,3*$BNSZ($a0) - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_1) + mfhi ($t_2,$a_3,$b_1) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); + $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $c_1,$c_3,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_2) + mfhi ($t_2,$a_2,$b_2) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); + $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_1,$b_3) + mfhi ($t_2,$a_1,$b_3) $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); 
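The comba routines schedule these primitives in product-scanning order: every partial product feeding r[k] is accumulated before r[k] is stored, and the three accumulators rotate roles from column to column. Read against the comments in this hunk, the 4x4 case reduces to the following C outline (an illustrative reconstruction built on the mul_add_c sketch above, not the module's literal code):

    /* 4x4 comba multiply: r[0..7] = a[0..3] * b[0..3], column by column. */
    static void mul_comba4(BN_ULONG r[8],
                           const BN_ULONG a[4], const BN_ULONG b[4])
    {
        BN_ULONG c1 = 0, c2 = 0, c3 = 0;

        mul_add_c(a[0], b[0], &c1, &c2, &c3); r[0] = c1; c1 = 0;

        mul_add_c(a[0], b[1], &c2, &c3, &c1);
        mul_add_c(a[1], b[0], &c2, &c3, &c1); r[1] = c2; c2 = 0;

        mul_add_c(a[2], b[0], &c3, &c1, &c2);
        mul_add_c(a[1], b[1], &c3, &c1, &c2);
        mul_add_c(a[0], b[2], &c3, &c1, &c2); r[2] = c3; c3 = 0;

        mul_add_c(a[0], b[3], &c1, &c2, &c3);
        mul_add_c(a[1], b[2], &c1, &c2, &c3);
        mul_add_c(a[2], b[1], &c1, &c2, &c3);
        mul_add_c(a[3], b[0], &c1, &c2, &c3); r[3] = c1; c1 = 0;

        mul_add_c(a[3], b[1], &c2, &c3, &c1);
        mul_add_c(a[2], b[2], &c2, &c3, &c1);
        mul_add_c(a[1], b[3], &c2, &c3, &c1); r[4] = c2; c2 = 0;

        mul_add_c(a[2], b[3], &c3, &c1, &c2);
        mul_add_c(a[3], b[2], &c3, &c1, &c2); r[5] = c3; c3 = 0;

        mul_add_c(a[3], b[3], &c1, &c2, &c3); r[6] = c1;
        r[7] = c2;
    }

The operand lists now carried by mflo/mfhi -- e.g. `mflo ($t_1,$a_2,$b_3)` -- look redundant on classic MIPS, where the result is implicit in HI/LO; evidently the extra operands give the perlasm layer enough information to re-emit each access as a three-operand multiply on ISAs (such as MIPS Release 6) that dropped the HI/LO registers.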
+ $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,4*$BNSZ($a0) - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_2,$b_3) + mfhi ($t_2,$a_2,$b_3) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); + $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $c_2,$c_1,$t_2 - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_2) + mfhi ($t_2,$a_3,$b_2) $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); + $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,5*$BNSZ($a0) - mflo $t_1 - mfhi $t_2 + mflo ($t_1,$a_3,$b_3) + mfhi ($t_2,$a_3,$b_3) $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at @@ -1881,11 +1904,9 @@ my ($hi,$lo,$c0,$c1,$c2, # commented as "forward multiplication" below]; )=@_; $code.=<<___; - mflo $lo - mfhi $hi $ADDU $c0,$lo sltu $at,$c0,$lo - $MULTU $an,$bn # forward multiplication + $MULTU ($an,$bn) # forward multiplication $ADDU $c0,$lo $ADDU $at,$hi sltu $lo,$c0,$lo @@ -1895,15 +1916,17 @@ ___ $code.=<<___ if (!$warm); sltu $c2,$c1,$at $ADDU $c1,$hi - sltu $hi,$c1,$hi - $ADDU $c2,$hi ___ $code.=<<___ if ($warm); sltu $at,$c1,$at $ADDU $c1,$hi $ADDU $c2,$at +___ +$code.=<<___; sltu $hi,$c1,$hi $ADDU $c2,$hi + mflo ($lo,$an,$bn) + mfhi ($hi,$an,$bn) ___ } @@ -1933,21 +1956,21 @@ $code.=<<___; $LD $a_2,2*$BNSZ($a1) $LD $a_3,3*$BNSZ($a1) - $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); + $MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3); $LD $a_4,4*$BNSZ($a1) $LD $a_5,5*$BNSZ($a1) $LD $a_6,6*$BNSZ($a1) $LD $a_7,7*$BNSZ($a1) - mflo $c_1 - mfhi $c_2 + mflo ($c_1,$a_0,$a_0) + mfhi ($c_2,$a_0,$a_0) $ST $c_1,0($a0) - $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); - mflo $t_1 - mfhi $t_2 + $MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1); + mflo ($t_1,$a_0,$a_1) + mfhi ($t_2,$a_0,$a_1) slt $c_1,$t_2,$zero $SLL $t_2,1 - $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); + $MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 @@ -1955,20 +1978,22 @@ $code.=<<___; sltu $at,$c_2,$t_1 $ADDU $c_3,$t_2,$at $ST $c_2,$BNSZ($a0) + mflo ($t_1,$a_2,$a_0) + mfhi ($t_2,$a_2,$a_0) ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2); $code.=<<___; - mflo $t_1 - mfhi $t_2 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); + $MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,2*$BNSZ($a0) + mflo ($t_1,$a_0,$a_3) + mfhi ($t_2,$a_0,$a_3) ___ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3); @@ -1982,16 +2007,16 @@ ___ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1); $code.=<<___; - mflo $t_1 - mfhi $t_2 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2); + $MULTU ($a_0,$a_5) # mul_add_c2(a[0],b[5],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,4*$BNSZ($a0) + mflo ($t_1,$a_0,$a_5) + mfhi ($t_2,$a_0,$a_5) ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2); @@ -2009,16 +2034,16 @@ ___ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3); $code.=<<___; - mflo $t_1 - mfhi $t_2 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1); + $MULTU ($a_0,$a_7) # 
mul_add_c2(a[0],b[7],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at $ST $c_1,6*$BNSZ($a0) + mflo ($t_1,$a_0,$a_7) + mfhi ($t_2,$a_0,$a_7) ___ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1); @@ -2038,16 +2063,16 @@ ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2); $code.=<<___; - mflo $t_1 - mfhi $t_2 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3); + $MULTU ($a_2,$a_7) # mul_add_c2(a[2],b[7],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,8*$BNSZ($a0) + mflo ($t_1,$a_2,$a_7) + mfhi ($t_2,$a_2,$a_7) ___ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3); @@ -2063,16 +2088,16 @@ ___ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1); $code.=<<___; - mflo $t_1 - mfhi $t_2 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2); + $MULTU ($a_4,$a_7) # mul_add_c2(a[4],b[7],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu $at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,10*$BNSZ($a0) + mflo ($t_1,$a_4,$a_7) + mfhi ($t_2,$a_4,$a_7) ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2); @@ -2084,24 +2109,22 @@ ___ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3); $code.=<<___; - mflo $t_1 - mfhi $t_2 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 - $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1); + $MULTU ($a_6,$a_7) # mul_add_c2(a[6],b[7],c2,c3,c1); $ADDU $t_2,$at $ADDU $c_2,$t_2 sltu $at,$c_2,$t_2 $ADDU $c_3,$at $ST $c_1,12*$BNSZ($a0) + mflo ($t_1,$a_6,$a_7) + mfhi ($t_2,$a_6,$a_7) ___ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2); $code.=<<___; $ST $c_2,13*$BNSZ($a0) - mflo $t_1 - mfhi $t_2 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 $ADDU $t_2,$at @@ -2145,19 +2168,19 @@ $code.=<<___; .set reorder $LD $a_0,0($a1) $LD $a_1,$BNSZ($a1) - $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); + $MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3); $LD $a_2,2*$BNSZ($a1) $LD $a_3,3*$BNSZ($a1) - mflo $c_1 - mfhi $c_2 + mflo ($c_1,$a_0,$a_0) + mfhi ($c_2,$a_0,$a_0) $ST $c_1,0($a0) - $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); - mflo $t_1 - mfhi $t_2 + $MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1); + mflo ($t_1,$a_0,$a_1) + mfhi ($t_2,$a_0,$a_1) slt $c_1,$t_2,$zero $SLL $t_2,1 - $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); + $MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2); slt $a2,$t_1,$zero $ADDU $t_2,$a2 $SLL $t_1,1 @@ -2165,20 +2188,22 @@ $code.=<<___; sltu $at,$c_2,$t_1 $ADDU $c_3,$t_2,$at $ST $c_2,$BNSZ($a0) + mflo ($t_1,$a_2,$a_0) + mfhi ($t_2,$a_2,$a_0) ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2); $code.=<<___; - mflo $t_1 - mfhi $t_2 $ADDU $c_3,$t_1 sltu $at,$c_3,$t_1 - $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); + $MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3); $ADDU $t_2,$at $ADDU $c_1,$t_2 sltu $at,$c_1,$t_2 $ADDU $c_2,$at $ST $c_3,2*$BNSZ($a0) + mflo ($t_1,$a_0,$a_3) + mfhi ($t_2,$a_0,$a_3) ___ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, $a_1,$a_2); # mul_add_c2(a2[1],b[2],c1,c2,c3); @@ -2190,24 +2215,22 @@ ___ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1); $code.=<<___; - mflo $t_1 - mfhi $t_2 $ADDU $c_2,$t_1 sltu $at,$c_2,$t_1 - $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); + $MULTU ($a_2,$a_3) # mul_add_c2(a[2],b[3],c3,c1,c2); $ADDU $t_2,$at $ADDU $c_3,$t_2 sltu 
$at,$c_3,$t_2 $ADDU $c_1,$at $ST $c_2,4*$BNSZ($a0) + mflo ($t_1,$a_2,$a_3) + mfhi ($t_2,$a_2,$a_3) ___ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3); $code.=<<___; $ST $c_3,5*$BNSZ($a0) - mflo $t_1 - mfhi $t_2 $ADDU $c_1,$t_1 sltu $at,$c_1,$t_1 $ADDU $t_2,$at diff --git a/crypto/bn/asm/mips3-mont.pl b/crypto/bn/asm/mips3-mont.pl deleted file mode 100755 index 8f9156e02af3..000000000000 --- a/crypto/bn/asm/mips3-mont.pl +++ /dev/null @@ -1,327 +0,0 @@ -#!/usr/bin/env perl -# -# ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== - -# This module doesn't present direct interest for OpenSSL, because it -# doesn't provide better performance for longer keys. While 512-bit -# RSA private key operations are 40% faster, 1024-bit ones are hardly -# faster at all, while longer key operations are slower by up to 20%. -# It might be of interest to embedded system developers though, as -# it's smaller than 1KB, yet offers ~3x improvement over compiler -# generated code. -# -# The module targets N32 and N64 MIPS ABIs and currently is a bit -# IRIX-centric, i.e. is likely to require adaptation for other OSes. - -# int bn_mul_mont( -$rp="a0"; # BN_ULONG *rp, -$ap="a1"; # const BN_ULONG *ap, -$bp="a2"; # const BN_ULONG *bp, -$np="a3"; # const BN_ULONG *np, -$n0="a4"; # const BN_ULONG *n0, -$num="a5"; # int num); - -$lo0="a6"; -$hi0="a7"; -$lo1="v0"; -$hi1="v1"; -$aj="t0"; -$bi="t1"; -$nj="t2"; -$tp="t3"; -$alo="s0"; -$ahi="s1"; -$nlo="s2"; -$nhi="s3"; -$tj="s4"; -$i="s5"; -$j="s6"; -$fp="t8"; -$m1="t9"; - -$FRAME=8*(2+8); - -$code=<<___; -#include <asm.h> -#include <regdef.h> - -.text - -.set noat -.set reorder - -.align 5 -.globl bn_mul_mont -.ent bn_mul_mont -bn_mul_mont: - .set noreorder - PTR_SUB sp,64 - move $fp,sp - .frame $fp,64,ra - slt AT,$num,4 - li v0,0 - beqzl AT,.Lproceed - nop - jr ra - PTR_ADD sp,$fp,64 - .set reorder -.align 5 -.Lproceed: - ld $n0,0($n0) - ld $bi,0($bp) # bp[0] - ld $aj,0($ap) # ap[0] - ld $nj,0($np) # np[0] - PTR_SUB sp,16 # place for two extra words - sll $num,3 - li AT,-4096 - PTR_SUB sp,$num - and sp,AT - - sd s0,0($fp) - sd s1,8($fp) - sd s2,16($fp) - sd s3,24($fp) - sd s4,32($fp) - sd s5,40($fp) - sd s6,48($fp) - sd s7,56($fp) - - dmultu $aj,$bi - ld $alo,8($ap) - ld $nlo,8($np) - mflo $lo0 - mfhi $hi0 - dmultu $lo0,$n0 - mflo $m1 - - dmultu $alo,$bi - mflo $alo - mfhi $ahi - - dmultu $nj,$m1 - mflo $lo1 - mfhi $hi1 - dmultu $nlo,$m1 - daddu $lo1,$lo0 - sltu AT,$lo1,$lo0 - daddu $hi1,AT - mflo $nlo - mfhi $nhi - - move $tp,sp - li $j,16 -.align 4 -.L1st: - .set noreorder - PTR_ADD $aj,$ap,$j - ld $aj,($aj) - PTR_ADD $nj,$np,$j - ld $nj,($nj) - - dmultu $aj,$bi - daddu $lo0,$alo,$hi0 - daddu $lo1,$nlo,$hi1 - sltu AT,$lo0,$hi0 - sltu s7,$lo1,$hi1 - daddu $hi0,$ahi,AT - daddu $hi1,$nhi,s7 - mflo $alo - mfhi $ahi - - daddu $lo1,$lo0 - sltu AT,$lo1,$lo0 - dmultu $nj,$m1 - daddu $hi1,AT - addu $j,8 - sd $lo1,($tp) - sltu s7,$j,$num - mflo $nlo - mfhi $nhi - - bnez s7,.L1st - PTR_ADD $tp,8 - .set reorder - - daddu $lo0,$alo,$hi0 - sltu AT,$lo0,$hi0 - daddu $hi0,$ahi,AT - - daddu $lo1,$nlo,$hi1 - sltu s7,$lo1,$hi1 - daddu $hi1,$nhi,s7 - daddu $lo1,$lo0 - sltu AT,$lo1,$lo0 - daddu $hi1,AT - - sd 
$lo1,($tp) - - daddu $hi1,$hi0 - sltu AT,$hi1,$hi0 - sd $hi1,8($tp) - sd AT,16($tp) - - li $i,8 -.align 4 -.Louter: - PTR_ADD $bi,$bp,$i - ld $bi,($bi) - ld $aj,($ap) - ld $alo,8($ap) - ld $tj,(sp) - - dmultu $aj,$bi - ld $nj,($np) - ld $nlo,8($np) - mflo $lo0 - mfhi $hi0 - daddu $lo0,$tj - dmultu $lo0,$n0 - sltu AT,$lo0,$tj - daddu $hi0,AT - mflo $m1 - - dmultu $alo,$bi - mflo $alo - mfhi $ahi - - dmultu $nj,$m1 - mflo $lo1 - mfhi $hi1 - - dmultu $nlo,$m1 - daddu $lo1,$lo0 - sltu AT,$lo1,$lo0 - daddu $hi1,AT - mflo $nlo - mfhi $nhi - - move $tp,sp - li $j,16 - ld $tj,8($tp) -.align 4 -.Linner: - .set noreorder - PTR_ADD $aj,$ap,$j - ld $aj,($aj) - PTR_ADD $nj,$np,$j - ld $nj,($nj) - - dmultu $aj,$bi - daddu $lo0,$alo,$hi0 - daddu $lo1,$nlo,$hi1 - sltu AT,$lo0,$hi0 - sltu s7,$lo1,$hi1 - daddu $hi0,$ahi,AT - daddu $hi1,$nhi,s7 - mflo $alo - mfhi $ahi - - daddu $lo0,$tj - addu $j,8 - dmultu $nj,$m1 - sltu AT,$lo0,$tj - daddu $lo1,$lo0 - daddu $hi0,AT - sltu s7,$lo1,$lo0 - ld $tj,16($tp) - daddu $hi1,s7 - sltu AT,$j,$num - mflo $nlo - mfhi $nhi - sd $lo1,($tp) - bnez AT,.Linner - PTR_ADD $tp,8 - .set reorder - - daddu $lo0,$alo,$hi0 - sltu AT,$lo0,$hi0 - daddu $hi0,$ahi,AT - daddu $lo0,$tj - sltu s7,$lo0,$tj - daddu $hi0,s7 - - ld $tj,16($tp) - daddu $lo1,$nlo,$hi1 - sltu AT,$lo1,$hi1 - daddu $hi1,$nhi,AT - daddu $lo1,$lo0 - sltu s7,$lo1,$lo0 - daddu $hi1,s7 - sd $lo1,($tp) - - daddu $lo1,$hi1,$hi0 - sltu $hi1,$lo1,$hi0 - daddu $lo1,$tj - sltu AT,$lo1,$tj - daddu $hi1,AT - sd $lo1,8($tp) - sd $hi1,16($tp) - - addu $i,8 - sltu s7,$i,$num - bnez s7,.Louter - - .set noreorder - PTR_ADD $tj,sp,$num # &tp[num] - move $tp,sp - move $ap,sp - li $hi0,0 # clear borrow bit - -.align 4 -.Lsub: ld $lo0,($tp) - ld $lo1,($np) - PTR_ADD $tp,8 - PTR_ADD $np,8 - dsubu $lo1,$lo0,$lo1 # tp[i]-np[i] - sgtu AT,$lo1,$lo0 - dsubu $lo0,$lo1,$hi0 - sgtu $hi0,$lo0,$lo1 - sd $lo0,($rp) - or $hi0,AT - sltu AT,$tp,$tj - bnez AT,.Lsub - PTR_ADD $rp,8 - - dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit - move $tp,sp - PTR_SUB $rp,$num # restore rp - not $hi1,$hi0 - - and $ap,$hi0,sp - and $bp,$hi1,$rp - or $ap,$ap,$bp # ap=borrow?tp:rp - -.align 4 -.Lcopy: ld $aj,($ap) - PTR_ADD $ap,8 - PTR_ADD $tp,8 - sd zero,-8($tp) - sltu AT,$tp,$tj - sd $aj,($rp) - bnez AT,.Lcopy - PTR_ADD $rp,8 - - ld s0,0($fp) - ld s1,8($fp) - ld s2,16($fp) - ld s3,24($fp) - ld s4,32($fp) - ld s5,40($fp) - ld s6,48($fp) - ld s7,56($fp) - li v0,1 - jr ra - PTR_ADD sp,$fp,64 - .set reorder -END(bn_mul_mont) -.rdata -.asciiz "Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>" -___ - -print $code; -close STDOUT; diff --git a/crypto/bn/asm/mips3.s b/crypto/bn/asm/mips3.s deleted file mode 100644 index dca4105c7db1..000000000000 --- a/crypto/bn/asm/mips3.s +++ /dev/null @@ -1,2201 +0,0 @@ -.rdata -.asciiz "mips3.s, Version 1.1" -.asciiz "MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" - -/* - * ==================================================================== - * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL - * project. - * - * Rights for redistribution and usage in source and binary forms are - * granted according to the OpenSSL license. Warranty of any kind is - * disclaimed. - * ==================================================================== - */ - -/* - * This is my modest contributon to the OpenSSL project (see - * http://www.openssl.org/ for more information about it) and is - * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c - * module. 
For updates see http://fy.chalmers.se/~appro/hpe/.
- *
- * The module is designed to work with either of the "new" MIPS ABI(5),
- * namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
- * IRIX 5.x, not only because 5.x doesn't support the new ABIs, but also
- * because 5.x kernels put the R4x00 CPU into 32-bit mode, so all those
- * 64-bit instructions (daddu, dmultu, etc.) found below would only
- * cause an illegal instruction exception:-(
- *
- * In addition, the code depends on preprocessor flags set up by the
- * MIPSpro compiler driver (either as or cc) and therefore (probably?)
- * can't be compiled by the GNU assembler. The GNU C driver manages
- * fine, though, as long as -mmips-as is specified or is the default
- * option, because then it simply invokes /usr/bin/as, which in turn
- * takes perfect care of the preprocessor definitions. Another neat
- * feature offered by the MIPSpro assembler is an optimization pass;
- * this gave me the opportunity to make the code look more regular, as
- * all the architecture-dependent instruction-rescheduling details were
- * left to the assembler. Cool, huh?
- *
- * The performance improvement is astonishing: 'apps/openssl speed rsa
- * dsa' goes well over 3 times faster!
- *
- * <appro@fy.chalmers.se>
- */
-#include <asm.h>
-#include <regdef.h>
-
-#if _MIPS_ISA>=4
-#define MOVNZ(cond,dst,src)	\
-	movn	dst,src,cond
-#else
-#define MOVNZ(cond,dst,src)	\
-	.set	noreorder;	\
-	bnezl	cond,.+8;	\
-	move	dst,src;	\
-	.set	reorder
-#endif
-
-.text
-
-.set	noat
-.set	reorder
-
-#define	MINUS4	v1
-
-.align	5
-LEAF(bn_mul_add_words)
-	.set	noreorder
-	bgtzl	a2,.L_bn_mul_add_words_proceed
-	ld	t0,0(a1)
-	jr	ra
-	move	v0,zero
-	.set	reorder
-
-.L_bn_mul_add_words_proceed:
-	li	MINUS4,-4
-	and	ta0,a2,MINUS4
-	move	v0,zero
-	beqz	ta0,.L_bn_mul_add_words_tail
-
-.L_bn_mul_add_words_loop:
-	dmultu	t0,a3
-	ld	t1,0(a0)
-	ld	t2,8(a1)
-	ld	t3,8(a0)
-	ld	ta0,16(a1)
-	ld	ta1,16(a0)
-	daddu	t1,v0
-	sltu	v0,t1,v0	/* All manuals say it "compares 32-bit
-				 * values", but it seems to work fine
-				 * even on 64-bit registers.
*/ - mflo AT - mfhi t0 - daddu t1,AT - daddu v0,t0 - sltu AT,t1,AT - sd t1,0(a0) - daddu v0,AT - - dmultu t2,a3 - ld ta2,24(a1) - ld ta3,24(a0) - daddu t3,v0 - sltu v0,t3,v0 - mflo AT - mfhi t2 - daddu t3,AT - daddu v0,t2 - sltu AT,t3,AT - sd t3,8(a0) - daddu v0,AT - - dmultu ta0,a3 - subu a2,4 - PTR_ADD a0,32 - PTR_ADD a1,32 - daddu ta1,v0 - sltu v0,ta1,v0 - mflo AT - mfhi ta0 - daddu ta1,AT - daddu v0,ta0 - sltu AT,ta1,AT - sd ta1,-16(a0) - daddu v0,AT - - - dmultu ta2,a3 - and ta0,a2,MINUS4 - daddu ta3,v0 - sltu v0,ta3,v0 - mflo AT - mfhi ta2 - daddu ta3,AT - daddu v0,ta2 - sltu AT,ta3,AT - sd ta3,-8(a0) - daddu v0,AT - .set noreorder - bgtzl ta0,.L_bn_mul_add_words_loop - ld t0,0(a1) - - bnezl a2,.L_bn_mul_add_words_tail - ld t0,0(a1) - .set reorder - -.L_bn_mul_add_words_return: - jr ra - -.L_bn_mul_add_words_tail: - dmultu t0,a3 - ld t1,0(a0) - subu a2,1 - daddu t1,v0 - sltu v0,t1,v0 - mflo AT - mfhi t0 - daddu t1,AT - daddu v0,t0 - sltu AT,t1,AT - sd t1,0(a0) - daddu v0,AT - beqz a2,.L_bn_mul_add_words_return - - ld t0,8(a1) - dmultu t0,a3 - ld t1,8(a0) - subu a2,1 - daddu t1,v0 - sltu v0,t1,v0 - mflo AT - mfhi t0 - daddu t1,AT - daddu v0,t0 - sltu AT,t1,AT - sd t1,8(a0) - daddu v0,AT - beqz a2,.L_bn_mul_add_words_return - - ld t0,16(a1) - dmultu t0,a3 - ld t1,16(a0) - daddu t1,v0 - sltu v0,t1,v0 - mflo AT - mfhi t0 - daddu t1,AT - daddu v0,t0 - sltu AT,t1,AT - sd t1,16(a0) - daddu v0,AT - jr ra -END(bn_mul_add_words) - -.align 5 -LEAF(bn_mul_words) - .set noreorder - bgtzl a2,.L_bn_mul_words_proceed - ld t0,0(a1) - jr ra - move v0,zero - .set reorder - -.L_bn_mul_words_proceed: - li MINUS4,-4 - and ta0,a2,MINUS4 - move v0,zero - beqz ta0,.L_bn_mul_words_tail - -.L_bn_mul_words_loop: - dmultu t0,a3 - ld t2,8(a1) - ld ta0,16(a1) - ld ta2,24(a1) - mflo AT - mfhi t0 - daddu v0,AT - sltu t1,v0,AT - sd v0,0(a0) - daddu v0,t1,t0 - - dmultu t2,a3 - subu a2,4 - PTR_ADD a0,32 - PTR_ADD a1,32 - mflo AT - mfhi t2 - daddu v0,AT - sltu t3,v0,AT - sd v0,-24(a0) - daddu v0,t3,t2 - - dmultu ta0,a3 - mflo AT - mfhi ta0 - daddu v0,AT - sltu ta1,v0,AT - sd v0,-16(a0) - daddu v0,ta1,ta0 - - - dmultu ta2,a3 - and ta0,a2,MINUS4 - mflo AT - mfhi ta2 - daddu v0,AT - sltu ta3,v0,AT - sd v0,-8(a0) - daddu v0,ta3,ta2 - .set noreorder - bgtzl ta0,.L_bn_mul_words_loop - ld t0,0(a1) - - bnezl a2,.L_bn_mul_words_tail - ld t0,0(a1) - .set reorder - -.L_bn_mul_words_return: - jr ra - -.L_bn_mul_words_tail: - dmultu t0,a3 - subu a2,1 - mflo AT - mfhi t0 - daddu v0,AT - sltu t1,v0,AT - sd v0,0(a0) - daddu v0,t1,t0 - beqz a2,.L_bn_mul_words_return - - ld t0,8(a1) - dmultu t0,a3 - subu a2,1 - mflo AT - mfhi t0 - daddu v0,AT - sltu t1,v0,AT - sd v0,8(a0) - daddu v0,t1,t0 - beqz a2,.L_bn_mul_words_return - - ld t0,16(a1) - dmultu t0,a3 - mflo AT - mfhi t0 - daddu v0,AT - sltu t1,v0,AT - sd v0,16(a0) - daddu v0,t1,t0 - jr ra -END(bn_mul_words) - -.align 5 -LEAF(bn_sqr_words) - .set noreorder - bgtzl a2,.L_bn_sqr_words_proceed - ld t0,0(a1) - jr ra - move v0,zero - .set reorder - -.L_bn_sqr_words_proceed: - li MINUS4,-4 - and ta0,a2,MINUS4 - move v0,zero - beqz ta0,.L_bn_sqr_words_tail - -.L_bn_sqr_words_loop: - dmultu t0,t0 - ld t2,8(a1) - ld ta0,16(a1) - ld ta2,24(a1) - mflo t1 - mfhi t0 - sd t1,0(a0) - sd t0,8(a0) - - dmultu t2,t2 - subu a2,4 - PTR_ADD a0,64 - PTR_ADD a1,32 - mflo t3 - mfhi t2 - sd t3,-48(a0) - sd t2,-40(a0) - - dmultu ta0,ta0 - mflo ta1 - mfhi ta0 - sd ta1,-32(a0) - sd ta0,-24(a0) - - - dmultu ta2,ta2 - and ta0,a2,MINUS4 - mflo ta3 - mfhi ta2 - sd ta3,-16(a0) - sd ta2,-8(a0) - - .set noreorder - bgtzl 
ta0,.L_bn_sqr_words_loop - ld t0,0(a1) - - bnezl a2,.L_bn_sqr_words_tail - ld t0,0(a1) - .set reorder - -.L_bn_sqr_words_return: - move v0,zero - jr ra - -.L_bn_sqr_words_tail: - dmultu t0,t0 - subu a2,1 - mflo t1 - mfhi t0 - sd t1,0(a0) - sd t0,8(a0) - beqz a2,.L_bn_sqr_words_return - - ld t0,8(a1) - dmultu t0,t0 - subu a2,1 - mflo t1 - mfhi t0 - sd t1,16(a0) - sd t0,24(a0) - beqz a2,.L_bn_sqr_words_return - - ld t0,16(a1) - dmultu t0,t0 - mflo t1 - mfhi t0 - sd t1,32(a0) - sd t0,40(a0) - jr ra -END(bn_sqr_words) - -.align 5 -LEAF(bn_add_words) - .set noreorder - bgtzl a3,.L_bn_add_words_proceed - ld t0,0(a1) - jr ra - move v0,zero - .set reorder - -.L_bn_add_words_proceed: - li MINUS4,-4 - and AT,a3,MINUS4 - move v0,zero - beqz AT,.L_bn_add_words_tail - -.L_bn_add_words_loop: - ld ta0,0(a2) - subu a3,4 - ld t1,8(a1) - and AT,a3,MINUS4 - ld t2,16(a1) - PTR_ADD a2,32 - ld t3,24(a1) - PTR_ADD a0,32 - ld ta1,-24(a2) - PTR_ADD a1,32 - ld ta2,-16(a2) - ld ta3,-8(a2) - daddu ta0,t0 - sltu t8,ta0,t0 - daddu t0,ta0,v0 - sltu v0,t0,ta0 - sd t0,-32(a0) - daddu v0,t8 - - daddu ta1,t1 - sltu t9,ta1,t1 - daddu t1,ta1,v0 - sltu v0,t1,ta1 - sd t1,-24(a0) - daddu v0,t9 - - daddu ta2,t2 - sltu t8,ta2,t2 - daddu t2,ta2,v0 - sltu v0,t2,ta2 - sd t2,-16(a0) - daddu v0,t8 - - daddu ta3,t3 - sltu t9,ta3,t3 - daddu t3,ta3,v0 - sltu v0,t3,ta3 - sd t3,-8(a0) - daddu v0,t9 - - .set noreorder - bgtzl AT,.L_bn_add_words_loop - ld t0,0(a1) - - bnezl a3,.L_bn_add_words_tail - ld t0,0(a1) - .set reorder - -.L_bn_add_words_return: - jr ra - -.L_bn_add_words_tail: - ld ta0,0(a2) - daddu ta0,t0 - subu a3,1 - sltu t8,ta0,t0 - daddu t0,ta0,v0 - sltu v0,t0,ta0 - sd t0,0(a0) - daddu v0,t8 - beqz a3,.L_bn_add_words_return - - ld t1,8(a1) - ld ta1,8(a2) - daddu ta1,t1 - subu a3,1 - sltu t9,ta1,t1 - daddu t1,ta1,v0 - sltu v0,t1,ta1 - sd t1,8(a0) - daddu v0,t9 - beqz a3,.L_bn_add_words_return - - ld t2,16(a1) - ld ta2,16(a2) - daddu ta2,t2 - sltu t8,ta2,t2 - daddu t2,ta2,v0 - sltu v0,t2,ta2 - sd t2,16(a0) - daddu v0,t8 - jr ra -END(bn_add_words) - -.align 5 -LEAF(bn_sub_words) - .set noreorder - bgtzl a3,.L_bn_sub_words_proceed - ld t0,0(a1) - jr ra - move v0,zero - .set reorder - -.L_bn_sub_words_proceed: - li MINUS4,-4 - and AT,a3,MINUS4 - move v0,zero - beqz AT,.L_bn_sub_words_tail - -.L_bn_sub_words_loop: - ld ta0,0(a2) - subu a3,4 - ld t1,8(a1) - and AT,a3,MINUS4 - ld t2,16(a1) - PTR_ADD a2,32 - ld t3,24(a1) - PTR_ADD a0,32 - ld ta1,-24(a2) - PTR_ADD a1,32 - ld ta2,-16(a2) - ld ta3,-8(a2) - sltu t8,t0,ta0 - dsubu t0,ta0 - dsubu ta0,t0,v0 - sd ta0,-32(a0) - MOVNZ (t0,v0,t8) - - sltu t9,t1,ta1 - dsubu t1,ta1 - dsubu ta1,t1,v0 - sd ta1,-24(a0) - MOVNZ (t1,v0,t9) - - - sltu t8,t2,ta2 - dsubu t2,ta2 - dsubu ta2,t2,v0 - sd ta2,-16(a0) - MOVNZ (t2,v0,t8) - - sltu t9,t3,ta3 - dsubu t3,ta3 - dsubu ta3,t3,v0 - sd ta3,-8(a0) - MOVNZ (t3,v0,t9) - - .set noreorder - bgtzl AT,.L_bn_sub_words_loop - ld t0,0(a1) - - bnezl a3,.L_bn_sub_words_tail - ld t0,0(a1) - .set reorder - -.L_bn_sub_words_return: - jr ra - -.L_bn_sub_words_tail: - ld ta0,0(a2) - subu a3,1 - sltu t8,t0,ta0 - dsubu t0,ta0 - dsubu ta0,t0,v0 - MOVNZ (t0,v0,t8) - sd ta0,0(a0) - beqz a3,.L_bn_sub_words_return - - ld t1,8(a1) - subu a3,1 - ld ta1,8(a2) - sltu t9,t1,ta1 - dsubu t1,ta1 - dsubu ta1,t1,v0 - MOVNZ (t1,v0,t9) - sd ta1,8(a0) - beqz a3,.L_bn_sub_words_return - - ld t2,16(a1) - ld ta2,16(a2) - sltu t8,t2,ta2 - dsubu t2,ta2 - dsubu ta2,t2,v0 - MOVNZ (t2,v0,t8) - sd ta2,16(a0) - jr ra -END(bn_sub_words) - -#undef MINUS4 - -.align 5 -LEAF(bn_div_3_words) - .set reorder - 
move a3,a0 /* we know that bn_div_words doesn't - * touch a3, ta2, ta3 and preserves a2 - * so that we can save two arguments - * and return address in registers - * instead of stack:-) - */ - ld a0,(a3) - move ta2,a1 - ld a1,-8(a3) - bne a0,a2,.L_bn_div_3_words_proceed - li v0,-1 - jr ra -.L_bn_div_3_words_proceed: - move ta3,ra - bal bn_div_words - move ra,ta3 - dmultu ta2,v0 - ld t2,-16(a3) - move ta0,zero - mfhi t1 - mflo t0 - sltu t8,t1,v1 -.L_bn_div_3_words_inner_loop: - bnez t8,.L_bn_div_3_words_inner_loop_done - sgeu AT,t2,t0 - seq t9,t1,v1 - and AT,t9 - sltu t3,t0,ta2 - daddu v1,a2 - dsubu t1,t3 - dsubu t0,ta2 - sltu t8,t1,v1 - sltu ta0,v1,a2 - or t8,ta0 - .set noreorder - beqzl AT,.L_bn_div_3_words_inner_loop - dsubu v0,1 - .set reorder -.L_bn_div_3_words_inner_loop_done: - jr ra -END(bn_div_3_words) - -.align 5 -LEAF(bn_div_words) - .set noreorder - bnezl a2,.L_bn_div_words_proceed - move v1,zero - jr ra - li v0,-1 /* I'd rather signal div-by-zero - * which can be done with 'break 7' */ - -.L_bn_div_words_proceed: - bltz a2,.L_bn_div_words_body - move t9,v1 - dsll a2,1 - bgtz a2,.-4 - addu t9,1 - - .set reorder - negu t1,t9 - li t2,-1 - dsll t2,t1 - and t2,a0 - dsrl AT,a1,t1 - .set noreorder - bnezl t2,.+8 - break 6 /* signal overflow */ - .set reorder - dsll a0,t9 - dsll a1,t9 - or a0,AT - -#define QT ta0 -#define HH ta1 -#define DH v1 -.L_bn_div_words_body: - dsrl DH,a2,32 - sgeu AT,a0,a2 - .set noreorder - bnezl AT,.+8 - dsubu a0,a2 - .set reorder - - li QT,-1 - dsrl HH,a0,32 - dsrl QT,32 /* q=0xffffffff */ - beq DH,HH,.L_bn_div_words_skip_div1 - ddivu zero,a0,DH - mflo QT -.L_bn_div_words_skip_div1: - dmultu a2,QT - dsll t3,a0,32 - dsrl AT,a1,32 - or t3,AT - mflo t0 - mfhi t1 -.L_bn_div_words_inner_loop1: - sltu t2,t3,t0 - seq t8,HH,t1 - sltu AT,HH,t1 - and t2,t8 - sltu v0,t0,a2 - or AT,t2 - .set noreorder - beqz AT,.L_bn_div_words_inner_loop1_done - dsubu t1,v0 - dsubu t0,a2 - b .L_bn_div_words_inner_loop1 - dsubu QT,1 - .set reorder -.L_bn_div_words_inner_loop1_done: - - dsll a1,32 - dsubu a0,t3,t0 - dsll v0,QT,32 - - li QT,-1 - dsrl HH,a0,32 - dsrl QT,32 /* q=0xffffffff */ - beq DH,HH,.L_bn_div_words_skip_div2 - ddivu zero,a0,DH - mflo QT -.L_bn_div_words_skip_div2: -#undef DH - dmultu a2,QT - dsll t3,a0,32 - dsrl AT,a1,32 - or t3,AT - mflo t0 - mfhi t1 -.L_bn_div_words_inner_loop2: - sltu t2,t3,t0 - seq t8,HH,t1 - sltu AT,HH,t1 - and t2,t8 - sltu v1,t0,a2 - or AT,t2 - .set noreorder - beqz AT,.L_bn_div_words_inner_loop2_done - dsubu t1,v1 - dsubu t0,a2 - b .L_bn_div_words_inner_loop2 - dsubu QT,1 - .set reorder -.L_bn_div_words_inner_loop2_done: -#undef HH - - dsubu a0,t3,t0 - or v0,QT - dsrl v1,a0,t9 /* v1 contains remainder if anybody wants it */ - dsrl a2,t9 /* restore a2 */ - jr ra -#undef QT -END(bn_div_words) - -#define a_0 t0 -#define a_1 t1 -#define a_2 t2 -#define a_3 t3 -#define b_0 ta0 -#define b_1 ta1 -#define b_2 ta2 -#define b_3 ta3 - -#define a_4 s0 -#define a_5 s2 -#define a_6 s4 -#define a_7 a1 /* once we load a[7] we don't need a anymore */ -#define b_4 s1 -#define b_5 s3 -#define b_6 s5 -#define b_7 a2 /* once we load b[7] we don't need b anymore */ - -#define t_1 t8 -#define t_2 t9 - -#define c_1 v0 -#define c_2 v1 -#define c_3 a3 - -#define FRAME_SIZE 48 - -.align 5 -LEAF(bn_mul_comba8) - .set noreorder - PTR_SUB sp,FRAME_SIZE - .frame sp,64,ra - .set reorder - ld a_0,0(a1) /* If compiled with -mips3 option on - * R5000 box assembler barks on this - * line with "shouldn't have mult/div - * as last instruction in bb (R10K - * bug)" warning. 
If anybody out there - * has a clue about how to circumvent - * this do send me a note. - * <appro@fy.chalmers.se> - */ - ld b_0,0(a2) - ld a_1,8(a1) - ld a_2,16(a1) - ld a_3,24(a1) - ld b_1,8(a2) - ld b_2,16(a2) - ld b_3,24(a2) - dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ - sd s0,0(sp) - sd s1,8(sp) - sd s2,16(sp) - sd s3,24(sp) - sd s4,32(sp) - sd s5,40(sp) - mflo c_1 - mfhi c_2 - - dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */ - ld a_4,32(a1) - ld a_5,40(a1) - ld a_6,48(a1) - ld a_7,56(a1) - ld b_4,32(a2) - ld b_5,40(a2) - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu c_3,t_2,AT - dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */ - ld b_6,48(a2) - ld b_7,56(a2) - sd c_1,0(a0) /* r[0]=c1; */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - sd c_2,8(a0) /* r[1]=c2; */ - - dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,16(a0) /* r[2]=c3; */ - - dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu c_3,c_2,t_2 - dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,24(a0) /* r[3]=c1; */ - - dmultu a_4,b_0 /* mul_add_c(a[4],b[0],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_0,b_4 /* mul_add_c(a[0],b[4],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,32(a0) /* r[4]=c2; */ - - dmultu a_0,b_5 /* mul_add_c(a[0],b[5],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_1,b_4 /* mul_add_c(a[1],b[4],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */ - mflo 
t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_4,b_1 /* mul_add_c(a[4],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_5,b_0 /* mul_add_c(a[5],b[0],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,40(a0) /* r[5]=c3; */ - - dmultu a_6,b_0 /* mul_add_c(a[6],b[0],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu c_3,c_2,t_2 - dmultu a_5,b_1 /* mul_add_c(a[5],b[1],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_4,b_2 /* mul_add_c(a[4],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_2,b_4 /* mul_add_c(a[2],b[4],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_1,b_5 /* mul_add_c(a[1],b[5],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_0,b_6 /* mul_add_c(a[0],b[6],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,48(a0) /* r[6]=c1; */ - - dmultu a_0,b_7 /* mul_add_c(a[0],b[7],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - dmultu a_1,b_6 /* mul_add_c(a[1],b[6],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_2,b_5 /* mul_add_c(a[2],b[5],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_3,b_4 /* mul_add_c(a[3],b[4],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_4,b_3 /* mul_add_c(a[4],b[3],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_5,b_2 /* mul_add_c(a[5],b[2],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_6,b_1 /* mul_add_c(a[6],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_7,b_0 /* mul_add_c(a[7],b[0],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,56(a0) /* r[7]=c2; */ - - dmultu a_7,b_1 /* mul_add_c(a[7],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_6,b_2 /* mul_add_c(a[6],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_5,b_3 /* mul_add_c(a[5],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - 
daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_4,b_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_3,b_5 /* mul_add_c(a[3],b[5],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_2,b_6 /* mul_add_c(a[2],b[6],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_1,b_7 /* mul_add_c(a[1],b[7],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,64(a0) /* r[8]=c3; */ - - dmultu a_2,b_7 /* mul_add_c(a[2],b[7],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu c_3,c_2,t_2 - dmultu a_3,b_6 /* mul_add_c(a[3],b[6],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_4,b_5 /* mul_add_c(a[4],b[5],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_5,b_4 /* mul_add_c(a[5],b[4],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_6,b_3 /* mul_add_c(a[6],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_7,b_2 /* mul_add_c(a[7],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,72(a0) /* r[9]=c1; */ - - dmultu a_7,b_3 /* mul_add_c(a[7],b[3],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - dmultu a_6,b_4 /* mul_add_c(a[6],b[4],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_5,b_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_4,b_6 /* mul_add_c(a[4],b[6],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_3,b_7 /* mul_add_c(a[3],b[7],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,80(a0) /* r[10]=c2; */ - - dmultu a_4,b_7 /* mul_add_c(a[4],b[7],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_5,b_6 /* mul_add_c(a[5],b[6],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_6,b_5 /* mul_add_c(a[6],b[5],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_7,b_4 /* mul_add_c(a[7],b[4],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,88(a0) /* r[11]=c3; */ - - dmultu a_7,b_5 /* mul_add_c(a[7],b[5],c1,c2,c3); */ - mflo 
t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu c_3,c_2,t_2 - dmultu a_6,b_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_5,b_7 /* mul_add_c(a[5],b[7],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,96(a0) /* r[12]=c1; */ - - dmultu a_6,b_7 /* mul_add_c(a[6],b[7],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - dmultu a_7,b_6 /* mul_add_c(a[7],b[6],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,104(a0) /* r[13]=c2; */ - - dmultu a_7,b_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */ - ld s0,0(sp) - ld s1,8(sp) - ld s2,16(sp) - ld s3,24(sp) - ld s4,32(sp) - ld s5,40(sp) - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sd c_3,112(a0) /* r[14]=c3; */ - sd c_1,120(a0) /* r[15]=c1; */ - - PTR_ADD sp,FRAME_SIZE - - jr ra -END(bn_mul_comba8) - -.align 5 -LEAF(bn_mul_comba4) - .set reorder - ld a_0,0(a1) - ld b_0,0(a2) - ld a_1,8(a1) - ld a_2,16(a1) - dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ - ld a_3,24(a1) - ld b_1,8(a2) - ld b_2,16(a2) - ld b_3,24(a2) - mflo c_1 - mfhi c_2 - sd c_1,0(a0) - - dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu c_3,t_2,AT - dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - sd c_2,8(a0) - - dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,16(a0) - - dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu c_3,c_2,t_2 - dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,24(a0) - - dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu c_1,c_3,t_2 - dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,32(a0) - - dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */ - mflo 
t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu c_2,c_1,t_2 - dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,40(a0) - - dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sd c_1,48(a0) - sd c_2,56(a0) - - jr ra -END(bn_mul_comba4) - -#undef a_4 -#undef a_5 -#undef a_6 -#undef a_7 -#define a_4 b_0 -#define a_5 b_1 -#define a_6 b_2 -#define a_7 b_3 - -.align 5 -LEAF(bn_sqr_comba8) - .set reorder - ld a_0,0(a1) - ld a_1,8(a1) - ld a_2,16(a1) - ld a_3,24(a1) - - dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ - ld a_4,32(a1) - ld a_5,40(a1) - ld a_6,48(a1) - ld a_7,56(a1) - mflo c_1 - mfhi c_2 - sd c_1,0(a0) - - dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu c_3,t_2,AT - sd c_2,8(a0) - - dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,16(a0) - - dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt c_3,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,24(a0) - - dmultu a_4,a_0 /* mul_add_c2(a[4],b[0],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_1,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,32(a0) - - dmultu a_0,a_5 /* mul_add_c2(a[0],b[5],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_1,a_4 /* mul_add_c2(a[1],b[4],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_2,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_2,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 
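The squaring routines handle the off-diagonal terms with mul_add_c2, which exploits a[i]*a[j] == a[j]*a[i]: one multiply, with the product doubled before accumulation. The slt-against-zero / dsll pairs in the assembly capture the bits shifted out by the doubling. In the same sketch style as above (32-bit words assumed):

    /* mul_add_c2(a,b,c0,c1,c2): (c2,c1,c0) += 2*a*b.  The slt/dsll
     * sequences in the assembly implement the doubling; the top bits
     * shifted out of hi and lo are re-injected one word up. */
    static void mul_add_c2(BN_ULONG a, BN_ULONG b,
                           BN_ULONG *c0, BN_ULONG *c1, BN_ULONG *c2)
    {
        uint64_t t  = (uint64_t)a * b;
        BN_ULONG lo = (BN_ULONG)t;
        BN_ULONG hi = (BN_ULONG)(t >> 32);

        *c2 += hi >> 31;                  /* slt ...,$t_2,$zero */
        hi   = (hi << 1) | (lo >> 31);    /* dsll, plus lo's top bit */
        lo <<= 1;

        *c0 += lo;
        hi  += (*c0 < lo);                /* still cannot wrap */
        *c1 += hi;
        *c2 += (*c1 < hi);
    }

Diagonal terms a[i]*a[i] appear only once, so they keep using plain mul_add_c.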
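For reference against the mips3-mont.pl removal above: that module implemented word-serial Montgomery multiplication, which its .Louter/.Linner loops spell out in software-pipelined form. A compact C model of the algorithm, assuming 32-bit words, n0 == -n^(-1) mod 2^32, and a caller-supplied scratch array t[] of num+2 words (the function shape is illustrative, not the deleted module's exact interface):

    /* r = a * b * 2^(-32*num) mod n  (Montgomery product). */
    static void mont_mul(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                         const BN_ULONG *n, BN_ULONG n0, int num, BN_ULONG *t)
    {
        int i, j;
        uint64_t v;

        for (j = 0; j < num + 2; j++)
            t[j] = 0;

        for (i = 0; i < num; i++) {          /* .Louter */
            BN_ULONG carry = 0, m;

            for (j = 0; j < num; j++) {      /* t += a * b[i] */
                v = (uint64_t)a[j] * b[i] + t[j] + carry;
                t[j]  = (BN_ULONG)v;
                carry = (BN_ULONG)(v >> 32);
            }
            v = (uint64_t)t[num] + carry;
            t[num]     = (BN_ULONG)v;
            t[num + 1] = (BN_ULONG)(v >> 32);

            m = t[0] * n0;                   /* makes t[0] vanish below */
            v = (uint64_t)m * n[0] + t[0];
            carry = (BN_ULONG)(v >> 32);
            for (j = 1; j < num; j++) {      /* t = (t + m*n) >> 32 */
                v = (uint64_t)m * n[j] + t[j] + carry;
                t[j - 1] = (BN_ULONG)v;
                carry    = (BN_ULONG)(v >> 32);
            }
            v = (uint64_t)t[num] + carry;
            t[num - 1] = (BN_ULONG)v;
            t[num]     = t[num + 1] + (BN_ULONG)(v >> 32);
        }

        /* Final conditional subtraction (.Lsub/.Lcopy): r = t - n unless
         * the subtraction borrows past the top word, else r = t. */
        {
            BN_ULONG borrow = 0;
            for (j = 0; j < num; j++) {
                v = (uint64_t)t[j] - n[j] - borrow;
                r[j]   = (BN_ULONG)v;
                borrow = (BN_ULONG)((v >> 32) & 1);
            }
            if (t[num] < borrow)             /* t < n: keep t */
                for (j = 0; j < num; j++)
                    r[j] = t[j];
        }
    }

The deleted assembly keeps two multiplies in flight per .Linner iteration (a[j]*b[i] and n[j]*m), overlapping dmultu latency with the carry chains -- the same scheduling idea as in the comba routines.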
- sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,40(a0) - - dmultu a_6,a_0 /* mul_add_c2(a[6],b[0],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt c_3,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_5,a_1 /* mul_add_c2(a[5],b[1],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_4,a_2 /* mul_add_c2(a[4],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,48(a0) - - dmultu a_0,a_7 /* mul_add_c2(a[0],b[7],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_1,a_6 /* mul_add_c2(a[1],b[6],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_1,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_2,a_5 /* mul_add_c2(a[2],b[5],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_1,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_3,a_4 /* mul_add_c2(a[3],b[4],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_1,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,56(a0) - - dmultu a_7,a_1 /* mul_add_c2(a[7],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_6,a_2 /* mul_add_c2(a[6],b[2],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_2,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_5,a_3 /* mul_add_c2(a[5],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_2,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_4,a_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,64(a0) - - dmultu a_2,a_7 /* mul_add_c2(a[2],b[7],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt c_3,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_3,a_6 /* mul_add_c2(a[3],b[6],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - 
daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_4,a_5 /* mul_add_c2(a[4],b[5],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,72(a0) - - dmultu a_7,a_3 /* mul_add_c2(a[7],b[3],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_6,a_4 /* mul_add_c2(a[6],b[4],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_1,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_5,a_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,80(a0) - - dmultu a_4,a_7 /* mul_add_c2(a[4],b[7],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_5,a_6 /* mul_add_c2(a[5],b[6],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_2,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,88(a0) - - dmultu a_7,a_5 /* mul_add_c2(a[7],b[5],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt c_3,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_6,a_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,96(a0) - - dmultu a_6,a_7 /* mul_add_c2(a[6],b[7],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,104(a0) - - dmultu a_7,a_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sd c_3,112(a0) - sd c_1,120(a0) - - jr ra -END(bn_sqr_comba8) - -.align 5 -LEAF(bn_sqr_comba4) - .set reorder - ld a_0,0(a1) - ld a_1,8(a1) - ld a_2,16(a1) - ld a_3,24(a1) - dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ - mflo c_1 - mfhi c_2 - sd c_1,0(a0) - - dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu c_3,t_2,AT - sd c_2,8(a0) - - dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,16(a0) - - dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt c_3,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu 
AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - slt AT,t_2,zero - daddu c_3,AT - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sltu AT,c_2,t_2 - daddu c_3,AT - sd c_1,24(a0) - - dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - slt c_1,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ - mflo t_1 - mfhi t_2 - daddu c_2,t_1 - sltu AT,c_2,t_1 - daddu t_2,AT - daddu c_3,t_2 - sltu AT,c_3,t_2 - daddu c_1,AT - sd c_2,32(a0) - - dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */ - mflo t_1 - mfhi t_2 - slt c_2,t_2,zero - dsll t_2,1 - slt a2,t_1,zero - daddu t_2,a2 - dsll t_1,1 - daddu c_3,t_1 - sltu AT,c_3,t_1 - daddu t_2,AT - daddu c_1,t_2 - sltu AT,c_1,t_2 - daddu c_2,AT - sd c_3,40(a0) - - dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ - mflo t_1 - mfhi t_2 - daddu c_1,t_1 - sltu AT,c_1,t_1 - daddu t_2,AT - daddu c_2,t_2 - sd c_1,48(a0) - sd c_2,56(a0) - - jr ra -END(bn_sqr_comba4) diff --git a/crypto/bn/asm/pa-risc2.s b/crypto/bn/asm/pa-risc2.s deleted file mode 100644 index f3b16290eb04..000000000000 --- a/crypto/bn/asm/pa-risc2.s +++ /dev/null @@ -1,1618 +0,0 @@ -; -; PA-RISC 2.0 implementation of bn_asm code, based on the -; 64-bit version of the code. This code is effectively the -; same as the 64-bit version except the register model is -; slightly different given all values must be 32-bit between -; function calls. Thus the 64-bit return values are returned -; in %ret0 and %ret1 vs just %ret0 as is done in the 64-bit version -; -; -; This code is approximately 2x faster than the C version -; for RSA/DSA. -; -; See http://devresource.hp.com/ for more details on the PA-RISC -; architecture. Also see the book "PA-RISC 2.0 Architecture" -; by Gerry Kane for information on the instruction set architecture. -; -; Code written by Chris Ruemmler (with some help from the HP C -; compiler). -; -; The code compiles with HP's assembler -; - - .level 2.0N - .space $TEXT$ - .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY - -; -; Global Register definitions used for the routines. -; -; Some information about HP's runtime architecture for 32-bits. -; -; "Caller save" means the calling function must save the register -; if it wants the register to be preserved. -; "Callee save" means if a function uses the register, it must save -; the value before using it. -; -; For the floating point registers -; -; "caller save" registers: fr4-fr11, fr22-fr31 -; "callee save" registers: fr12-fr21 -; "special" registers: fr0-fr3 (status and exception registers) -; -; For the integer registers -; value zero : r0 -; "caller save" registers: r1,r19-r26 -; "callee save" registers: r3-r18 -; return register : r2 (rp) -; return values ; r28,r29 (ret0,ret1) -; Stack pointer ; r30 (sp) -; millicode return ptr ; r31 (also a caller save register) - - -; -; Arguments to the routines -; -r_ptr .reg %r26 -a_ptr .reg %r25 -b_ptr .reg %r24 -num .reg %r24 -n .reg %r23 - -; -; Note that the "w" argument for bn_mul_add_words and bn_mul_words -; is passed on the stack at a delta of -56 from the top of stack -; as the routine is entered.
-; - -; -; Globals used in some routines -; - -top_overflow .reg %r23 -high_mask .reg %r22 ; value 0xffffffff80000000L - - -;------------------------------------------------------------------------------ -; -; bn_mul_add_words -; -;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr, -; int num, BN_ULONG w) -; -; arg0 = r_ptr -; arg1 = a_ptr -; arg3 = num -; -56(sp) = w -; -; Local register definitions -; - -fm1 .reg %fr22 -fm .reg %fr23 -ht_temp .reg %fr24 -ht_temp_1 .reg %fr25 -lt_temp .reg %fr26 -lt_temp_1 .reg %fr27 -fm1_1 .reg %fr28 -fm_1 .reg %fr29 - -fw_h .reg %fr7L -fw_l .reg %fr7R -fw .reg %fr7 - -fht_0 .reg %fr8L -flt_0 .reg %fr8R -t_float_0 .reg %fr8 - -fht_1 .reg %fr9L -flt_1 .reg %fr9R -t_float_1 .reg %fr9 - -tmp_0 .reg %r31 -tmp_1 .reg %r21 -m_0 .reg %r20 -m_1 .reg %r19 -ht_0 .reg %r1 -ht_1 .reg %r3 -lt_0 .reg %r4 -lt_1 .reg %r5 -m1_0 .reg %r6 -m1_1 .reg %r7 -rp_val .reg %r8 -rp_val_1 .reg %r9 - -bn_mul_add_words - .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN - .proc - .callinfo frame=128 - .entry - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - NOP ; Needed to make the loop 16-byte aligned - NOP ; Needed to make the loop 16-byte aligned - - STD %r5,16(%sp) ; save r5 - NOP - STD %r6,24(%sp) ; save r6 - STD %r7,32(%sp) ; save r7 - - STD %r8,40(%sp) ; save r8 - STD %r9,48(%sp) ; save r9 - COPY %r0,%ret1 ; return 0 by default - DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 - - CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit - LDO 128(%sp),%sp ; bump stack - - ; - ; The loop is unrolled twice, so if there is only 1 number - ; then go straight to the cleanup code. - ; - CMPIB,= 1,num,bn_mul_add_words_single_top - FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l) - - ; - ; This loop is unrolled 2 times (64-byte aligned as well) - ; - ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus - ; two 32-bit multiplies can be issued per cycle.
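For reference, the word-level operation this routine implements (and which the unrolled loop below computes two words at a time) is rp[i] = rp[i] + ap[i]*w + carry across num words, returning the final carry. A minimal C sketch of one iteration, assuming a 64-bit BN_ULONG; the bn_mul_add_words_ref name and the explicit overflow tests are illustrative only, standing in for the four 32x32 XMPYU products and the ADD,DC carry chains used in the assembly:

    /* A C sketch (not OpenSSL's code) of what bn_mul_add_words computes,
       assuming BN_ULONG is a 64-bit type. */
    typedef unsigned long long BN_ULONG;

    BN_ULONG bn_mul_add_words_ref(BN_ULONG *rp, const BN_ULONG *ap,
                                  int num, BN_ULONG w)
    {
        BN_ULONG c = 0;                       /* running carry, the return value */
        BN_ULONG wl = w & 0xffffffffULL, wh = w >> 32;

        while (num-- > 0) {
            BN_ULONG al = *ap & 0xffffffffULL, ah = *ap >> 32;
            BN_ULONG lo  = al * wl;           /* lt_temp */
            BN_ULONG hi  = ah * wh;           /* ht_temp */
            BN_ULONG m1  = ah * wl;           /* m1 */
            BN_ULONG mid = m1 + al * wh;      /* m + m1, may wrap past 2^64 */
            if (mid < m1)                     /* the CMPCLR,*>>= / ADD,L pair */
                hi += 1ULL << 32;             /* the lost bit is worth 2^96 */
            hi += mid >> 32;
            lo += mid << 32;
            if (lo < (mid << 32)) hi++;       /* ADD,DC: fold the carry into hi */
            lo += c;   if (lo < c)   hi++;    /* add the incoming carry */
            lo += *rp; if (lo < *rp) hi++;    /* add rp[i] */
            *rp++ = lo;
            c = hi;                           /* hi becomes the next carry */
            ap++;
        }
        return c;
    }

The mid-term test exists because the sum of the two cross products can itself exceed 64 bits; its overflow bit lands 96 bits up in the full product, which is why the assembly adds 1<<32 to the high word.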
- ; -bn_mul_add_words_unroll2 - - FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) - FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R) - LDD 0(r_ptr),rp_val ; rp[0] - LDD 8(r_ptr),rp_val_1 ; rp[1] - - XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l - XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l - FSTD fm1,-16(%sp) ; -16(sp) = m1[0] - FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1] - - XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h - XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h - FSTD fm,-8(%sp) ; -8(sp) = m[0] - FSTD fm_1,-40(%sp) ; -40(sp) = m[1] - - XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h - XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h - FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp - FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1 - - XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l - XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l - FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp - FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1 - - LDD -8(%sp),m_0 ; m[0] - LDD -40(%sp),m_1 ; m[1] - LDD -16(%sp),m1_0 ; m1[0] - LDD -48(%sp),m1_1 ; m1[1] - - LDD -24(%sp),ht_0 ; ht[0] - LDD -56(%sp),ht_1 ; ht[1] - ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0]; - ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1]; - - LDD -32(%sp),lt_0 - LDD -64(%sp),lt_1 - CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0]) - ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32) - - CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1]) - ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32) - EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32 - DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32 - - EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32 - DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32 - ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32) - ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32) - - ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0]; - ADD,DC ht_0,%r0,ht_0 ; ht[0]++ - ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1]; - ADD,DC ht_1,%r0,ht_1 ; ht[1]++ - - ADD %ret1,lt_0,lt_0 ; lt[0] = lt[0] + c; - ADD,DC ht_0,%r0,ht_0 ; ht[0]++ - ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0] - ADD,DC ht_0,%r0,ht_0 ; ht[0]++ - - LDO -2(num),num ; num = num - 2; - ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c); - ADD,DC ht_1,%r0,ht_1 ; ht[1]++ - STD lt_0,0(r_ptr) ; rp[0] = lt[0] - - ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1] - ADD,DC ht_1,%r0,%ret1 ; ht[1]++ - LDO 16(a_ptr),a_ptr ; a_ptr += 2 - - STD lt_1,8(r_ptr) ; rp[1] = lt[1] - CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do - LDO 16(r_ptr),r_ptr ; r_ptr += 2 - - CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one - - ; - ; Top of loop aligned on 64-byte boundary - ; -bn_mul_add_words_single_top - FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) - LDD 0(r_ptr),rp_val ; rp[0] - LDO 8(a_ptr),a_ptr ; a_ptr++ - XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l - FSTD fm1,-16(%sp) ; -16(sp) = m1 - XMPYU flt_0,fw_h,fm ; m = lt*fw_h - FSTD fm,-8(%sp) ; -8(sp) = m - XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h - FSTD ht_temp,-24(%sp) ; -24(sp) = ht - XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l - FSTD lt_temp,-32(%sp) ; -32(sp) = lt - - LDD -8(%sp),m_0 - LDD -16(%sp),m1_0 ; m1 = temp1 - ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; - LDD -24(%sp),ht_0 - LDD -32(%sp),lt_0 - - CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) - ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) - - EXTRD,U tmp_0,31,32,m_0 ; m>>32 - DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 - - ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) - ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1; - ADD,DC ht_0,%r0,ht_0 ; ht++ - ADD %ret1,tmp_0,lt_0 ; lt = lt + c; - ADD,DC ht_0,%r0,ht_0 ; ht++ - ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0] - ADD,DC 
ht_0,%r0,%ret1 ; ht++ - STD lt_0,0(r_ptr) ; rp[0] = lt - -bn_mul_add_words_exit - .EXIT - - EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 - LDD -80(%sp),%r9 ; restore r9 - LDD -88(%sp),%r8 ; restore r8 - LDD -96(%sp),%r7 ; restore r7 - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 ; restore r3 - .PROCEND ;in=23,24,25,26,29;out=28; - -;---------------------------------------------------------------------------- -; -;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) -; -; arg0 = rp -; arg1 = ap -; arg3 = num -; w on stack at -56(sp) - -bn_mul_words - .proc - .callinfo frame=128 - .entry - .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - NOP - STD %r5,16(%sp) ; save r5 - - STD %r6,24(%sp) ; save r6 - STD %r7,32(%sp) ; save r7 - COPY %r0,%ret1 ; return 0 by default - DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 - - CMPIB,>= 0,num,bn_mul_words_exit - LDO 128(%sp),%sp ; bump stack - - ; - ; See if only 1 word to do, thus just do cleanup - ; - CMPIB,= 1,num,bn_mul_words_single_top - FLDD -184(%sp),fw ; (-56-128) load up w into fw (fw_h/fw_l) - - ; - ; This loop is unrolled 2 times (64-byte aligned as well) - ; - ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus - ; two 32-bit multiplies can be issued per cycle. - ; -bn_mul_words_unroll2 - - FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) - FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R) - XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l - XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l - - FSTD fm1,-16(%sp) ; -16(sp) = m1 - FSTD fm1_1,-48(%sp) ; -48(sp) = m1 - XMPYU flt_0,fw_h,fm ; m = lt*fw_h - XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h - - FSTD fm,-8(%sp) ; -8(sp) = m - FSTD fm_1,-40(%sp) ; -40(sp) = m - XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h - XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h - - FSTD ht_temp,-24(%sp) ; -24(sp) = ht - FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht - XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l - XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l - - FSTD lt_temp,-32(%sp) ; -32(sp) = lt - FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt - LDD -8(%sp),m_0 - LDD -40(%sp),m_1 - - LDD -16(%sp),m1_0 - LDD -48(%sp),m1_1 - LDD -24(%sp),ht_0 - LDD -56(%sp),ht_1 - - ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1; - ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1; - LDD -32(%sp),lt_0 - LDD -64(%sp),lt_1 - - CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1) - ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) - CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1) - ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32) - - EXTRD,U tmp_0,31,32,m_0 ; m>>32 - DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 - EXTRD,U tmp_1,31,32,m_1 ; m>>32 - DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32 - - ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) - ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32) - ADD lt_0,m1_0,lt_0 ; lt = lt+m1; - ADD,DC ht_0,%r0,ht_0 ; ht++ - - ADD lt_1,m1_1,lt_1 ; lt = lt+m1; - ADD,DC ht_1,%r0,ht_1 ; ht++ - ADD %ret1,lt_0,lt_0 ; lt = lt + c (ret1); - ADD,DC ht_0,%r0,ht_0 ; ht++ - - ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0) - ADD,DC ht_1,%r0,ht_1 ; ht++ - STD lt_0,0(r_ptr) ; rp[0] = lt - STD lt_1,8(r_ptr) ; rp[1] = lt - - COPY ht_1,%ret1 ; carry = ht - LDO -2(num),num ; num = num - 2; - LDO 16(a_ptr),a_ptr ; ap += 2 - CMPIB,<= 2,num,bn_mul_words_unroll2 - LDO 16(r_ptr),r_ptr ; rp += 2 - - CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
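bn_mul_words is the same inner multiply without the destination addend: rp[i] receives the low word of ap[i]*w plus the incoming carry, and the high word becomes the next carry. A sketch of the contract, using a compiler-provided 128-bit type for brevity where the assembly builds the product from 32x32 halves as shown above; bn_mul_words_ref is an illustrative name, not OpenSSL's:

    typedef unsigned long long BN_ULONG;

    BN_ULONG bn_mul_words_ref(BN_ULONG *rp, const BN_ULONG *ap,
                              int num, BN_ULONG w)
    {
        BN_ULONG c = 0;
        while (num-- > 0) {
            unsigned __int128 t = (unsigned __int128)*ap++ * w + c;
            *rp++ = (BN_ULONG)t;          /* low 64 bits */
            c = (BN_ULONG)(t >> 64);      /* high 64 bits carry into the next word */
        }
        return c;
    }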
- - ; - ; Top of loop aligned on 64-byte boundary - ; -bn_mul_words_single_top - FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) - - XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l - FSTD fm1,-16(%sp) ; -16(sp) = m1 - XMPYU flt_0,fw_h,fm ; m = lt*fw_h - FSTD fm,-8(%sp) ; -8(sp) = m - XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h - FSTD ht_temp,-24(%sp) ; -24(sp) = ht - XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l - FSTD lt_temp,-32(%sp) ; -32(sp) = lt - - LDD -8(%sp),m_0 - LDD -16(%sp),m1_0 - ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; - LDD -24(%sp),ht_0 - LDD -32(%sp),lt_0 - - CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) - ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) - - EXTRD,U tmp_0,31,32,m_0 ; m>>32 - DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 - - ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) - ADD lt_0,m1_0,lt_0 ; lt= lt+m1; - ADD,DC ht_0,%r0,ht_0 ; ht++ - - ADD %ret1,lt_0,lt_0 ; lt = lt + c; - ADD,DC ht_0,%r0,ht_0 ; ht++ - - COPY ht_0,%ret1 ; copy carry - STD lt_0,0(r_ptr) ; rp[0] = lt - -bn_mul_words_exit - .EXIT - EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 - LDD -96(%sp),%r7 ; restore r7 - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 ; restore r3 - .PROCEND - -;---------------------------------------------------------------------------- -; -;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num) -; -; arg0 = rp -; arg1 = ap -; arg2 = num -; - -bn_sqr_words - .proc - .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .entry - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - NOP - STD %r5,16(%sp) ; save r5 - - CMPIB,>= 0,num,bn_sqr_words_exit - LDO 128(%sp),%sp ; bump stack - - ; - ; If only 1, then go straight to cleanup - ; - CMPIB,= 1,num,bn_sqr_words_single_top - DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L - - ; - ; This loop is unrolled 2 times (64-byte aligned as well) - ; - -bn_sqr_words_unroll2 - FLDD 0(a_ptr),t_float_0 ; a[0] - FLDD 8(a_ptr),t_float_1 ; a[1] - XMPYU fht_0,flt_0,fm ; m[0] - XMPYU fht_1,flt_1,fm_1 ; m[1] - - FSTD fm,-24(%sp) ; store m[0] - FSTD fm_1,-56(%sp) ; store m[1] - XMPYU flt_0,flt_0,lt_temp ; lt[0] - XMPYU flt_1,flt_1,lt_temp_1 ; lt[1] - - FSTD lt_temp,-16(%sp) ; store lt[0] - FSTD lt_temp_1,-48(%sp) ; store lt[1] - XMPYU fht_0,fht_0,ht_temp ; ht[0] - XMPYU fht_1,fht_1,ht_temp_1 ; ht[1] - - FSTD ht_temp,-8(%sp) ; store ht[0] - FSTD ht_temp_1,-40(%sp) ; store ht[1] - LDD -24(%sp),m_0 - LDD -56(%sp),m_1 - - AND m_0,high_mask,tmp_0 ; m[0] & Mask - AND m_1,high_mask,tmp_1 ; m[1] & Mask - DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1 - DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1 - - LDD -16(%sp),lt_0 - LDD -48(%sp),lt_1 - EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1 - EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1 - - LDD -8(%sp),ht_0 - LDD -40(%sp),ht_1 - ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0 - ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1 - - ADD lt_0,m_0,lt_0 ; lt = lt+m - ADD,DC ht_0,%r0,ht_0 ; ht[0]++ - STD lt_0,0(r_ptr) ; rp[0] = lt[0] - STD ht_0,8(r_ptr) ; rp[1] = ht[0] - - ADD lt_1,m_1,lt_1 ; lt = lt+m - ADD,DC ht_1,%r0,ht_1 ; ht[1]++ - STD lt_1,16(r_ptr) ; rp[2] = lt[1] - STD ht_1,24(r_ptr) ; rp[3] = ht[1] - - LDO -2(num),num ; num = num - 2; - LDO 16(a_ptr),a_ptr ; ap += 2 - CMPIB,<= 2,num,bn_sqr_words_unroll2 - LDO 32(r_ptr),r_ptr ; rp += 4 - - CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
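bn_sqr_words squares each input word into two output words, with no carry between words. Squaring needs only three multiplies instead of four because the two cross products are equal; the DEPD,Z m,30,31 / EXTRD,U tmp,32,33 pair above splits the doubled cross term 2*ht*lt*2^32 = m<<33 across the two result words. A C sketch under the same 64-bit BN_ULONG assumption, with the hypothetical _ref naming as before:

    typedef unsigned long long BN_ULONG;

    /* rp must have room for 2*num words */
    void bn_sqr_words_ref(BN_ULONG *rp, const BN_ULONG *ap, int num)
    {
        while (num-- > 0) {
            BN_ULONG a  = *ap++;
            BN_ULONG al = a & 0xffffffffULL, ah = a >> 32;
            BN_ULONG lo = al * al;            /* lt */
            BN_ULONG hi = ah * ah;            /* ht */
            BN_ULONG m  = ah * al;            /* the single cross product */
            hi += m >> 31;                    /* top 33 bits of m<<33 */
            lo += m << 33;                    /* low bits of the doubled term */
            if (lo < (m << 33)) hi++;         /* ADD,DC carry into hi */
            *rp++ = lo;                       /* rp[2i]   = low word of a^2 */
            *rp++ = hi;                       /* rp[2i+1] = high word of a^2 */
        }
    }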
- - ; - ; Top of loop aligned on 64-byte boundary - ; -bn_sqr_words_single_top - FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) - - XMPYU fht_0,flt_0,fm ; m - FSTD fm,-24(%sp) ; store m - - XMPYU flt_0,flt_0,lt_temp ; lt - FSTD lt_temp,-16(%sp) ; store lt - - XMPYU fht_0,fht_0,ht_temp ; ht - FSTD ht_temp,-8(%sp) ; store ht - - LDD -24(%sp),m_0 ; load m - AND m_0,high_mask,tmp_0 ; m & Mask - DEPD,Z m_0,30,31,m_0 ; m << 32+1 - LDD -16(%sp),lt_0 ; lt - - LDD -8(%sp),ht_0 ; ht - EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1 - ADD m_0,lt_0,lt_0 ; lt = lt+m - ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0 - ADD,DC ht_0,%r0,ht_0 ; ht++ - - STD lt_0,0(r_ptr) ; rp[0] = lt - STD ht_0,8(r_ptr) ; rp[1] = ht - -bn_sqr_words_exit - .EXIT - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 - .PROCEND ;in=23,24,25,26,29;out=28; - - -;---------------------------------------------------------------------------- -; -;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) -; -; arg0 = rp -; arg1 = ap -; arg2 = bp -; arg3 = n - -t .reg %r22 -b .reg %r21 -l .reg %r20 - -bn_add_words - .proc - .entry - .callinfo - .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .align 64 - - CMPIB,>= 0,n,bn_add_words_exit - COPY %r0,%ret1 ; return 0 by default - - ; - ; If 2 or more numbers do the loop - ; - CMPIB,= 1,n,bn_add_words_single_top - NOP - - ; - ; This loop is unrolled 2 times (64-byte aligned as well) - ; -bn_add_words_unroll2 - LDD 0(a_ptr),t - LDD 0(b_ptr),b - ADD t,%ret1,t ; t = t+c; - ADD,DC %r0,%r0,%ret1 ; set c to carry - ADD t,b,l ; l = t + b[0] - ADD,DC %ret1,%r0,%ret1 ; c+= carry - STD l,0(r_ptr) - - LDD 8(a_ptr),t - LDD 8(b_ptr),b - ADD t,%ret1,t ; t = t+c; - ADD,DC %r0,%r0,%ret1 ; set c to carry - ADD t,b,l ; l = t + b[0] - ADD,DC %ret1,%r0,%ret1 ; c+= carry - STD l,8(r_ptr) - - LDO -2(n),n - LDO 16(a_ptr),a_ptr - LDO 16(b_ptr),b_ptr - - CMPIB,<= 2,n,bn_add_words_unroll2 - LDO 16(r_ptr),r_ptr - - CMPIB,=,N 0,n,bn_add_words_exit ; are we done? - -bn_add_words_single_top - LDD 0(a_ptr),t - LDD 0(b_ptr),b - - ADD t,%ret1,t ; t = t+c; - ADD,DC %r0,%r0,%ret1 ; set c to carry (could use CMPCLR??) 
- ADD t,b,l ; l = t + b[0] - ADD,DC %ret1,%r0,%ret1 ; c+= carry - STD l,0(r_ptr) - -bn_add_words_exit - .EXIT - BVE (%rp) - EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 - .PROCEND ;in=23,24,25,26,29;out=28; - -;---------------------------------------------------------------------------- -; -;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) -; -; arg0 = rp -; arg1 = ap -; arg2 = bp -; arg3 = n - -t1 .reg %r22 -t2 .reg %r21 -sub_tmp1 .reg %r20 -sub_tmp2 .reg %r19 - - -bn_sub_words - .proc - .callinfo - .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .entry - .align 64 - - CMPIB,>= 0,n,bn_sub_words_exit - COPY %r0,%ret1 ; return 0 by default - - ; - ; If 2 or more numbers do the loop - ; - CMPIB,= 1,n,bn_sub_words_single_top - NOP - - ; - ; This loop is unrolled 2 times (64-byte aligned as well) - ; -bn_sub_words_unroll2 - LDD 0(a_ptr),t1 - LDD 0(b_ptr),t2 - SUB t1,t2,sub_tmp1 ; t3 = t1-t2; - SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c; - - CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 - LDO 1(%r0),sub_tmp2 - - CMPCLR,*= t1,t2,%r0 - COPY sub_tmp2,%ret1 - STD sub_tmp1,0(r_ptr) - - LDD 8(a_ptr),t1 - LDD 8(b_ptr),t2 - SUB t1,t2,sub_tmp1 ; t3 = t1-t2; - SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c; - CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 - LDO 1(%r0),sub_tmp2 - - CMPCLR,*= t1,t2,%r0 - COPY sub_tmp2,%ret1 - STD sub_tmp1,8(r_ptr) - - LDO -2(n),n - LDO 16(a_ptr),a_ptr - LDO 16(b_ptr),b_ptr - - CMPIB,<= 2,n,bn_sub_words_unroll2 - LDO 16(r_ptr),r_ptr - - CMPIB,=,N 0,n,bn_sub_words_exit ; are we done? - -bn_sub_words_single_top - LDD 0(a_ptr),t1 - LDD 0(b_ptr),t2 - SUB t1,t2,sub_tmp1 ; t3 = t1-t2; - SUB sub_tmp1,%ret1,sub_tmp1 ; t3 = t3- c; - CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 - LDO 1(%r0),sub_tmp2 - - CMPCLR,*= t1,t2,%r0 - COPY sub_tmp2,%ret1 - - STD sub_tmp1,0(r_ptr) - -bn_sub_words_exit - .EXIT - BVE (%rp) - EXTRD,U %ret1,31,32,%ret0 ; for 32-bit, return in ret0/ret1 - .PROCEND ;in=23,24,25,26,29;out=28; - -;------------------------------------------------------------------------------ -; -; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d) -; -; arg0 = h -; arg1 = l -; arg2 = d -; -; This is mainly just output from the HP C compiler. 
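The contract here is a two-word by one-word division: return the quotient of the 128-bit value h:l divided by d, with the caller ensuring h < d so the quotient fits in one word. The compiler-generated body that follows normalizes d (via BN_num_bits_word), forms the quotient from 32-bit digits with the $$div2U millicode, and falls into the abort path when the quotient would overflow. A C sketch of the semantics only, not of that algorithm, assuming a compiler 128-bit type and an illustrative _ref name:

    typedef unsigned long long BN_ULONG;

    /* Quotient of (h:l) / d; the caller guarantees h < d, otherwise the
       result does not fit in one word and the assembly below aborts
       with "Division would overflow". */
    BN_ULONG bn_div_words_ref(BN_ULONG h, BN_ULONG l, BN_ULONG d)
    {
        unsigned __int128 n = ((unsigned __int128)h << 64) | l;
        return (BN_ULONG)(n / d);
    }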
-; -;------------------------------------------------------------------------------ -bn_div_words - .PROC - .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN - .IMPORT BN_num_bits_word,CODE - ;--- not PIC .IMPORT __iob,DATA - ;--- not PIC .IMPORT fprintf,CODE - .IMPORT abort,CODE - .IMPORT $$div2U,MILLICODE - .CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE - .ENTRY - STW %r2,-20(%r30) ;offset 0x8ec - STW,MA %r3,192(%r30) ;offset 0x8f0 - STW %r4,-188(%r30) ;offset 0x8f4 - DEPD %r5,31,32,%r6 ;offset 0x8f8 - STD %r6,-184(%r30) ;offset 0x8fc - DEPD %r7,31,32,%r8 ;offset 0x900 - STD %r8,-176(%r30) ;offset 0x904 - STW %r9,-168(%r30) ;offset 0x908 - LDD -248(%r30),%r3 ;offset 0x90c - COPY %r26,%r4 ;offset 0x910 - COPY %r24,%r5 ;offset 0x914 - DEPD %r25,31,32,%r4 ;offset 0x918 - CMPB,*<> %r3,%r0,$0006000C ;offset 0x91c - DEPD %r23,31,32,%r5 ;offset 0x920 - MOVIB,TR -1,%r29,$00060002 ;offset 0x924 - EXTRD,U %r29,31,32,%r28 ;offset 0x928 -$0006002A - LDO -1(%r29),%r29 ;offset 0x92c - SUB %r23,%r7,%r23 ;offset 0x930 -$00060024 - SUB %r4,%r31,%r25 ;offset 0x934 - AND %r25,%r19,%r26 ;offset 0x938 - CMPB,*<>,N %r0,%r26,$00060046 ;offset 0x93c - DEPD,Z %r25,31,32,%r20 ;offset 0x940 - OR %r20,%r24,%r21 ;offset 0x944 - CMPB,*<<,N %r21,%r23,$0006002A ;offset 0x948 - SUB %r31,%r2,%r31 ;offset 0x94c -$00060046 -$0006002E - DEPD,Z %r23,31,32,%r25 ;offset 0x950 - EXTRD,U %r23,31,32,%r26 ;offset 0x954 - AND %r25,%r19,%r24 ;offset 0x958 - ADD,L %r31,%r26,%r31 ;offset 0x95c - CMPCLR,*>>= %r5,%r24,%r0 ;offset 0x960 - LDO 1(%r31),%r31 ;offset 0x964 -$00060032 - CMPB,*<<=,N %r31,%r4,$00060036 ;offset 0x968 - LDO -1(%r29),%r29 ;offset 0x96c - ADD,L %r4,%r3,%r4 ;offset 0x970 -$00060036 - ADDIB,=,N -1,%r8,$D0 ;offset 0x974 - SUB %r5,%r24,%r28 ;offset 0x978 -$0006003A - SUB %r4,%r31,%r24 ;offset 0x97c - SHRPD %r24,%r28,32,%r4 ;offset 0x980 - DEPD,Z %r29,31,32,%r9 ;offset 0x984 - DEPD,Z %r28,31,32,%r5 ;offset 0x988 -$0006001C - EXTRD,U %r4,31,32,%r31 ;offset 0x98c - CMPB,*<>,N %r31,%r2,$00060020 ;offset 0x990 - MOVB,TR %r6,%r29,$D1 ;offset 0x994 - STD %r29,-152(%r30) ;offset 0x998 -$0006000C - EXTRD,U %r3,31,32,%r25 ;offset 0x99c - COPY %r3,%r26 ;offset 0x9a0 - EXTRD,U %r3,31,32,%r9 ;offset 0x9a4 - EXTRD,U %r4,31,32,%r8 ;offset 0x9a8 - .CALL ARGW0=GR,ARGW1=GR,RTNVAL=GR ;in=25,26;out=28; - B,L BN_num_bits_word,%r2 ;offset 0x9ac - EXTRD,U %r5,31,32,%r7 ;offset 0x9b0 - LDI 64,%r20 ;offset 0x9b4 - DEPD %r7,31,32,%r5 ;offset 0x9b8 - DEPD %r8,31,32,%r4 ;offset 0x9bc - DEPD %r9,31,32,%r3 ;offset 0x9c0 - CMPB,= %r28,%r20,$00060012 ;offset 0x9c4 - COPY %r28,%r24 ;offset 0x9c8 - MTSARCM %r24 ;offset 0x9cc - DEPDI,Z -1,%sar,1,%r19 ;offset 0x9d0 - CMPB,*>>,N %r4,%r19,$D2 ;offset 0x9d4 -$00060012 - SUBI 64,%r24,%r31 ;offset 0x9d8 - CMPCLR,*<< %r4,%r3,%r0 ;offset 0x9dc - SUB %r4,%r3,%r4 ;offset 0x9e0 -$00060016 - CMPB,= %r31,%r0,$0006001A ;offset 0x9e4 - COPY %r0,%r9 ;offset 0x9e8 - MTSARCM %r31 ;offset 0x9ec - DEPD,Z %r3,%sar,64,%r3 ;offset 0x9f0 - SUBI 64,%r31,%r26 ;offset 0x9f4 - MTSAR %r26 ;offset 0x9f8 - SHRPD %r4,%r5,%sar,%r4 ;offset 0x9fc - MTSARCM %r31 ;offset 0xa00 - DEPD,Z %r5,%sar,64,%r5 ;offset 0xa04 -$0006001A - DEPDI,Z -1,31,32,%r19 ;offset 0xa08 - AND %r3,%r19,%r29 ;offset 0xa0c - EXTRD,U %r29,31,32,%r2 ;offset 0xa10 - DEPDI,Z -1,63,32,%r6 ;offset 0xa14 - MOVIB,TR 2,%r8,$0006001C ;offset 0xa18 - EXTRD,U %r3,63,32,%r7 ;offset 0xa1c -$D2 - ;--- not PIC ADDIL LR'__iob-$global$,%r27,%r1 ;offset 0xa20 - ;--- not PIC LDIL LR'C$7,%r21 ;offset 0xa24 - ;--- 
not PIC LDO RR'__iob-$global$+32(%r1),%r26 ;offset 0xa28 - ;--- not PIC .CALL ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR ;in=24,25,26;out=28; - ;--- not PIC B,L fprintf,%r2 ;offset 0xa2c - ;--- not PIC LDO RR'C$7(%r21),%r25 ;offset 0xa30 - .CALL ; - B,L abort,%r2 ;offset 0xa34 - NOP ;offset 0xa38 - B $D3 ;offset 0xa3c - LDW -212(%r30),%r2 ;offset 0xa40 -$00060020 - COPY %r4,%r26 ;offset 0xa44 - EXTRD,U %r4,31,32,%r25 ;offset 0xa48 - COPY %r2,%r24 ;offset 0xa4c - .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL) - B,L $$div2U,%r31 ;offset 0xa50 - EXTRD,U %r2,31,32,%r23 ;offset 0xa54 - DEPD %r28,31,32,%r29 ;offset 0xa58 -$00060022 - STD %r29,-152(%r30) ;offset 0xa5c -$D1 - AND %r5,%r19,%r24 ;offset 0xa60 - EXTRD,U %r24,31,32,%r24 ;offset 0xa64 - STW %r2,-160(%r30) ;offset 0xa68 - STW %r7,-128(%r30) ;offset 0xa6c - FLDD -152(%r30),%fr4 ;offset 0xa70 - FLDD -152(%r30),%fr7 ;offset 0xa74 - FLDW -160(%r30),%fr8L ;offset 0xa78 - FLDW -128(%r30),%fr5L ;offset 0xa7c - XMPYU %fr8L,%fr7L,%fr10 ;offset 0xa80 - FSTD %fr10,-136(%r30) ;offset 0xa84 - XMPYU %fr8L,%fr7R,%fr22 ;offset 0xa88 - FSTD %fr22,-144(%r30) ;offset 0xa8c - XMPYU %fr5L,%fr4L,%fr11 ;offset 0xa90 - XMPYU %fr5L,%fr4R,%fr23 ;offset 0xa94 - FSTD %fr11,-112(%r30) ;offset 0xa98 - FSTD %fr23,-120(%r30) ;offset 0xa9c - LDD -136(%r30),%r28 ;offset 0xaa0 - DEPD,Z %r28,31,32,%r31 ;offset 0xaa4 - LDD -144(%r30),%r20 ;offset 0xaa8 - ADD,L %r20,%r31,%r31 ;offset 0xaac - LDD -112(%r30),%r22 ;offset 0xab0 - DEPD,Z %r22,31,32,%r22 ;offset 0xab4 - LDD -120(%r30),%r21 ;offset 0xab8 - B $00060024 ;offset 0xabc - ADD,L %r21,%r22,%r23 ;offset 0xac0 -$D0 - OR %r9,%r29,%r29 ;offset 0xac4 -$00060040 - EXTRD,U %r29,31,32,%r28 ;offset 0xac8 -$00060002 -$L2 - LDW -212(%r30),%r2 ;offset 0xacc -$D3 - LDW -168(%r30),%r9 ;offset 0xad0 - LDD -176(%r30),%r8 ;offset 0xad4 - EXTRD,U %r8,31,32,%r7 ;offset 0xad8 - LDD -184(%r30),%r6 ;offset 0xadc - EXTRD,U %r6,31,32,%r5 ;offset 0xae0 - LDW -188(%r30),%r4 ;offset 0xae4 - BVE (%r2) ;offset 0xae8 - .EXIT - LDW,MB -192(%r30),%r3 ;offset 0xaec - .PROCEND ;in=23,25;out=28,29;fpin=105,107; - - - - -;---------------------------------------------------------------------------- -; -; Registers to hold 64-bit values to manipulate. The "L" part -; of the register corresponds to the upper 32-bits, while the "R" -; part corresponds to the lower 32-bits -; -; Note, that when using b6 and b7, the code must save these before -; using them because they are callee save registers -; -; -; Floating point registers to use to save values that -; are manipulated. These don't collide with ftemp1-6 and -; are all caller save registers -; -a0 .reg %fr22 -a0L .reg %fr22L -a0R .reg %fr22R - -a1 .reg %fr23 -a1L .reg %fr23L -a1R .reg %fr23R - -a2 .reg %fr24 -a2L .reg %fr24L -a2R .reg %fr24R - -a3 .reg %fr25 -a3L .reg %fr25L -a3R .reg %fr25R - -a4 .reg %fr26 -a4L .reg %fr26L -a4R .reg %fr26R - -a5 .reg %fr27 -a5L .reg %fr27L -a5R .reg %fr27R - -a6 .reg %fr28 -a6L .reg %fr28L -a6R .reg %fr28R - -a7 .reg %fr29 -a7L .reg %fr29L -a7R .reg %fr29R - -b0 .reg %fr30 -b0L .reg %fr30L -b0R .reg %fr30R - -b1 .reg %fr31 -b1L .reg %fr31L -b1R .reg %fr31R - -; -; Temporary floating point variables, these are all caller save -; registers -; -ftemp1 .reg %fr4 -ftemp2 .reg %fr5 -ftemp3 .reg %fr6 -ftemp4 .reg %fr7 - -; -; The B set of registers when used. 
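Ahead of the register definitions and the SQR_ADD_C/SQR_ADD_C2 macros that follow, it may help to see what those macros compute: they are the comba column primitives, accumulating one product (or one doubled cross product) into a three-word column accumulator (C1,C2,C3), where the column value is C1 + C2*2^64 + C3*2^128. A C sketch under the same assumptions and _ref naming as the earlier sketches:

    typedef unsigned long long BN_ULONG;

    /* (c1,c2,c3) += a*a  -- what the SQR_ADD_C macro below does */
    static void sqr_add_c_ref(BN_ULONG a,
                              BN_ULONG *c1, BN_ULONG *c2, BN_ULONG *c3)
    {
        unsigned __int128 t = (unsigned __int128)a * a;
        BN_ULONG lo = (BN_ULONG)t, hi = (BN_ULONG)(t >> 64);
        *c1 += lo; if (*c1 < lo) hi++;        /* hi <= 2^64-3, cannot wrap here */
        *c2 += hi; if (*c2 < hi) (*c3)++;
    }

    /* (c1,c2,c3) += 2*a*b -- what the SQR_ADD_C2 macro below does; the
       doubling can carry out of the 128-bit product, hence the extra
       c3 bumps (the ADD,DC,*NUV / LDO 1(C3) trick in the assembly) */
    static void sqr_add_c2_ref(BN_ULONG a, BN_ULONG b,
                               BN_ULONG *c1, BN_ULONG *c2, BN_ULONG *c3)
    {
        unsigned __int128 t = (unsigned __int128)a * b;
        BN_ULONG lo = (BN_ULONG)t, hi = (BN_ULONG)(t >> 64);
        if (hi & (1ULL << 63)) (*c3)++;       /* bit lost by doubling hi */
        hi = (hi << 1) | (lo >> 63);          /* double the 128-bit product */
        lo <<= 1;
        *c1 += lo; if (*c1 < lo && ++hi == 0) (*c3)++;
        *c2 += hi; if (*c2 < hi) (*c3)++;
    }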
-; - -b2 .reg %fr8 -b2L .reg %fr8L -b2R .reg %fr8R - -b3 .reg %fr9 -b3L .reg %fr9L -b3R .reg %fr9R - -b4 .reg %fr10 -b4L .reg %fr10L -b4R .reg %fr10R - -b5 .reg %fr11 -b5L .reg %fr11L -b5R .reg %fr11R - -b6 .reg %fr12 -b6L .reg %fr12L -b6R .reg %fr12R - -b7 .reg %fr13 -b7L .reg %fr13L -b7R .reg %fr13R - -c1 .reg %r21 ; only reg -temp1 .reg %r20 ; only reg -temp2 .reg %r19 ; only reg -temp3 .reg %r31 ; only reg - -m1 .reg %r28 -c2 .reg %r23 -high_one .reg %r1 -ht .reg %r6 -lt .reg %r5 -m .reg %r4 -c3 .reg %r3 - -SQR_ADD_C .macro A0L,A0R,C1,C2,C3 - XMPYU A0L,A0R,ftemp1 ; m - FSTD ftemp1,-24(%sp) ; store m - - XMPYU A0R,A0R,ftemp2 ; lt - FSTD ftemp2,-16(%sp) ; store lt - - XMPYU A0L,A0L,ftemp3 ; ht - FSTD ftemp3,-8(%sp) ; store ht - - LDD -24(%sp),m ; load m - AND m,high_mask,temp2 ; m & Mask - DEPD,Z m,30,31,temp3 ; m << 32+1 - LDD -16(%sp),lt ; lt - - LDD -8(%sp),ht ; ht - EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1 - ADD temp3,lt,lt ; lt = lt+m - ADD,L ht,temp1,ht ; ht += temp1 - ADD,DC ht,%r0,ht ; ht++ - - ADD C1,lt,C1 ; c1=c1+lt - ADD,DC ht,%r0,ht ; ht++ - - ADD C2,ht,C2 ; c2=c2+ht - ADD,DC C3,%r0,C3 ; c3++ -.endm - -SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3 - XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht - FSTD ftemp1,-16(%sp) ; - XMPYU A0R,A1L,ftemp2 ; m = bh*lt - FSTD ftemp2,-8(%sp) ; - XMPYU A0R,A1R,ftemp3 ; lt = bl*lt - FSTD ftemp3,-32(%sp) - XMPYU A0L,A1L,ftemp4 ; ht = bh*ht - FSTD ftemp4,-24(%sp) ; - - LDD -8(%sp),m ; r21 = m - LDD -16(%sp),m1 ; r19 = m1 - ADD,L m,m1,m ; m+m1 - - DEPD,Z m,31,32,temp3 ; (m+m1<<32) - LDD -24(%sp),ht ; r24 = ht - - CMPCLR,*>>= m,m1,%r0 ; if (m < m1) - ADD,L ht,high_one,ht ; ht+=high_one - - EXTRD,U m,31,32,temp1 ; m >> 32 - LDD -32(%sp),lt ; lt - ADD,L ht,temp1,ht ; ht+= m>>32 - ADD lt,temp3,lt ; lt = lt+m1 - ADD,DC ht,%r0,ht ; ht++ - - ADD ht,ht,ht ; ht=ht+ht; - ADD,DC C3,%r0,C3 ; add in carry (c3++) - - ADD lt,lt,lt ; lt=lt+lt; - ADD,DC ht,%r0,ht ; add in carry (ht++) - - ADD C1,lt,C1 ; c1=c1+lt - ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++) - LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise - - ADD C2,ht,C2 ; c2 = c2 + ht - ADD,DC C3,%r0,C3 ; add in carry (c3++) -.endm - -; -;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) -; arg0 = r_ptr -; arg1 = a_ptr -; - -bn_sqr_comba8 - .PROC - .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .ENTRY - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - STD %r5,16(%sp) ; save r5 - STD %r6,24(%sp) ; save r6 - - ; - ; Zero out carries - ; - COPY %r0,c1 - COPY %r0,c2 - COPY %r0,c3 - - LDO 128(%sp),%sp ; bump stack - DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L - DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 - - ; - ; Load up all of the values we are going to use - ; - FLDD 0(a_ptr),a0 - FLDD 8(a_ptr),a1 - FLDD 16(a_ptr),a2 - FLDD 24(a_ptr),a3 - FLDD 32(a_ptr),a4 - FLDD 40(a_ptr),a5 - FLDD 48(a_ptr),a6 - FLDD 56(a_ptr),a7 - - SQR_ADD_C a0L,a0R,c1,c2,c3 - STD c1,0(r_ptr) ; r[0] = c1; - COPY %r0,c1 - - SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 - STD c2,8(r_ptr) ; r[1] = c2; - COPY %r0,c2 - - SQR_ADD_C a1L,a1R,c3,c1,c2 - SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 - STD c3,16(r_ptr) ; r[2] = c3; - COPY %r0,c3 - - SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 - SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 - STD c1,24(r_ptr) ; r[3] = c1; - COPY %r0,c1 - - SQR_ADD_C a2L,a2R,c2,c3,c1 - SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 - SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1 - STD c2,32(r_ptr) ; r[4] = c2; - COPY %r0,c2 - - SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2 - 
SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2 - SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 - STD c3,40(r_ptr) ; r[5] = c3; - COPY %r0,c3 - - SQR_ADD_C a3L,a3R,c1,c2,c3 - SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3 - SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3 - SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3 - STD c1,48(r_ptr) ; r[6] = c1; - COPY %r0,c1 - - SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1 - SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1 - SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1 - SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1 - STD c2,56(r_ptr) ; r[7] = c2; - COPY %r0,c2 - - SQR_ADD_C a4L,a4R,c3,c1,c2 - SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2 - SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2 - SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2 - STD c3,64(r_ptr) ; r[8] = c3; - COPY %r0,c3 - - SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3 - SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3 - SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3 - STD c1,72(r_ptr) ; r[9] = c1; - COPY %r0,c1 - - SQR_ADD_C a5L,a5R,c2,c3,c1 - SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1 - SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1 - STD c2,80(r_ptr) ; r[10] = c2; - COPY %r0,c2 - - SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2 - SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2 - STD c3,88(r_ptr) ; r[11] = c3; - COPY %r0,c3 - - SQR_ADD_C a6L,a6R,c1,c2,c3 - SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3 - STD c1,96(r_ptr) ; r[12] = c1; - COPY %r0,c1 - - SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1 - STD c2,104(r_ptr) ; r[13] = c2; - COPY %r0,c2 - - SQR_ADD_C a7L,a7R,c3,c1,c2 - STD c3, 112(r_ptr) ; r[14] = c3 - STD c1, 120(r_ptr) ; r[15] = c1 - - .EXIT - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 - - .PROCEND - -;----------------------------------------------------------------------------- -; -;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) -; arg0 = r_ptr -; arg1 = a_ptr -; - -bn_sqr_comba4 - .proc - .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .entry - .align 64 - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - STD %r5,16(%sp) ; save r5 - STD %r6,24(%sp) ; save r6 - - ; - ; Zero out carries - ; - COPY %r0,c1 - COPY %r0,c2 - COPY %r0,c3 - - LDO 128(%sp),%sp ; bump stack - DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L - DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 - - ; - ; Load up all of the values we are going to use - ; - FLDD 0(a_ptr),a0 - FLDD 8(a_ptr),a1 - FLDD 16(a_ptr),a2 - FLDD 24(a_ptr),a3 - FLDD 32(a_ptr),a4 - FLDD 40(a_ptr),a5 - FLDD 48(a_ptr),a6 - FLDD 56(a_ptr),a7 - - SQR_ADD_C a0L,a0R,c1,c2,c3 - - STD c1,0(r_ptr) ; r[0] = c1; - COPY %r0,c1 - - SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 - - STD c2,8(r_ptr) ; r[1] = c2; - COPY %r0,c2 - - SQR_ADD_C a1L,a1R,c3,c1,c2 - SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 - - STD c3,16(r_ptr) ; r[2] = c3; - COPY %r0,c3 - - SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 - SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 - - STD c1,24(r_ptr) ; r[3] = c1; - COPY %r0,c1 - - SQR_ADD_C a2L,a2R,c2,c3,c1 - SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 - - STD c2,32(r_ptr) ; r[4] = c2; - COPY %r0,c2 - - SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 - STD c3,40(r_ptr) ; r[5] = c3; - COPY %r0,c3 - - SQR_ADD_C a3L,a3R,c1,c2,c3 - STD c1,48(r_ptr) ; r[6] = c1; - STD c2,56(r_ptr) ; r[7] = c2; - - .EXIT - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 - - .PROCEND - - -;--------------------------------------------------------------------------- - -MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3 - XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht - FSTD ftemp1,-16(%sp) ; - XMPYU 
A0R,B0L,ftemp2 ; m = bh*lt - FSTD ftemp2,-8(%sp) ; - XMPYU A0R,B0R,ftemp3 ; lt = bl*lt - FSTD ftemp3,-32(%sp) - XMPYU A0L,B0L,ftemp4 ; ht = bh*ht - FSTD ftemp4,-24(%sp) ; - - LDD -8(%sp),m ; r21 = m - LDD -16(%sp),m1 ; r19 = m1 - ADD,L m,m1,m ; m+m1 - - DEPD,Z m,31,32,temp3 ; (m+m1<<32) - LDD -24(%sp),ht ; r24 = ht - - CMPCLR,*>>= m,m1,%r0 ; if (m < m1) - ADD,L ht,high_one,ht ; ht+=high_one - - EXTRD,U m,31,32,temp1 ; m >> 32 - LDD -32(%sp),lt ; lt - ADD,L ht,temp1,ht ; ht+= m>>32 - ADD lt,temp3,lt ; lt = lt+m1 - ADD,DC ht,%r0,ht ; ht++ - - ADD C1,lt,C1 ; c1=c1+lt - ADD,DC ht,%r0,ht ; bump c3 if overflow,nullify otherwise - - ADD C2,ht,C2 ; c2 = c2 + ht - ADD,DC C3,%r0,C3 ; add in carry (c3++) -.endm - - -; -;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) -; arg0 = r_ptr -; arg1 = a_ptr -; arg2 = b_ptr -; - -bn_mul_comba8 - .proc - .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .entry - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - STD %r5,16(%sp) ; save r5 - STD %r6,24(%sp) ; save r6 - FSTD %fr12,32(%sp) ; save r6 - FSTD %fr13,40(%sp) ; save r7 - - ; - ; Zero out carries - ; - COPY %r0,c1 - COPY %r0,c2 - COPY %r0,c3 - - LDO 128(%sp),%sp ; bump stack - DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 - - ; - ; Load up all of the values we are going to use - ; - FLDD 0(a_ptr),a0 - FLDD 8(a_ptr),a1 - FLDD 16(a_ptr),a2 - FLDD 24(a_ptr),a3 - FLDD 32(a_ptr),a4 - FLDD 40(a_ptr),a5 - FLDD 48(a_ptr),a6 - FLDD 56(a_ptr),a7 - - FLDD 0(b_ptr),b0 - FLDD 8(b_ptr),b1 - FLDD 16(b_ptr),b2 - FLDD 24(b_ptr),b3 - FLDD 32(b_ptr),b4 - FLDD 40(b_ptr),b5 - FLDD 48(b_ptr),b6 - FLDD 56(b_ptr),b7 - - MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 - STD c1,0(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 - MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 - STD c2,8(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 - MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 - MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 - STD c3,16(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 - MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 - MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 - MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 - STD c1,24(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1 - MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 - MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 - MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 - MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1 - STD c2,32(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2 - MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2 - MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 - MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 - MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2 - MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2 - STD c3,40(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3 - MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3 - MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3 - MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 - MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3 - MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3 - MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3 - STD c1,48(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1 - MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1 - MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1 - MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1 - MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1 - MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1 - MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1 - MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1 - STD c2,56(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2 - MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2 - MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2 - MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2 - MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2 - MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2 - 
MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2 - STD c3,64(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3 - MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3 - MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3 - MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3 - MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3 - MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3 - STD c1,72(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1 - MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1 - MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1 - MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1 - MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1 - STD c2,80(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2 - MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2 - MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2 - MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2 - STD c3,88(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3 - MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3 - MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3 - STD c1,96(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1 - MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1 - STD c2,104(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2 - STD c3,112(r_ptr) - STD c1,120(r_ptr) - - .EXIT - FLDD -88(%sp),%fr13 - FLDD -96(%sp),%fr12 - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 - - .PROCEND - -;----------------------------------------------------------------------------- -; -;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) -; arg0 = r_ptr -; arg1 = a_ptr -; arg2 = b_ptr -; - -bn_mul_comba4 - .proc - .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .entry - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - STD %r5,16(%sp) ; save r5 - STD %r6,24(%sp) ; save r6 - FSTD %fr12,32(%sp) ; save r6 - FSTD %fr13,40(%sp) ; save r7 - - ; - ; Zero out carries - ; - COPY %r0,c1 - COPY %r0,c2 - COPY %r0,c3 - - LDO 128(%sp),%sp ; bump stack - DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 - - ; - ; Load up all of the values we are going to use - ; - FLDD 0(a_ptr),a0 - FLDD 8(a_ptr),a1 - FLDD 16(a_ptr),a2 - FLDD 24(a_ptr),a3 - - FLDD 0(b_ptr),b0 - FLDD 8(b_ptr),b1 - FLDD 16(b_ptr),b2 - FLDD 24(b_ptr),b3 - - MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 - STD c1,0(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 - MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 - STD c2,8(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 - MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 - MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 - STD c3,16(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 - MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 - MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 - MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 - STD c1,24(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 - MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 - MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 - STD c2,32(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 - MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 - STD c3,40(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 - STD c1,48(r_ptr) - STD c2,56(r_ptr) - - .EXIT - FLDD -88(%sp),%fr13 - FLDD -96(%sp),%fr12 - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 - - .PROCEND - - -;--- not PIC .SPACE $TEXT$ -;--- not PIC .SUBSPA $CODE$ -;--- not PIC .SPACE $PRIVATE$,SORT=16 -;--- not PIC .IMPORT $global$,DATA -;--- not PIC .SPACE $TEXT$ -;--- not PIC .SUBSPA $CODE$ -;--- not PIC .SUBSPA $LIT$,ACCESS=0x2c -;--- not PIC C$7 -;--- not PIC .ALIGN 8 -;--- not PIC .STRINGZ "Division would overflow (%d)\n" - 
.END diff --git a/crypto/bn/asm/pa-risc2W.s b/crypto/bn/asm/pa-risc2W.s deleted file mode 100644 index a99545754d18..000000000000 --- a/crypto/bn/asm/pa-risc2W.s +++ /dev/null @@ -1,1605 +0,0 @@ -; -; PA-RISC 64-bit implementation of bn_asm code -; -; This code is approximately 2x faster than the C version -; for RSA/DSA. -; -; See http://devresource.hp.com/ for more details on the PA-RISC -; architecture. Also see the book "PA-RISC 2.0 Architecture" -; by Gerry Kane for information on the instruction set architecture. -; -; Code written by Chris Ruemmler (with some help from the HP C -; compiler). -; -; The code compiles with HP's assembler -; - - .level 2.0W - .space $TEXT$ - .subspa $CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY - -; -; Global Register definitions used for the routines. -; -; Some information about HP's runtime architecture for 64-bits. -; -; "Caller save" means the calling function must save the register -; if it wants the register to be preserved. -; "Callee save" means if a function uses the register, it must save -; the value before using it. -; -; For the floating point registers -; -; "caller save" registers: fr4-fr11, fr22-fr31 -; "callee save" registers: fr12-fr21 -; "special" registers: fr0-fr3 (status and exception registers) -; -; For the integer registers -; value zero : r0 -; "caller save" registers: r1,r19-r26 -; "callee save" registers: r3-r18 -; return register : r2 (rp) -; return values ; r28 (ret0,ret1) -; Stack pointer ; r30 (sp) -; global data pointer ; r27 (dp) -; argument pointer ; r29 (ap) -; millicode return ptr ; r31 (also a caller save register) - - -; -; Arguments to the routines -; -r_ptr .reg %r26 -a_ptr .reg %r25 -b_ptr .reg %r24 -num .reg %r24 -w .reg %r23 -n .reg %r23 - - -; -; Globals used in some routines -; - -top_overflow .reg %r29 -high_mask .reg %r22 ; value 0xffffffff80000000L - - -;------------------------------------------------------------------------------ -; -; bn_mul_add_words -; -;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr, -; int num, BN_ULONG w) -; -; arg0 = r_ptr -; arg1 = a_ptr -; arg2 = num -; arg3 = w -; -; Local register definitions -; - -fm1 .reg %fr22 -fm .reg %fr23 -ht_temp .reg %fr24 -ht_temp_1 .reg %fr25 -lt_temp .reg %fr26 -lt_temp_1 .reg %fr27 -fm1_1 .reg %fr28 -fm_1 .reg %fr29 - -fw_h .reg %fr7L -fw_l .reg %fr7R -fw .reg %fr7 - -fht_0 .reg %fr8L -flt_0 .reg %fr8R -t_float_0 .reg %fr8 - -fht_1 .reg %fr9L -flt_1 .reg %fr9R -t_float_1 .reg %fr9 - -tmp_0 .reg %r31 -tmp_1 .reg %r21 -m_0 .reg %r20 -m_1 .reg %r19 -ht_0 .reg %r1 -ht_1 .reg %r3 -lt_0 .reg %r4 -lt_1 .reg %r5 -m1_0 .reg %r6 -m1_1 .reg %r7 -rp_val .reg %r8 -rp_val_1 .reg %r9 - -bn_mul_add_words - .export bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN - .proc - .callinfo frame=128 - .entry - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - NOP ; Needed to make the loop 16-byte aligned - NOP ; Needed to make the loop 16-byte aligned - - STD %r5,16(%sp) ; save r5 - STD %r6,24(%sp) ; save r6 - STD %r7,32(%sp) ; save r7 - STD %r8,40(%sp) ; save r8 - - STD %r9,48(%sp) ; save r9 - COPY %r0,%ret0 ; return 0 by default - DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 - STD w,56(%sp) ; store w on stack - - CMPIB,>= 0,num,bn_mul_add_words_exit ; if (num <= 0) then exit - LDO 128(%sp),%sp ; bump stack - - ; - ; The loop is unrolled twice, so if there is only 1 number - ; then go straight to the cleanup code. 
- ; - CMPIB,= 1,num,bn_mul_add_words_single_top - FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l) - - ; - ; This loop is unrolled 2 times (64-byte aligned as well) - ; - ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus - ; two 32-bit multiplies can be issued per cycle. - ; -bn_mul_add_words_unroll2 - - FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) - FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R) - LDD 0(r_ptr),rp_val ; rp[0] - LDD 8(r_ptr),rp_val_1 ; rp[1] - - XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l - XMPYU fht_1,fw_l,fm1_1 ; m1[1] = fht_1*fw_l - FSTD fm1,-16(%sp) ; -16(sp) = m1[0] - FSTD fm1_1,-48(%sp) ; -48(sp) = m1[1] - - XMPYU flt_0,fw_h,fm ; m[0] = flt_0*fw_h - XMPYU flt_1,fw_h,fm_1 ; m[1] = flt_1*fw_h - FSTD fm,-8(%sp) ; -8(sp) = m[0] - FSTD fm_1,-40(%sp) ; -40(sp) = m[1] - - XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h - XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp_1 = fht_1*fw_h - FSTD ht_temp,-24(%sp) ; -24(sp) = ht_temp - FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht_temp_1 - - XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l - XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l - FSTD lt_temp,-32(%sp) ; -32(sp) = lt_temp - FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt_temp_1 - - LDD -8(%sp),m_0 ; m[0] - LDD -40(%sp),m_1 ; m[1] - LDD -16(%sp),m1_0 ; m1[0] - LDD -48(%sp),m1_1 ; m1[1] - - LDD -24(%sp),ht_0 ; ht[0] - LDD -56(%sp),ht_1 ; ht[1] - ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m[0] + m1[0]; - ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m[1] + m1[1]; - - LDD -32(%sp),lt_0 - LDD -64(%sp),lt_1 - CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m[0] < m1[0]) - ADD,L ht_0,top_overflow,ht_0 ; ht[0] += (1<<32) - - CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m[1] < m1[1]) - ADD,L ht_1,top_overflow,ht_1 ; ht[1] += (1<<32) - EXTRD,U tmp_0,31,32,m_0 ; m[0]>>32 - DEPD,Z tmp_0,31,32,m1_0 ; m1[0] = m[0]<<32 - - EXTRD,U tmp_1,31,32,m_1 ; m[1]>>32 - DEPD,Z tmp_1,31,32,m1_1 ; m1[1] = m[1]<<32 - ADD,L ht_0,m_0,ht_0 ; ht[0]+= (m[0]>>32) - ADD,L ht_1,m_1,ht_1 ; ht[1]+= (m[1]>>32) - - ADD lt_0,m1_0,lt_0 ; lt[0] = lt[0]+m1[0]; - ADD,DC ht_0,%r0,ht_0 ; ht[0]++ - ADD lt_1,m1_1,lt_1 ; lt[1] = lt[1]+m1[1]; - ADD,DC ht_1,%r0,ht_1 ; ht[1]++ - - ADD %ret0,lt_0,lt_0 ; lt[0] = lt[0] + c; - ADD,DC ht_0,%r0,ht_0 ; ht[0]++ - ADD lt_0,rp_val,lt_0 ; lt[0] = lt[0]+rp[0] - ADD,DC ht_0,%r0,ht_0 ; ht[0]++ - - LDO -2(num),num ; num = num - 2; - ADD ht_0,lt_1,lt_1 ; lt[1] = lt[1] + ht_0 (c); - ADD,DC ht_1,%r0,ht_1 ; ht[1]++ - STD lt_0,0(r_ptr) ; rp[0] = lt[0] - - ADD lt_1,rp_val_1,lt_1 ; lt[1] = lt[1]+rp[1] - ADD,DC ht_1,%r0,%ret0 ; ht[1]++ - LDO 16(a_ptr),a_ptr ; a_ptr += 2 - - STD lt_1,8(r_ptr) ; rp[1] = lt[1] - CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do - LDO 16(r_ptr),r_ptr ; r_ptr += 2 - - CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one - - ; - ; Top of loop aligned on 64-byte boundary - ; -bn_mul_add_words_single_top - FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) - LDD 0(r_ptr),rp_val ; rp[0] - LDO 8(a_ptr),a_ptr ; a_ptr++ - XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l - FSTD fm1,-16(%sp) ; -16(sp) = m1 - XMPYU flt_0,fw_h,fm ; m = lt*fw_h - FSTD fm,-8(%sp) ; -8(sp) = m - XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h - FSTD ht_temp,-24(%sp) ; -24(sp) = ht - XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l - FSTD lt_temp,-32(%sp) ; -32(sp) = lt - - LDD -8(%sp),m_0 - LDD -16(%sp),m1_0 ; m1 = temp1 - ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; - LDD -24(%sp),ht_0 - LDD -32(%sp),lt_0 - - CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) - ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) - - EXTRD,U tmp_0,31,32,m_0 ; m>>32 - DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 - - ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) - ADD lt_0,m1_0,tmp_0 ; tmp_0 = lt+m1; - ADD,DC ht_0,%r0,ht_0 ; ht++ - ADD %ret0,tmp_0,lt_0 ; lt = lt + c; - ADD,DC ht_0,%r0,ht_0 ; ht++ - ADD lt_0,rp_val,lt_0 ; lt = lt+rp[0] - ADD,DC ht_0,%r0,%ret0 ; ht++ - STD lt_0,0(r_ptr) ; rp[0] = lt - -bn_mul_add_words_exit - .EXIT - LDD -80(%sp),%r9 ; restore r9 - LDD -88(%sp),%r8 ; restore r8 - LDD -96(%sp),%r7 ; restore r7 - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 ; restore r3 - .PROCEND ;in=23,24,25,26,29;out=28; - -;---------------------------------------------------------------------------- -; -;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) -; -; arg0 = rp -; arg1 = ap -; arg2 = num -; arg3 = w - -bn_mul_words - .proc - .callinfo frame=128 - .entry - .EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - STD %r5,16(%sp) ; save r5 - STD %r6,24(%sp) ; save r6 - - STD %r7,32(%sp) ; save r7 - COPY %r0,%ret0 ; return 0 by default - DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32 - STD w,56(%sp) ; w on stack - - CMPIB,>= 0,num,bn_mul_words_exit - LDO 128(%sp),%sp ; bump stack - - ; - ; See if only 1 word to do, thus just do cleanup - ; - CMPIB,= 1,num,bn_mul_words_single_top - FLDD -72(%sp),fw ; load up w into fp register fw (fw_h/fw_l) - - ; - ; This loop is unrolled 2 times (64-byte aligned as well) - ; - ; PA-RISC 2.0 chips have two fully pipelined multipliers, thus - ; two 32-bit multiplies can be issued per cycle. - ; -bn_mul_words_unroll2 - - FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) - FLDD 8(a_ptr),t_float_1 ; load up 64-bit value (fr8L) ht(L)/lt(R) - XMPYU fht_0,fw_l,fm1 ; m1[0] = fht_0*fw_l - XMPYU fht_1,fw_l,fm1_1 ; m1[1] = ht*fw_l - - FSTD fm1,-16(%sp) ; -16(sp) = m1 - FSTD fm1_1,-48(%sp) ; -48(sp) = m1 - XMPYU flt_0,fw_h,fm ; m = lt*fw_h - XMPYU flt_1,fw_h,fm_1 ; m = lt*fw_h - - FSTD fm,-8(%sp) ; -8(sp) = m - FSTD fm_1,-40(%sp) ; -40(sp) = m - XMPYU fht_0,fw_h,ht_temp ; ht_temp = fht_0*fw_h - XMPYU fht_1,fw_h,ht_temp_1 ; ht_temp = ht*fw_h - - FSTD ht_temp,-24(%sp) ; -24(sp) = ht - FSTD ht_temp_1,-56(%sp) ; -56(sp) = ht - XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l - XMPYU flt_1,fw_l,lt_temp_1 ; lt_temp = lt*fw_l - - FSTD lt_temp,-32(%sp) ; -32(sp) = lt - FSTD lt_temp_1,-64(%sp) ; -64(sp) = lt - LDD -8(%sp),m_0 - LDD -40(%sp),m_1 - - LDD -16(%sp),m1_0 - LDD -48(%sp),m1_1 - LDD -24(%sp),ht_0 - LDD -56(%sp),ht_1 - - ADD,L m1_0,m_0,tmp_0 ; tmp_0 = m + m1; - ADD,L m1_1,m_1,tmp_1 ; tmp_1 = m + m1; - LDD -32(%sp),lt_0 - LDD -64(%sp),lt_1 - - CMPCLR,*>>= tmp_0,m1_0, %r0 ; if (m < m1) - ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) - CMPCLR,*>>= tmp_1,m1_1,%r0 ; if (m < m1) - ADD,L ht_1,top_overflow,ht_1 ; ht += (1<<32) - - EXTRD,U tmp_0,31,32,m_0 ; m>>32 - DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 - EXTRD,U tmp_1,31,32,m_1 ; m>>32 - DEPD,Z tmp_1,31,32,m1_1 ; m1 = m<<32 - - ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) - ADD,L ht_1,m_1,ht_1 ; ht+= (m>>32) - ADD lt_0,m1_0,lt_0 ; lt = lt+m1; - ADD,DC ht_0,%r0,ht_0 ; ht++ - - ADD lt_1,m1_1,lt_1 ; lt = lt+m1; - ADD,DC ht_1,%r0,ht_1 ; ht++ - ADD %ret0,lt_0,lt_0 ; lt = lt + c (ret0); - ADD,DC ht_0,%r0,ht_0 ; ht++ - - ADD ht_0,lt_1,lt_1 ; lt = lt + c (ht_0) - ADD,DC ht_1,%r0,ht_1 ; ht++ - STD lt_0,0(r_ptr) ; rp[0] = lt - STD lt_1,8(r_ptr) ; rp[1] = lt - - COPY ht_1,%ret0 ; carry = ht - LDO -2(num),num ; num = num - 2; - LDO 16(a_ptr),a_ptr ; ap += 2 - CMPIB,<= 2,num,bn_mul_words_unroll2 - LDO 16(r_ptr),r_ptr ; rp += 2 - - CMPIB,=,N 0,num,bn_mul_words_exit ; are we done? - - ; - ; Top of loop aligned on 64-byte boundary - ; -bn_mul_words_single_top - FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) - - XMPYU fht_0,fw_l,fm1 ; m1 = ht*fw_l - FSTD fm1,-16(%sp) ; -16(sp) = m1 - XMPYU flt_0,fw_h,fm ; m = lt*fw_h - FSTD fm,-8(%sp) ; -8(sp) = m - XMPYU fht_0,fw_h,ht_temp ; ht_temp = ht*fw_h - FSTD ht_temp,-24(%sp) ; -24(sp) = ht - XMPYU flt_0,fw_l,lt_temp ; lt_temp = lt*fw_l - FSTD lt_temp,-32(%sp) ; -32(sp) = lt - - LDD -8(%sp),m_0 - LDD -16(%sp),m1_0 - ADD,L m_0,m1_0,tmp_0 ; tmp_0 = m + m1; - LDD -24(%sp),ht_0 - LDD -32(%sp),lt_0 - - CMPCLR,*>>= tmp_0,m1_0,%r0 ; if (m < m1) - ADD,L ht_0,top_overflow,ht_0 ; ht += (1<<32) - - EXTRD,U tmp_0,31,32,m_0 ; m>>32 - DEPD,Z tmp_0,31,32,m1_0 ; m1 = m<<32 - - ADD,L ht_0,m_0,ht_0 ; ht+= (m>>32) - ADD lt_0,m1_0,lt_0 ; lt= lt+m1; - ADD,DC ht_0,%r0,ht_0 ; ht++ - - ADD %ret0,lt_0,lt_0 ; lt = lt + c; - ADD,DC ht_0,%r0,ht_0 ; ht++ - - COPY ht_0,%ret0 ; copy carry - STD lt_0,0(r_ptr) ; rp[0] = lt - -bn_mul_words_exit - .EXIT - LDD -96(%sp),%r7 ; restore r7 - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 ; restore r3 - .PROCEND ;in=23,24,25,26,29;out=28; - -;---------------------------------------------------------------------------- -; -;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num) -; -; arg0 = rp -; arg1 = ap -; arg2 = num -; - -bn_sqr_words - .proc - .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .entry - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - NOP - STD %r5,16(%sp) ; save r5 - - CMPIB,>= 0,num,bn_sqr_words_exit - LDO 128(%sp),%sp ; bump stack - - ; - ; If only 1, then go straight to cleanup - ; - CMPIB,= 1,num,bn_sqr_words_single_top - DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L - - ; - ; This loop is unrolled 2 times (64-byte aligned as well) - ; - -bn_sqr_words_unroll2 - FLDD 0(a_ptr),t_float_0 ; a[0] - FLDD 8(a_ptr),t_float_1 ; a[1] - XMPYU fht_0,flt_0,fm ; m[0] - XMPYU fht_1,flt_1,fm_1 ; m[1] - - FSTD fm,-24(%sp) ; store m[0] - FSTD fm_1,-56(%sp) ; store m[1] - XMPYU flt_0,flt_0,lt_temp ; lt[0] - XMPYU flt_1,flt_1,lt_temp_1 ; lt[1] - - FSTD lt_temp,-16(%sp) ; store lt[0] - FSTD lt_temp_1,-48(%sp) ; store lt[1] - XMPYU fht_0,fht_0,ht_temp ; ht[0] - XMPYU fht_1,fht_1,ht_temp_1 ; ht[1] - - FSTD ht_temp,-8(%sp) ; store ht[0] - FSTD ht_temp_1,-40(%sp) ; store ht[1] - LDD -24(%sp),m_0 - LDD -56(%sp),m_1 - - AND m_0,high_mask,tmp_0 ; m[0] & Mask - AND m_1,high_mask,tmp_1 ; m[1] & Mask - DEPD,Z m_0,30,31,m_0 ; m[0] << 32+1 - DEPD,Z m_1,30,31,m_1 ; m[1] << 32+1 - - LDD -16(%sp),lt_0 - LDD -48(%sp),lt_1 - EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m[0]&Mask >> 32-1 - EXTRD,U tmp_1,32,33,tmp_1 ; tmp_1 = m[1]&Mask >> 32-1 - - LDD -8(%sp),ht_0 - LDD -40(%sp),ht_1 - ADD,L ht_0,tmp_0,ht_0 ; ht[0] += tmp_0 - ADD,L ht_1,tmp_1,ht_1 ; ht[1] += tmp_1 - - ADD lt_0,m_0,lt_0 ; lt = lt+m - ADD,DC ht_0,%r0,ht_0 ; ht[0]++ - STD lt_0,0(r_ptr) ; rp[0] = lt[0] - STD ht_0,8(r_ptr) ; rp[1] = ht[0] - - ADD lt_1,m_1,lt_1 ; lt = lt+m - ADD,DC ht_1,%r0,ht_1 ; ht[1]++ - STD lt_1,16(r_ptr) ; rp[2] = lt[1] - STD ht_1,24(r_ptr) ; rp[3] = ht[1] - - LDO -2(num),num ; num = num - 2; - LDO 16(a_ptr),a_ptr ; ap += 2 - CMPIB,<= 2,num,bn_sqr_words_unroll2 - LDO
32(r_ptr),r_ptr ; rp += 4 - - CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done? - - ; - ; Top of loop aligned on 64-byte boundary - ; -bn_sqr_words_single_top - FLDD 0(a_ptr),t_float_0 ; load up 64-bit value (fr8L) ht(L)/lt(R) - - XMPYU fht_0,flt_0,fm ; m - FSTD fm,-24(%sp) ; store m - - XMPYU flt_0,flt_0,lt_temp ; lt - FSTD lt_temp,-16(%sp) ; store lt - - XMPYU fht_0,fht_0,ht_temp ; ht - FSTD ht_temp,-8(%sp) ; store ht - - LDD -24(%sp),m_0 ; load m - AND m_0,high_mask,tmp_0 ; m & Mask - DEPD,Z m_0,30,31,m_0 ; m << 32+1 - LDD -16(%sp),lt_0 ; lt - - LDD -8(%sp),ht_0 ; ht - EXTRD,U tmp_0,32,33,tmp_0 ; tmp_0 = m&Mask >> 32-1 - ADD m_0,lt_0,lt_0 ; lt = lt+m - ADD,L ht_0,tmp_0,ht_0 ; ht += tmp_0 - ADD,DC ht_0,%r0,ht_0 ; ht++ - - STD lt_0,0(r_ptr) ; rp[0] = lt - STD ht_0,8(r_ptr) ; rp[1] = ht - -bn_sqr_words_exit - .EXIT - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 - .PROCEND ;in=23,24,25,26,29;out=28; - - -;---------------------------------------------------------------------------- -; -;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) -; -; arg0 = rp -; arg1 = ap -; arg2 = bp -; arg3 = n - -t .reg %r22 -b .reg %r21 -l .reg %r20 - -bn_add_words - .proc - .entry - .callinfo - .EXPORT bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .align 64 - - CMPIB,>= 0,n,bn_add_words_exit - COPY %r0,%ret0 ; return 0 by default - - ; - ; If 2 or more numbers do the loop - ; - CMPIB,= 1,n,bn_add_words_single_top - NOP - - ; - ; This loop is unrolled 2 times (64-byte aligned as well) - ; -bn_add_words_unroll2 - LDD 0(a_ptr),t - LDD 0(b_ptr),b - ADD t,%ret0,t ; t = t+c; - ADD,DC %r0,%r0,%ret0 ; set c to carry - ADD t,b,l ; l = t + b[0] - ADD,DC %ret0,%r0,%ret0 ; c+= carry - STD l,0(r_ptr) - - LDD 8(a_ptr),t - LDD 8(b_ptr),b - ADD t,%ret0,t ; t = t+c; - ADD,DC %r0,%r0,%ret0 ; set c to carry - ADD t,b,l ; l = t + b[0] - ADD,DC %ret0,%r0,%ret0 ; c+= carry - STD l,8(r_ptr) - - LDO -2(n),n - LDO 16(a_ptr),a_ptr - LDO 16(b_ptr),b_ptr - - CMPIB,<= 2,n,bn_add_words_unroll2 - LDO 16(r_ptr),r_ptr - - CMPIB,=,N 0,n,bn_add_words_exit ; are we done? - -bn_add_words_single_top - LDD 0(a_ptr),t - LDD 0(b_ptr),b - - ADD t,%ret0,t ; t = t+c; - ADD,DC %r0,%r0,%ret0 ; set c to carry (could use CMPCLR??) 
- ADD t,b,l ; l = t + b[0] - ADD,DC %ret0,%r0,%ret0 ; c+= carry - STD l,0(r_ptr) - -bn_add_words_exit - .EXIT - BVE (%rp) - NOP - .PROCEND ;in=23,24,25,26,29;out=28; - -;---------------------------------------------------------------------------- -; -;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) -; -; arg0 = rp -; arg1 = ap -; arg2 = bp -; arg3 = n - -t1 .reg %r22 -t2 .reg %r21 -sub_tmp1 .reg %r20 -sub_tmp2 .reg %r19 - - -bn_sub_words - .proc - .callinfo - .EXPORT bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .entry - .align 64 - - CMPIB,>= 0,n,bn_sub_words_exit - COPY %r0,%ret0 ; return 0 by default - - ; - ; If 2 or more numbers do the loop - ; - CMPIB,= 1,n,bn_sub_words_single_top - NOP - - ; - ; This loop is unrolled 2 times (64-byte aligned as well) - ; -bn_sub_words_unroll2 - LDD 0(a_ptr),t1 - LDD 0(b_ptr),t2 - SUB t1,t2,sub_tmp1 ; t3 = t1-t2; - SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c; - - CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 - LDO 1(%r0),sub_tmp2 - - CMPCLR,*= t1,t2,%r0 - COPY sub_tmp2,%ret0 - STD sub_tmp1,0(r_ptr) - - LDD 8(a_ptr),t1 - LDD 8(b_ptr),t2 - SUB t1,t2,sub_tmp1 ; t3 = t1-t2; - SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c; - CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 - LDO 1(%r0),sub_tmp2 - - CMPCLR,*= t1,t2,%r0 - COPY sub_tmp2,%ret0 - STD sub_tmp1,8(r_ptr) - - LDO -2(n),n - LDO 16(a_ptr),a_ptr - LDO 16(b_ptr),b_ptr - - CMPIB,<= 2,n,bn_sub_words_unroll2 - LDO 16(r_ptr),r_ptr - - CMPIB,=,N 0,n,bn_sub_words_exit ; are we done? - -bn_sub_words_single_top - LDD 0(a_ptr),t1 - LDD 0(b_ptr),t2 - SUB t1,t2,sub_tmp1 ; t3 = t1-t2; - SUB sub_tmp1,%ret0,sub_tmp1 ; t3 = t3- c; - CMPCLR,*>> t1,t2,sub_tmp2 ; clear if t1 > t2 - LDO 1(%r0),sub_tmp2 - - CMPCLR,*= t1,t2,%r0 - COPY sub_tmp2,%ret0 - - STD sub_tmp1,0(r_ptr) - -bn_sub_words_exit - .EXIT - BVE (%rp) - NOP - .PROCEND ;in=23,24,25,26,29;out=28; - -;------------------------------------------------------------------------------ -; -; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d) -; -; arg0 = h -; arg1 = l -; arg2 = d -; -; This is mainly just modified assembly from the compiler, thus the -; lack of variable names. 
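
The routine below keeps the compiler's register numbering, so it reads most easily against a model of what it computes: the 64-bit quotient of the two-word value h:l by d, with h < d assumed so the quotient fits in one word. A minimal C sketch (the helper name is mine; it leans on a 128-bit compiler type, which is precisely what this hand-scheduled code cannot):

    #include <stdint.h>

    /* floor(((h << 64) | l) / d), assuming h < d.  Reference model only. */
    static uint64_t div_words_ref(uint64_t h, uint64_t l, uint64_t d)
    {
        unsigned __int128 n = ((unsigned __int128)h << 64) | l;
        return (uint64_t)(n / d);
    }

The assembly appears to get there the classical way instead: normalize d via BN_num_bits_word, estimate 32 quotient bits with the $$div2U millicode, correct the estimate, and iterate twice (the count of 2 loaded into %r9 below).
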
-; -;------------------------------------------------------------------------------ -bn_div_words - .proc - .callinfo CALLER,FRAME=272,ENTRY_GR=%r10,SAVE_RP,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_div_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .IMPORT BN_num_bits_word,CODE,NO_RELOCATION - .IMPORT __iob,DATA - .IMPORT fprintf,CODE,NO_RELOCATION - .IMPORT abort,CODE,NO_RELOCATION - .IMPORT $$div2U,MILLICODE - .entry - STD %r2,-16(%r30) - STD,MA %r3,352(%r30) - STD %r4,-344(%r30) - STD %r5,-336(%r30) - STD %r6,-328(%r30) - STD %r7,-320(%r30) - STD %r8,-312(%r30) - STD %r9,-304(%r30) - STD %r10,-296(%r30) - - STD %r27,-288(%r30) ; save gp - - COPY %r24,%r3 ; save d - COPY %r26,%r4 ; save h (high 64-bits) - LDO -1(%r0),%ret0 ; return -1 by default - - CMPB,*= %r0,%arg2,$D3 ; if (d == 0) - COPY %r25,%r5 ; save l (low 64-bits) - - LDO -48(%r30),%r29 ; create ap - .CALL ;in=26,29;out=28; - B,L BN_num_bits_word,%r2 - COPY %r3,%r26 - LDD -288(%r30),%r27 ; restore gp - LDI 64,%r21 - - CMPB,= %r21,%ret0,$00000012 ;if (i == 64) (forward) - COPY %ret0,%r24 ; i - MTSARCM %r24 - DEPDI,Z -1,%sar,1,%r29 - CMPB,*<<,N %r29,%r4,bn_div_err_case ; if (h > 1<<i) (forward) - -$00000012 - SUBI 64,%r24,%r31 ; i = 64 - i; - CMPCLR,*<< %r4,%r3,%r0 ; if (h >= d) - SUB %r4,%r3,%r4 ; h -= d - CMPB,= %r31,%r0,$0000001A ; if (i) - COPY %r0,%r10 ; ret = 0 - MTSARCM %r31 ; i to shift - DEPD,Z %r3,%sar,64,%r3 ; d <<= i; - SUBI 64,%r31,%r19 ; 64 - i; redundent - MTSAR %r19 ; (64 -i) to shift - SHRPD %r4,%r5,%sar,%r4 ; l>> (64-i) - MTSARCM %r31 ; i to shift - DEPD,Z %r5,%sar,64,%r5 ; l <<= i; - -$0000001A - DEPDI,Z -1,31,32,%r19 - EXTRD,U %r3,31,32,%r6 ; dh=(d&0xfff)>>32 - EXTRD,U %r3,63,32,%r8 ; dl = d&0xffffff - LDO 2(%r0),%r9 - STD %r3,-280(%r30) ; "d" to stack - -$0000001C - DEPDI,Z -1,63,32,%r29 ; - EXTRD,U %r4,31,32,%r31 ; h >> 32 - CMPB,*=,N %r31,%r6,$D2 ; if ((h>>32) != dh)(forward) div - COPY %r4,%r26 - EXTRD,U %r4,31,32,%r25 - COPY %r6,%r24 - .CALL ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL) - B,L $$div2U,%r2 - EXTRD,U %r6,31,32,%r23 - DEPD %r28,31,32,%r29 -$D2 - STD %r29,-272(%r30) ; q - AND %r5,%r19,%r24 ; t & 0xffffffff00000000; - EXTRD,U %r24,31,32,%r24 ; ??? 
- FLDD -272(%r30),%fr7 ; q - FLDD -280(%r30),%fr8 ; d - XMPYU %fr8L,%fr7L,%fr10 - FSTD %fr10,-256(%r30) - XMPYU %fr8L,%fr7R,%fr22 - FSTD %fr22,-264(%r30) - XMPYU %fr8R,%fr7L,%fr11 - XMPYU %fr8R,%fr7R,%fr23 - FSTD %fr11,-232(%r30) - FSTD %fr23,-240(%r30) - LDD -256(%r30),%r28 - DEPD,Z %r28,31,32,%r2 - LDD -264(%r30),%r20 - ADD,L %r20,%r2,%r31 - LDD -232(%r30),%r22 - DEPD,Z %r22,31,32,%r22 - LDD -240(%r30),%r21 - B $00000024 ; enter loop - ADD,L %r21,%r22,%r23 - -$0000002A - LDO -1(%r29),%r29 - SUB %r23,%r8,%r23 -$00000024 - SUB %r4,%r31,%r25 - AND %r25,%r19,%r26 - CMPB,*<>,N %r0,%r26,$00000046 ; (forward) - DEPD,Z %r25,31,32,%r20 - OR %r20,%r24,%r21 - CMPB,*<<,N %r21,%r23,$0000002A ;(backward) - SUB %r31,%r6,%r31 -;-------------Break path--------------------- - -$00000046 - DEPD,Z %r23,31,32,%r25 ;tl - EXTRD,U %r23,31,32,%r26 ;t - AND %r25,%r19,%r24 ;tl = (tl<<32)&0xfffffff0000000L - ADD,L %r31,%r26,%r31 ;th += t; - CMPCLR,*>>= %r5,%r24,%r0 ;if (l<tl) - LDO 1(%r31),%r31 ; th++; - CMPB,*<<=,N %r31,%r4,$00000036 ;if (n < th) (forward) - LDO -1(%r29),%r29 ;q--; - ADD,L %r4,%r3,%r4 ;h += d; -$00000036 - ADDIB,=,N -1,%r9,$D1 ;if (--count == 0) break (forward) - SUB %r5,%r24,%r28 ; l -= tl; - SUB %r4,%r31,%r24 ; h -= th; - SHRPD %r24,%r28,32,%r4 ; h = ((h<<32)|(l>>32)); - DEPD,Z %r29,31,32,%r10 ; ret = q<<32 - b $0000001C - DEPD,Z %r28,31,32,%r5 ; l = l << 32 - -$D1 - OR %r10,%r29,%r28 ; ret |= q -$D3 - LDD -368(%r30),%r2 -$D0 - LDD -296(%r30),%r10 - LDD -304(%r30),%r9 - LDD -312(%r30),%r8 - LDD -320(%r30),%r7 - LDD -328(%r30),%r6 - LDD -336(%r30),%r5 - LDD -344(%r30),%r4 - BVE (%r2) - .EXIT - LDD,MB -352(%r30),%r3 - -bn_div_err_case - MFIA %r6 - ADDIL L'bn_div_words-bn_div_err_case,%r6,%r1 - LDO R'bn_div_words-bn_div_err_case(%r1),%r6 - ADDIL LT'__iob,%r27,%r1 - LDD RT'__iob(%r1),%r26 - ADDIL L'C$4-bn_div_words,%r6,%r1 - LDO R'C$4-bn_div_words(%r1),%r25 - LDO 64(%r26),%r26 - .CALL ;in=24,25,26,29;out=28; - B,L fprintf,%r2 - LDO -48(%r30),%r29 - LDD -288(%r30),%r27 - .CALL ;in=29; - B,L abort,%r2 - LDO -48(%r30),%r29 - LDD -288(%r30),%r27 - B $D0 - LDD -368(%r30),%r2 - .PROCEND ;in=24,25,26,29;out=28; - -;---------------------------------------------------------------------------- -; -; Registers to hold 64-bit values to manipulate. The "L" part -; of the register corresponds to the upper 32-bits, while the "R" -; part corresponds to the lower 32-bits -; -; Note, that when using b6 and b7, the code must save these before -; using them because they are callee save registers -; -; -; Floating point registers to use to save values that -; are manipulated. These don't collide with ftemp1-6 and -; are all caller save registers -; -a0 .reg %fr22 -a0L .reg %fr22L -a0R .reg %fr22R - -a1 .reg %fr23 -a1L .reg %fr23L -a1R .reg %fr23R - -a2 .reg %fr24 -a2L .reg %fr24L -a2R .reg %fr24R - -a3 .reg %fr25 -a3L .reg %fr25L -a3R .reg %fr25R - -a4 .reg %fr26 -a4L .reg %fr26L -a4R .reg %fr26R - -a5 .reg %fr27 -a5L .reg %fr27L -a5R .reg %fr27R - -a6 .reg %fr28 -a6L .reg %fr28L -a6R .reg %fr28R - -a7 .reg %fr29 -a7L .reg %fr29L -a7R .reg %fr29R - -b0 .reg %fr30 -b0L .reg %fr30L -b0R .reg %fr30R - -b1 .reg %fr31 -b1L .reg %fr31L -b1R .reg %fr31R - -; -; Temporary floating point variables, these are all caller save -; registers -; -ftemp1 .reg %fr4 -ftemp2 .reg %fr5 -ftemp3 .reg %fr6 -ftemp4 .reg %fr7 - -; -; The B set of registers when used. 
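
Each 64-bit operand sits in one floating-point register whose L and R halves feed XMPYU, a 32x32->64 multiply; four such partial products are recombined into a 128-bit result, which is exactly the shape of the SQR_ADD_C and MUL_ADD_C macros that follow. A rough C model of the recombination (illustrative; the helper name is mine):

    #include <stdint.h>

    /* 64x64 -> 128 from four 32x32 -> 64 partial products, mirroring
     * what XMPYU does with the L/R register halves. */
    static void mul64_ref(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
    {
        uint64_t aL = a & 0xffffffff, aH = a >> 32;
        uint64_t bL = b & 0xffffffff, bH = b >> 32;

        uint64_t ll = aL * bL;                    /* "lt" in the macros */
        uint64_t lh = aL * bH;                    /* "m"                */
        uint64_t hl = aH * bL;                    /* "m1"               */
        uint64_t hh = aH * bH;                    /* "ht"               */

        uint64_t mid = lh + hl;                   /* may wrap past 2^64 */
        uint64_t c   = (mid < lh) ? ((uint64_t)1 << 32) : 0;

        *lo = ll + (mid << 32);
        *hi = hh + (mid >> 32) + c + (*lo < ll ? 1 : 0);
    }

The high_one constant (1 << 32) in the macros plays the role of c here: a carry out of the 64-bit middle sum is worth 2^96, which is 2^32 in the high word.
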
-; - -b2 .reg %fr8 -b2L .reg %fr8L -b2R .reg %fr8R - -b3 .reg %fr9 -b3L .reg %fr9L -b3R .reg %fr9R - -b4 .reg %fr10 -b4L .reg %fr10L -b4R .reg %fr10R - -b5 .reg %fr11 -b5L .reg %fr11L -b5R .reg %fr11R - -b6 .reg %fr12 -b6L .reg %fr12L -b6R .reg %fr12R - -b7 .reg %fr13 -b7L .reg %fr13L -b7R .reg %fr13R - -c1 .reg %r21 ; only reg -temp1 .reg %r20 ; only reg -temp2 .reg %r19 ; only reg -temp3 .reg %r31 ; only reg - -m1 .reg %r28 -c2 .reg %r23 -high_one .reg %r1 -ht .reg %r6 -lt .reg %r5 -m .reg %r4 -c3 .reg %r3 - -SQR_ADD_C .macro A0L,A0R,C1,C2,C3 - XMPYU A0L,A0R,ftemp1 ; m - FSTD ftemp1,-24(%sp) ; store m - - XMPYU A0R,A0R,ftemp2 ; lt - FSTD ftemp2,-16(%sp) ; store lt - - XMPYU A0L,A0L,ftemp3 ; ht - FSTD ftemp3,-8(%sp) ; store ht - - LDD -24(%sp),m ; load m - AND m,high_mask,temp2 ; m & Mask - DEPD,Z m,30,31,temp3 ; m << 32+1 - LDD -16(%sp),lt ; lt - - LDD -8(%sp),ht ; ht - EXTRD,U temp2,32,33,temp1 ; temp1 = m&Mask >> 32-1 - ADD temp3,lt,lt ; lt = lt+m - ADD,L ht,temp1,ht ; ht += temp1 - ADD,DC ht,%r0,ht ; ht++ - - ADD C1,lt,C1 ; c1=c1+lt - ADD,DC ht,%r0,ht ; ht++ - - ADD C2,ht,C2 ; c2=c2+ht - ADD,DC C3,%r0,C3 ; c3++ -.endm - -SQR_ADD_C2 .macro A0L,A0R,A1L,A1R,C1,C2,C3 - XMPYU A0L,A1R,ftemp1 ; m1 = bl*ht - FSTD ftemp1,-16(%sp) ; - XMPYU A0R,A1L,ftemp2 ; m = bh*lt - FSTD ftemp2,-8(%sp) ; - XMPYU A0R,A1R,ftemp3 ; lt = bl*lt - FSTD ftemp3,-32(%sp) - XMPYU A0L,A1L,ftemp4 ; ht = bh*ht - FSTD ftemp4,-24(%sp) ; - - LDD -8(%sp),m ; r21 = m - LDD -16(%sp),m1 ; r19 = m1 - ADD,L m,m1,m ; m+m1 - - DEPD,Z m,31,32,temp3 ; (m+m1<<32) - LDD -24(%sp),ht ; r24 = ht - - CMPCLR,*>>= m,m1,%r0 ; if (m < m1) - ADD,L ht,high_one,ht ; ht+=high_one - - EXTRD,U m,31,32,temp1 ; m >> 32 - LDD -32(%sp),lt ; lt - ADD,L ht,temp1,ht ; ht+= m>>32 - ADD lt,temp3,lt ; lt = lt+m1 - ADD,DC ht,%r0,ht ; ht++ - - ADD ht,ht,ht ; ht=ht+ht; - ADD,DC C3,%r0,C3 ; add in carry (c3++) - - ADD lt,lt,lt ; lt=lt+lt; - ADD,DC ht,%r0,ht ; add in carry (ht++) - - ADD C1,lt,C1 ; c1=c1+lt - ADD,DC,*NUV ht,%r0,ht ; add in carry (ht++) - LDO 1(C3),C3 ; bump c3 if overflow,nullify otherwise - - ADD C2,ht,C2 ; c2 = c2 + ht - ADD,DC C3,%r0,C3 ; add in carry (c3++) -.endm - -; -;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a) -; arg0 = r_ptr -; arg1 = a_ptr -; - -bn_sqr_comba8 - .PROC - .CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .ENTRY - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - STD %r5,16(%sp) ; save r5 - STD %r6,24(%sp) ; save r6 - - ; - ; Zero out carries - ; - COPY %r0,c1 - COPY %r0,c2 - COPY %r0,c3 - - LDO 128(%sp),%sp ; bump stack - DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L - DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 - - ; - ; Load up all of the values we are going to use - ; - FLDD 0(a_ptr),a0 - FLDD 8(a_ptr),a1 - FLDD 16(a_ptr),a2 - FLDD 24(a_ptr),a3 - FLDD 32(a_ptr),a4 - FLDD 40(a_ptr),a5 - FLDD 48(a_ptr),a6 - FLDD 56(a_ptr),a7 - - SQR_ADD_C a0L,a0R,c1,c2,c3 - STD c1,0(r_ptr) ; r[0] = c1; - COPY %r0,c1 - - SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 - STD c2,8(r_ptr) ; r[1] = c2; - COPY %r0,c2 - - SQR_ADD_C a1L,a1R,c3,c1,c2 - SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 - STD c3,16(r_ptr) ; r[2] = c3; - COPY %r0,c3 - - SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 - SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 - STD c1,24(r_ptr) ; r[3] = c1; - COPY %r0,c1 - - SQR_ADD_C a2L,a2R,c2,c3,c1 - SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 - SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1 - STD c2,32(r_ptr) ; r[4] = c2; - COPY %r0,c2 - - SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2 - 
SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2 - SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 - STD c3,40(r_ptr) ; r[5] = c3; - COPY %r0,c3 - - SQR_ADD_C a3L,a3R,c1,c2,c3 - SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3 - SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3 - SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3 - STD c1,48(r_ptr) ; r[6] = c1; - COPY %r0,c1 - - SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1 - SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1 - SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1 - SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1 - STD c2,56(r_ptr) ; r[7] = c2; - COPY %r0,c2 - - SQR_ADD_C a4L,a4R,c3,c1,c2 - SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2 - SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2 - SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2 - STD c3,64(r_ptr) ; r[8] = c3; - COPY %r0,c3 - - SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3 - SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3 - SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3 - STD c1,72(r_ptr) ; r[9] = c1; - COPY %r0,c1 - - SQR_ADD_C a5L,a5R,c2,c3,c1 - SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1 - SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1 - STD c2,80(r_ptr) ; r[10] = c2; - COPY %r0,c2 - - SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2 - SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2 - STD c3,88(r_ptr) ; r[11] = c3; - COPY %r0,c3 - - SQR_ADD_C a6L,a6R,c1,c2,c3 - SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3 - STD c1,96(r_ptr) ; r[12] = c1; - COPY %r0,c1 - - SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1 - STD c2,104(r_ptr) ; r[13] = c2; - COPY %r0,c2 - - SQR_ADD_C a7L,a7R,c3,c1,c2 - STD c3, 112(r_ptr) ; r[14] = c3 - STD c1, 120(r_ptr) ; r[15] = c1 - - .EXIT - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 - - .PROCEND - -;----------------------------------------------------------------------------- -; -;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a) -; arg0 = r_ptr -; arg1 = a_ptr -; - -bn_sqr_comba4 - .proc - .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .entry - .align 64 - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - STD %r5,16(%sp) ; save r5 - STD %r6,24(%sp) ; save r6 - - ; - ; Zero out carries - ; - COPY %r0,c1 - COPY %r0,c2 - COPY %r0,c3 - - LDO 128(%sp),%sp ; bump stack - DEPDI,Z -1,32,33,high_mask ; Create Mask 0xffffffff80000000L - DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 - - ; - ; Load up all of the values we are going to use - ; - FLDD 0(a_ptr),a0 - FLDD 8(a_ptr),a1 - FLDD 16(a_ptr),a2 - FLDD 24(a_ptr),a3 - FLDD 32(a_ptr),a4 - FLDD 40(a_ptr),a5 - FLDD 48(a_ptr),a6 - FLDD 56(a_ptr),a7 - - SQR_ADD_C a0L,a0R,c1,c2,c3 - - STD c1,0(r_ptr) ; r[0] = c1; - COPY %r0,c1 - - SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1 - - STD c2,8(r_ptr) ; r[1] = c2; - COPY %r0,c2 - - SQR_ADD_C a1L,a1R,c3,c1,c2 - SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2 - - STD c3,16(r_ptr) ; r[2] = c3; - COPY %r0,c3 - - SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3 - SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3 - - STD c1,24(r_ptr) ; r[3] = c1; - COPY %r0,c1 - - SQR_ADD_C a2L,a2R,c2,c3,c1 - SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1 - - STD c2,32(r_ptr) ; r[4] = c2; - COPY %r0,c2 - - SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2 - STD c3,40(r_ptr) ; r[5] = c3; - COPY %r0,c3 - - SQR_ADD_C a3L,a3R,c1,c2,c3 - STD c1,48(r_ptr) ; r[6] = c1; - STD c2,56(r_ptr) ; r[7] = c2; - - .EXIT - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 - - .PROCEND - - -;--------------------------------------------------------------------------- - -MUL_ADD_C .macro A0L,A0R,B0L,B0R,C1,C2,C3 - XMPYU A0L,B0R,ftemp1 ; m1 = bl*ht - FSTD ftemp1,-16(%sp) ; - XMPYU 
A0R,B0L,ftemp2 ; m = bh*lt - FSTD ftemp2,-8(%sp) ; - XMPYU A0R,B0R,ftemp3 ; lt = bl*lt - FSTD ftemp3,-32(%sp) - XMPYU A0L,B0L,ftemp4 ; ht = bh*ht - FSTD ftemp4,-24(%sp) ; - - LDD -8(%sp),m ; r21 = m - LDD -16(%sp),m1 ; r19 = m1 - ADD,L m,m1,m ; m+m1 - - DEPD,Z m,31,32,temp3 ; (m+m1<<32) - LDD -24(%sp),ht ; r24 = ht - - CMPCLR,*>>= m,m1,%r0 ; if (m < m1) - ADD,L ht,high_one,ht ; ht+=high_one - - EXTRD,U m,31,32,temp1 ; m >> 32 - LDD -32(%sp),lt ; lt - ADD,L ht,temp1,ht ; ht+= m>>32 - ADD lt,temp3,lt ; lt = lt+m1 - ADD,DC ht,%r0,ht ; ht++ - - ADD C1,lt,C1 ; c1=c1+lt - ADD,DC ht,%r0,ht ; bump c3 if overflow,nullify otherwise - - ADD C2,ht,C2 ; c2 = c2 + ht - ADD,DC C3,%r0,C3 ; add in carry (c3++) -.endm - - -; -;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) -; arg0 = r_ptr -; arg1 = a_ptr -; arg2 = b_ptr -; - -bn_mul_comba8 - .proc - .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .entry - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - STD %r5,16(%sp) ; save r5 - STD %r6,24(%sp) ; save r6 - FSTD %fr12,32(%sp) ; save r6 - FSTD %fr13,40(%sp) ; save r7 - - ; - ; Zero out carries - ; - COPY %r0,c1 - COPY %r0,c2 - COPY %r0,c3 - - LDO 128(%sp),%sp ; bump stack - DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 - - ; - ; Load up all of the values we are going to use - ; - FLDD 0(a_ptr),a0 - FLDD 8(a_ptr),a1 - FLDD 16(a_ptr),a2 - FLDD 24(a_ptr),a3 - FLDD 32(a_ptr),a4 - FLDD 40(a_ptr),a5 - FLDD 48(a_ptr),a6 - FLDD 56(a_ptr),a7 - - FLDD 0(b_ptr),b0 - FLDD 8(b_ptr),b1 - FLDD 16(b_ptr),b2 - FLDD 24(b_ptr),b3 - FLDD 32(b_ptr),b4 - FLDD 40(b_ptr),b5 - FLDD 48(b_ptr),b6 - FLDD 56(b_ptr),b7 - - MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 - STD c1,0(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 - MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 - STD c2,8(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 - MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 - MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 - STD c3,16(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 - MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 - MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 - MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 - STD c1,24(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1 - MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 - MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 - MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 - MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1 - STD c2,32(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2 - MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2 - MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 - MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 - MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2 - MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2 - STD c3,40(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3 - MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3 - MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3 - MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 - MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3 - MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3 - MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3 - STD c1,48(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1 - MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1 - MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1 - MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1 - MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1 - MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1 - MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1 - MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1 - STD c2,56(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2 - MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2 - MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2 - MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2 - MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2 - MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2 - 
MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2 - STD c3,64(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3 - MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3 - MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3 - MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3 - MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3 - MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3 - STD c1,72(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1 - MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1 - MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1 - MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1 - MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1 - STD c2,80(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2 - MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2 - MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2 - MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2 - STD c3,88(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3 - MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3 - MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3 - STD c1,96(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1 - MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1 - STD c2,104(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2 - STD c3,112(r_ptr) - STD c1,120(r_ptr) - - .EXIT - FLDD -88(%sp),%fr13 - FLDD -96(%sp),%fr12 - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 - - .PROCEND - -;----------------------------------------------------------------------------- -; -;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) -; arg0 = r_ptr -; arg1 = a_ptr -; arg2 = b_ptr -; - -bn_mul_comba4 - .proc - .callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE - .EXPORT bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN - .entry - .align 64 - - STD %r3,0(%sp) ; save r3 - STD %r4,8(%sp) ; save r4 - STD %r5,16(%sp) ; save r5 - STD %r6,24(%sp) ; save r6 - FSTD %fr12,32(%sp) ; save r6 - FSTD %fr13,40(%sp) ; save r7 - - ; - ; Zero out carries - ; - COPY %r0,c1 - COPY %r0,c2 - COPY %r0,c3 - - LDO 128(%sp),%sp ; bump stack - DEPDI,Z 1,31,1,high_one ; Create Value 1 << 32 - - ; - ; Load up all of the values we are going to use - ; - FLDD 0(a_ptr),a0 - FLDD 8(a_ptr),a1 - FLDD 16(a_ptr),a2 - FLDD 24(a_ptr),a3 - - FLDD 0(b_ptr),b0 - FLDD 8(b_ptr),b1 - FLDD 16(b_ptr),b2 - FLDD 24(b_ptr),b3 - - MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3 - STD c1,0(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1 - MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1 - STD c2,8(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2 - MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2 - MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2 - STD c3,16(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3 - MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3 - MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3 - MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3 - STD c1,24(r_ptr) - COPY %r0,c1 - - MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1 - MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1 - MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1 - STD c2,32(r_ptr) - COPY %r0,c2 - - MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2 - MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2 - STD c3,40(r_ptr) - COPY %r0,c3 - - MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3 - STD c1,48(r_ptr) - STD c2,56(r_ptr) - - .EXIT - FLDD -88(%sp),%fr13 - FLDD -96(%sp),%fr12 - LDD -104(%sp),%r6 ; restore r6 - LDD -112(%sp),%r5 ; restore r5 - LDD -120(%sp),%r4 ; restore r4 - BVE (%rp) - LDD,MB -128(%sp),%r3 - - .PROCEND - - - .SPACE $TEXT$ - .SUBSPA $CODE$ - .SPACE $PRIVATE$,SORT=16 - .IMPORT $global$,DATA - .SPACE $TEXT$ - .SUBSPA $CODE$ - .SUBSPA $LIT$,ACCESS=0x2c -C$4 - .ALIGN 8 - .STRINGZ "Division would overflow (%d)\n" - .END diff --git a/crypto/bn/asm/parisc-mont.pl b/crypto/bn/asm/parisc-mont.pl index c02ef6f01466..aa9f626ed267 100755 
--- a/crypto/bn/asm/parisc-mont.pl +++ b/crypto/bn/asm/parisc-mont.pl @@ -1,7 +1,14 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2009-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -14,7 +21,7 @@ # optimal in respect to instruction set capabilities. Fair comparison # with vendor compiler is problematic, because OpenSSL doesn't define # BN_LLONG [presumably] for historical reasons, which drives compiler -# toward 4 times 16x16=32-bit multiplicatons [plus complementary +# toward 4 times 16x16=32-bit multiplications [plus complementary # shifts and additions] instead. This means that you should observe # several times improvement over code generated by vendor compiler # for PA-RISC 1.1, but the "baseline" is far from optimal. The actual @@ -126,7 +133,7 @@ $fp="%r3"; $hi1="%r2"; $hi0="%r1"; -$xfer=$n0; # accomodates [-16..15] offset in fld[dw]s +$xfer=$n0; # accommodates [-16..15] offset in fld[dw]s $fm0="%fr4"; $fti=$fm0; $fbi="%fr5L"; @@ -510,7 +517,6 @@ L\$sub stws,ma $hi1,4($rp) subb $ti0,%r0,$hi1 - ldo -4($tp),$tp ___ $code.=<<___ if ($BN_SZ==8); ldd,ma 8($tp),$ti0 @@ -525,21 +531,19 @@ L\$sub extrd,u $ti0,31,32,$ti0 ; carry in flipped word order sub,db $ti0,%r0,$hi1 - ldo -8($tp),$tp ___ $code.=<<___; - and $tp,$hi1,$ap - andcm $rp,$hi1,$bp - or $ap,$bp,$np - + ldo `$LOCALS+32`($fp),$tp sub $rp,$arrsz,$rp ; rewind rp subi 0,$arrsz,$idx - ldo `$LOCALS+32`($fp),$tp L\$copy - ldd $idx($np),$hi0 + ldd 0($tp),$ti0 + ldd 0($rp),$hi0 std,ma %r0,8($tp) - addib,<> 8,$idx,.-8 ; L\$copy - std,ma $hi0,8($rp) + comiclr,= 0,$hi1,%r0 + copy $ti0,$hi0 + addib,<> 8,$idx,L\$copy + std,ma $hi0,8($rp) ___ if ($BN_SZ==4) { # PA-RISC 1.1 code-path @@ -849,19 +853,18 @@ L\$sub_pa11 stws,ma $hi1,4($rp) subb $ti0,%r0,$hi1 - ldo -4($tp),$tp - and $tp,$hi1,$ap - andcm $rp,$hi1,$bp - or $ap,$bp,$np + ldo `$LOCALS+32`($fp),$tp sub $rp,$arrsz,$rp ; rewind rp subi 0,$arrsz,$idx - ldo `$LOCALS+32`($fp),$tp L\$copy_pa11 - ldwx $idx($np),$hi0 + ldw 0($tp),$ti0 + ldw 0($rp),$hi0 stws,ma %r0,4($tp) + comiclr,= 0,$hi1,%r0 + copy $ti0,$hi0 addib,<> 4,$idx,L\$copy_pa11 - stws,ma $hi0,4($rp) + stws,ma $hi0,4($rp) nop ; alignment L\$done @@ -981,6 +984,11 @@ sub assemble { ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; } +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler/) { + $gnuas = 1; +} + foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/ge; # flip word order in 64-bit mode... @@ -988,7 +996,10 @@ foreach (split("\n",$code)) { # assemble 2.0 instructions in 32-bit mode... 
s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4); - s/\bbv\b/bve/gm if ($SIZE_T==8); + s/(\.LEVEL\s+2\.0)W/$1w/ if ($gnuas && $SIZE_T==8); + s/\.SPACE\s+\$TEXT\$/.text/ if ($gnuas && $SIZE_T==8); + s/\.SUBSPA.*// if ($gnuas && $SIZE_T==8); + s/\bbv\b/bve/ if ($SIZE_T==8); print $_,"\n"; } diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl index 6930a3acebd2..ec7e019a4380 100755 --- a/crypto/bn/asm/ppc-mont.pl +++ b/crypto/bn/asm/ppc-mont.pl @@ -1,7 +1,14 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2006-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -19,11 +26,21 @@ # So far RSA *sign* performance improvement over pre-bn_mul_mont asm # for 64-bit application running on PPC970/G5 is: # -# 512-bit +65% +# 512-bit +65% # 1024-bit +35% # 2048-bit +18% # 4096-bit +4% +# September 2016 +# +# Add multiplication procedure operating on lengths divisible by 4 +# and squaring procedure operating on lengths divisible by 8. Length +# is expressed in number of limbs. RSA private key operations are +# ~35-50% faster (more for longer keys) on contemporary high-end POWER +# processors in 64-bit builds, [mysteriously enough] more in 32-bit +# builds. On low-end 32-bit processors performance improvement turned +# to be marginal... 
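
Both the existing bn_mul_mont_int path and the 4x/8x procedures added here compute the same thing: a word-level Montgomery product. A compact C reference for what follows (a sketch only, with hypothetical names; it assumes 64-bit words and a 128-bit compiler type, and the real code interleaves and unrolls these loops rather than running them back to back):

    #include <stdint.h>
    #include <string.h>

    typedef unsigned __int128 u128;

    /* rp = ap*bp*R^-1 mod np, R = 2^(64*num), n0 = -np[0]^-1 mod 2^64.
     * tp is scratch of num+2 words.  Illustrative reference only. */
    static void mont_mul_ref(uint64_t *rp, const uint64_t *ap,
                             const uint64_t *bp, const uint64_t *np,
                             uint64_t n0, int num, uint64_t *tp)
    {
        memset(tp, 0, (size_t)(num + 2) * sizeof(uint64_t));

        for (int i = 0; i < num; i++) {
            u128 acc = 0;                      /* tp += ap * bp[i] */
            for (int j = 0; j < num; j++) {
                acc = (u128)ap[j] * bp[i] + tp[j] + (uint64_t)(acc >> 64);
                tp[j] = (uint64_t)acc;
            }
            acc = (u128)tp[num] + (uint64_t)(acc >> 64);
            tp[num]     = (uint64_t)acc;
            tp[num + 1] = (uint64_t)(acc >> 64);

            uint64_t m = tp[0] * n0;           /* "t[0]*n0" in the code */
            acc = (u128)m * np[0] + tp[0];     /* low word becomes zero */
            for (int j = 1; j < num; j++) {    /* tp = (tp + m*np) >> 64 */
                acc = (u128)m * np[j] + tp[j] + (uint64_t)(acc >> 64);
                tp[j - 1] = (uint64_t)acc;
            }
            acc = (u128)tp[num] + (uint64_t)(acc >> 64);
            tp[num - 1] = (uint64_t)acc;
            tp[num]     = tp[num + 1] + (uint64_t)(acc >> 64);
        }

        /* tp may still be >= np: subtract once, keep whichever value
         * did not go negative (done branch-free in the assembly). */
        uint64_t borrow = 0;
        for (int j = 0; j < num; j++) {
            uint64_t t = tp[j], n = np[j];
            rp[j]  = t - n - borrow;
            borrow = (t < n) | ((t == n) & borrow);
        }
        borrow = (uint64_t)(tp[num] < borrow); /* borrow out of top word */
        for (int j = 0; j < num; j++)
            rp[j] = borrow ? tp[j] : rp[j];
    }
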
+ $flavour = shift; if ($flavour =~ /32/) { @@ -42,7 +59,8 @@ if ($flavour =~ /32/) { $UMULL= "mullw"; # unsigned multiply low $UMULH= "mulhwu"; # unsigned multiply high $UCMP= "cmplw"; # unsigned compare - $SHRI= "srwi"; # unsigned shift right by immediate + $SHRI= "srwi"; # unsigned shift right by immediate + $SHLI= "slwi"; # unsigned shift left by immediate $PUSH= $ST; $POP= $LD; } elsif ($flavour =~ /64/) { @@ -62,7 +80,8 @@ if ($flavour =~ /32/) { $UMULL= "mulld"; # unsigned multiply low $UMULH= "mulhdu"; # unsigned multiply high $UCMP= "cmpld"; # unsigned compare - $SHRI= "srdi"; # unsigned shift right by immediate + $SHRI= "srdi"; # unsigned shift right by immediate + $SHLI= "sldi"; # unsigned shift left by immediate $PUSH= $ST; $POP= $LD; } else { die "nonsense $flavour"; } @@ -79,43 +98,44 @@ open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; $sp="r1"; $toc="r2"; -$rp="r3"; $ovf="r3"; +$rp="r3"; $ap="r4"; $bp="r5"; $np="r6"; $n0="r7"; $num="r8"; -$rp="r9"; # $rp is reassigned -$aj="r10"; -$nj="r11"; -$tj="r12"; + +{ +my $ovf=$rp; +my $rp="r9"; # $rp is reassigned +my $aj="r10"; +my $nj="r11"; +my $tj="r12"; # non-volatile registers -$i="r20"; -$j="r21"; -$tp="r22"; -$m0="r23"; -$m1="r24"; -$lo0="r25"; -$hi0="r26"; -$lo1="r27"; -$hi1="r28"; -$alo="r29"; -$ahi="r30"; -$nlo="r31"; +my $i="r20"; +my $j="r21"; +my $tp="r22"; +my $m0="r23"; +my $m1="r24"; +my $lo0="r25"; +my $hi0="r26"; +my $lo1="r27"; +my $hi1="r28"; +my $alo="r29"; +my $ahi="r30"; +my $nlo="r31"; # -$nhi="r0"; +my $nhi="r0"; $code=<<___; .machine "any" .text .globl .bn_mul_mont_int -.align 4 +.align 5 .bn_mul_mont_int: - cmpwi $num,4 mr $rp,r3 ; $rp is reassigned li r3,0 - bltlr ___ $code.=<<___ if ($BNSZ==4); cmpwi $num,32 ; longer key performance is not better @@ -294,15 +314,16 @@ Lsub: $LDX $tj,$tp,$j li $j,0 mtctr $num subfe $ovf,$j,$ovf ; handle upmost overflow bit - and $ap,$tp,$ovf - andc $np,$rp,$ovf - or $ap,$ap,$np ; ap=borrow?tp:rp .align 4 -Lcopy: ; copy or in-place refresh - $LDX $tj,$ap,$j - $STX $tj,$rp,$j +Lcopy: ; conditional copy + $LDX $tj,$tp,$j + $LDX $aj,$rp,$j + and $tj,$tj,$ovf + andc $aj,$aj,$ovf $STX $j,$tp,$j ; zap at once + or $aj,$aj,$tj + $STX $aj,$rp,$j addi $j,$j,$BNSZ bdnz Lcopy @@ -326,7 +347,1641 @@ Lcopy: ; copy or in-place refresh .byte 0,12,4,0,0x80,12,6,0 .long 0 .size .bn_mul_mont_int,.-.bn_mul_mont_int +___ +} +if (1) { +my ($a0,$a1,$a2,$a3, + $t0,$t1,$t2,$t3, + $m0,$m1,$m2,$m3, + $acc0,$acc1,$acc2,$acc3,$acc4, + $bi,$mi,$tp,$ap_end,$cnt) = map("r$_",(9..12,14..31)); +my ($carry,$zero) = ($rp,"r0"); + +# sp----------->+-------------------------------+ +# | saved sp | +# +-------------------------------+ +# . . +# +8*size_t +-------------------------------+ +# | 4 "n0*t0" | +# . . +# . . +# +12*size_t +-------------------------------+ +# | size_t tmp[num] | +# . . +# . . +# . . +# +-------------------------------+ +# | topmost carry | +# . . +# -18*size_t +-------------------------------+ +# | 18 saved gpr, r14-r31 | +# . . +# . . +# +-------------------------------+ +$code.=<<___; +.globl .bn_mul4x_mont_int +.align 5 +.bn_mul4x_mont_int: + andi. 
r0,$num,7 + bne .Lmul4x_do + $UCMP $ap,$bp + bne .Lmul4x_do + b .Lsqr8x_do +.Lmul4x_do: + slwi $num,$num,`log($SIZE_T)/log(2)` + mr $a0,$sp + li $a1,-32*$SIZE_T + sub $a1,$a1,$num + $STUX $sp,$sp,$a1 # alloca + + $PUSH r14,-$SIZE_T*18($a0) + $PUSH r15,-$SIZE_T*17($a0) + $PUSH r16,-$SIZE_T*16($a0) + $PUSH r17,-$SIZE_T*15($a0) + $PUSH r18,-$SIZE_T*14($a0) + $PUSH r19,-$SIZE_T*13($a0) + $PUSH r20,-$SIZE_T*12($a0) + $PUSH r21,-$SIZE_T*11($a0) + $PUSH r22,-$SIZE_T*10($a0) + $PUSH r23,-$SIZE_T*9($a0) + $PUSH r24,-$SIZE_T*8($a0) + $PUSH r25,-$SIZE_T*7($a0) + $PUSH r26,-$SIZE_T*6($a0) + $PUSH r27,-$SIZE_T*5($a0) + $PUSH r28,-$SIZE_T*4($a0) + $PUSH r29,-$SIZE_T*3($a0) + $PUSH r30,-$SIZE_T*2($a0) + $PUSH r31,-$SIZE_T*1($a0) + + subi $ap,$ap,$SIZE_T # bias by -1 + subi $np,$np,$SIZE_T # bias by -1 + subi $rp,$rp,$SIZE_T # bias by -1 + $LD $n0,0($n0) # *n0 + + add $t0,$bp,$num + add $ap_end,$ap,$num + subi $t0,$t0,$SIZE_T*4 # &b[num-4] + + $LD $bi,$SIZE_T*0($bp) # b[0] + li $acc0,0 + $LD $a0,$SIZE_T*1($ap) # a[0..3] + li $acc1,0 + $LD $a1,$SIZE_T*2($ap) + li $acc2,0 + $LD $a2,$SIZE_T*3($ap) + li $acc3,0 + $LDU $a3,$SIZE_T*4($ap) + $LD $m0,$SIZE_T*1($np) # n[0..3] + $LD $m1,$SIZE_T*2($np) + $LD $m2,$SIZE_T*3($np) + $LDU $m3,$SIZE_T*4($np) + + $PUSH $rp,$SIZE_T*6($sp) # offload rp and &b[num-4] + $PUSH $t0,$SIZE_T*7($sp) + li $carry,0 + addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit + li $cnt,0 + li $zero,0 + b .Loop_mul4x_1st_reduction + +.align 5 +.Loop_mul4x_1st_reduction: + $UMULL $t0,$a0,$bi # lo(a[0..3]*b[0]) + addze $carry,$carry # modulo-scheduled + $UMULL $t1,$a1,$bi + addi $cnt,$cnt,$SIZE_T + $UMULL $t2,$a2,$bi + andi. $cnt,$cnt,$SIZE_T*4-1 + $UMULL $t3,$a3,$bi + addc $acc0,$acc0,$t0 + $UMULH $t0,$a0,$bi # hi(a[0..3]*b[0]) + adde $acc1,$acc1,$t1 + $UMULH $t1,$a1,$bi + adde $acc2,$acc2,$t2 + $UMULL $mi,$acc0,$n0 # t[0]*n0 + adde $acc3,$acc3,$t3 + $UMULH $t2,$a2,$bi + addze $acc4,$zero + $UMULH $t3,$a3,$bi + $LDX $bi,$bp,$cnt # next b[i] (or b[0]) + addc $acc1,$acc1,$t0 + # (*) mul $t0,$m0,$mi # lo(n[0..3]*t[0]*n0) + $STU $mi,$SIZE_T($tp) # put aside t[0]*n0 for tail processing + adde $acc2,$acc2,$t1 + $UMULL $t1,$m1,$mi + adde $acc3,$acc3,$t2 + $UMULL $t2,$m2,$mi + adde $acc4,$acc4,$t3 # can't overflow + $UMULL $t3,$m3,$mi + # (*) addc $acc0,$acc0,$t0 + # (*) As for removal of first multiplication and addition + # instructions. The outcome of first addition is + # guaranteed to be zero, which leaves two computationally + # significant outcomes: it either carries or not. Then + # question is when does it carry? Is there alternative + # way to deduce it? If you follow operations, you can + # observe that condition for carry is quite simple: + # $acc0 being non-zero. So that carry can be calculated + # by adding -1 to $acc0. That's what next instruction does. 
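
In C terms the trick above reads: acc0 + lo(m*n[0]) is 0 mod 2^64 by construction of m, so the discarded addition can only either carry or not, and it carries exactly when acc0 is non-zero; that in turn is the carry-out of acc0 + (2^64 - 1), which is what the addic of -1 below computes. Illustrative model:

    #include <stdint.h>

    /* Carry the discarded "addc acc0, acc0, lo(m*n[0])" would have set:
     * the sum is 0 mod 2^64, so it carries iff acc0 != 0, i.e. iff
     * acc0 + (2^64 - 1) carries. */
    static inline uint64_t carry_from_zeroing_add(uint64_t acc0)
    {
        return (uint64_t)(((unsigned __int128)acc0 + UINT64_MAX) >> 64);
    }

This removes one multiplication and one addition from the critical path of every reduction step.
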
+ addic $acc0,$acc0,-1 # (*), discarded + $UMULH $t0,$m0,$mi # hi(n[0..3]*t[0]*n0) + adde $acc0,$acc1,$t1 + $UMULH $t1,$m1,$mi + adde $acc1,$acc2,$t2 + $UMULH $t2,$m2,$mi + adde $acc2,$acc3,$t3 + $UMULH $t3,$m3,$mi + adde $acc3,$acc4,$carry + addze $carry,$zero + addc $acc0,$acc0,$t0 + adde $acc1,$acc1,$t1 + adde $acc2,$acc2,$t2 + adde $acc3,$acc3,$t3 + #addze $carry,$carry + bne .Loop_mul4x_1st_reduction + + $UCMP $ap_end,$ap + beq .Lmul4x4_post_condition + + $LD $a0,$SIZE_T*1($ap) # a[4..7] + $LD $a1,$SIZE_T*2($ap) + $LD $a2,$SIZE_T*3($ap) + $LDU $a3,$SIZE_T*4($ap) + $LD $mi,$SIZE_T*8($sp) # a[0]*n0 + $LD $m0,$SIZE_T*1($np) # n[4..7] + $LD $m1,$SIZE_T*2($np) + $LD $m2,$SIZE_T*3($np) + $LDU $m3,$SIZE_T*4($np) + b .Loop_mul4x_1st_tail + +.align 5 +.Loop_mul4x_1st_tail: + $UMULL $t0,$a0,$bi # lo(a[4..7]*b[i]) + addze $carry,$carry # modulo-scheduled + $UMULL $t1,$a1,$bi + addi $cnt,$cnt,$SIZE_T + $UMULL $t2,$a2,$bi + andi. $cnt,$cnt,$SIZE_T*4-1 + $UMULL $t3,$a3,$bi + addc $acc0,$acc0,$t0 + $UMULH $t0,$a0,$bi # hi(a[4..7]*b[i]) + adde $acc1,$acc1,$t1 + $UMULH $t1,$a1,$bi + adde $acc2,$acc2,$t2 + $UMULH $t2,$a2,$bi + adde $acc3,$acc3,$t3 + $UMULH $t3,$a3,$bi + addze $acc4,$zero + $LDX $bi,$bp,$cnt # next b[i] (or b[0]) + addc $acc1,$acc1,$t0 + $UMULL $t0,$m0,$mi # lo(n[4..7]*a[0]*n0) + adde $acc2,$acc2,$t1 + $UMULL $t1,$m1,$mi + adde $acc3,$acc3,$t2 + $UMULL $t2,$m2,$mi + adde $acc4,$acc4,$t3 # can't overflow + $UMULL $t3,$m3,$mi + addc $acc0,$acc0,$t0 + $UMULH $t0,$m0,$mi # hi(n[4..7]*a[0]*n0) + adde $acc1,$acc1,$t1 + $UMULH $t1,$m1,$mi + adde $acc2,$acc2,$t2 + $UMULH $t2,$m2,$mi + adde $acc3,$acc3,$t3 + adde $acc4,$acc4,$carry + $UMULH $t3,$m3,$mi + addze $carry,$zero + addi $mi,$sp,$SIZE_T*8 + $LDX $mi,$mi,$cnt # next t[0]*n0 + $STU $acc0,$SIZE_T($tp) # word of result + addc $acc0,$acc1,$t0 + adde $acc1,$acc2,$t1 + adde $acc2,$acc3,$t2 + adde $acc3,$acc4,$t3 + #addze $carry,$carry + bne .Loop_mul4x_1st_tail + + sub $t1,$ap_end,$num # rewinded $ap + $UCMP $ap_end,$ap # done yet? + beq .Lmul4x_proceed + + $LD $a0,$SIZE_T*1($ap) + $LD $a1,$SIZE_T*2($ap) + $LD $a2,$SIZE_T*3($ap) + $LDU $a3,$SIZE_T*4($ap) + $LD $m0,$SIZE_T*1($np) + $LD $m1,$SIZE_T*2($np) + $LD $m2,$SIZE_T*3($np) + $LDU $m3,$SIZE_T*4($np) + b .Loop_mul4x_1st_tail + +.align 5 +.Lmul4x_proceed: + $LDU $bi,$SIZE_T*4($bp) # *++b + addze $carry,$carry # topmost carry + $LD $a0,$SIZE_T*1($t1) + $LD $a1,$SIZE_T*2($t1) + $LD $a2,$SIZE_T*3($t1) + $LD $a3,$SIZE_T*4($t1) + addi $ap,$t1,$SIZE_T*4 + sub $np,$np,$num # rewind np + + $ST $acc0,$SIZE_T*1($tp) # result + $ST $acc1,$SIZE_T*2($tp) + $ST $acc2,$SIZE_T*3($tp) + $ST $acc3,$SIZE_T*4($tp) + $ST $carry,$SIZE_T*5($tp) # save topmost carry + $LD $acc0,$SIZE_T*12($sp) # t[0..3] + $LD $acc1,$SIZE_T*13($sp) + $LD $acc2,$SIZE_T*14($sp) + $LD $acc3,$SIZE_T*15($sp) + + $LD $m0,$SIZE_T*1($np) # n[0..3] + $LD $m1,$SIZE_T*2($np) + $LD $m2,$SIZE_T*3($np) + $LDU $m3,$SIZE_T*4($np) + addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit + li $carry,0 + b .Loop_mul4x_reduction + +.align 5 +.Loop_mul4x_reduction: + $UMULL $t0,$a0,$bi # lo(a[0..3]*b[4]) + addze $carry,$carry # modulo-scheduled + $UMULL $t1,$a1,$bi + addi $cnt,$cnt,$SIZE_T + $UMULL $t2,$a2,$bi + andi. 
$cnt,$cnt,$SIZE_T*4-1 + $UMULL $t3,$a3,$bi + addc $acc0,$acc0,$t0 + $UMULH $t0,$a0,$bi # hi(a[0..3]*b[4]) + adde $acc1,$acc1,$t1 + $UMULH $t1,$a1,$bi + adde $acc2,$acc2,$t2 + $UMULL $mi,$acc0,$n0 # t[0]*n0 + adde $acc3,$acc3,$t3 + $UMULH $t2,$a2,$bi + addze $acc4,$zero + $UMULH $t3,$a3,$bi + $LDX $bi,$bp,$cnt # next b[i] + addc $acc1,$acc1,$t0 + # (*) mul $t0,$m0,$mi + $STU $mi,$SIZE_T($tp) # put aside t[0]*n0 for tail processing + adde $acc2,$acc2,$t1 + $UMULL $t1,$m1,$mi # lo(n[0..3]*t[0]*n0 + adde $acc3,$acc3,$t2 + $UMULL $t2,$m2,$mi + adde $acc4,$acc4,$t3 # can't overflow + $UMULL $t3,$m3,$mi + # (*) addc $acc0,$acc0,$t0 + addic $acc0,$acc0,-1 # (*), discarded + $UMULH $t0,$m0,$mi # hi(n[0..3]*t[0]*n0 + adde $acc0,$acc1,$t1 + $UMULH $t1,$m1,$mi + adde $acc1,$acc2,$t2 + $UMULH $t2,$m2,$mi + adde $acc2,$acc3,$t3 + $UMULH $t3,$m3,$mi + adde $acc3,$acc4,$carry + addze $carry,$zero + addc $acc0,$acc0,$t0 + adde $acc1,$acc1,$t1 + adde $acc2,$acc2,$t2 + adde $acc3,$acc3,$t3 + #addze $carry,$carry + bne .Loop_mul4x_reduction + + $LD $t0,$SIZE_T*5($tp) # t[4..7] + addze $carry,$carry + $LD $t1,$SIZE_T*6($tp) + $LD $t2,$SIZE_T*7($tp) + $LD $t3,$SIZE_T*8($tp) + $LD $a0,$SIZE_T*1($ap) # a[4..7] + $LD $a1,$SIZE_T*2($ap) + $LD $a2,$SIZE_T*3($ap) + $LDU $a3,$SIZE_T*4($ap) + addc $acc0,$acc0,$t0 + adde $acc1,$acc1,$t1 + adde $acc2,$acc2,$t2 + adde $acc3,$acc3,$t3 + #addze $carry,$carry + + $LD $mi,$SIZE_T*8($sp) # t[0]*n0 + $LD $m0,$SIZE_T*1($np) # n[4..7] + $LD $m1,$SIZE_T*2($np) + $LD $m2,$SIZE_T*3($np) + $LDU $m3,$SIZE_T*4($np) + b .Loop_mul4x_tail + +.align 5 +.Loop_mul4x_tail: + $UMULL $t0,$a0,$bi # lo(a[4..7]*b[4]) + addze $carry,$carry # modulo-scheduled + $UMULL $t1,$a1,$bi + addi $cnt,$cnt,$SIZE_T + $UMULL $t2,$a2,$bi + andi. $cnt,$cnt,$SIZE_T*4-1 + $UMULL $t3,$a3,$bi + addc $acc0,$acc0,$t0 + $UMULH $t0,$a0,$bi # hi(a[4..7]*b[4]) + adde $acc1,$acc1,$t1 + $UMULH $t1,$a1,$bi + adde $acc2,$acc2,$t2 + $UMULH $t2,$a2,$bi + adde $acc3,$acc3,$t3 + $UMULH $t3,$a3,$bi + addze $acc4,$zero + $LDX $bi,$bp,$cnt # next b[i] + addc $acc1,$acc1,$t0 + $UMULL $t0,$m0,$mi # lo(n[4..7]*t[0]*n0) + adde $acc2,$acc2,$t1 + $UMULL $t1,$m1,$mi + adde $acc3,$acc3,$t2 + $UMULL $t2,$m2,$mi + adde $acc4,$acc4,$t3 # can't overflow + $UMULL $t3,$m3,$mi + addc $acc0,$acc0,$t0 + $UMULH $t0,$m0,$mi # hi(n[4..7]*t[0]*n0) + adde $acc1,$acc1,$t1 + $UMULH $t1,$m1,$mi + adde $acc2,$acc2,$t2 + $UMULH $t2,$m2,$mi + adde $acc3,$acc3,$t3 + $UMULH $t3,$m3,$mi + adde $acc4,$acc4,$carry + addi $mi,$sp,$SIZE_T*8 + $LDX $mi,$mi,$cnt # next a[0]*n0 + addze $carry,$zero + $STU $acc0,$SIZE_T($tp) # word of result + addc $acc0,$acc1,$t0 + adde $acc1,$acc2,$t1 + adde $acc2,$acc3,$t2 + adde $acc3,$acc4,$t3 + #addze $carry,$carry + bne .Loop_mul4x_tail + + $LD $t0,$SIZE_T*5($tp) # next t[i] or topmost carry + sub $t1,$np,$num # rewinded np? + addze $carry,$carry + $UCMP $ap_end,$ap # done yet? 
+ beq .Loop_mul4x_break + + $LD $t1,$SIZE_T*6($tp) + $LD $t2,$SIZE_T*7($tp) + $LD $t3,$SIZE_T*8($tp) + $LD $a0,$SIZE_T*1($ap) + $LD $a1,$SIZE_T*2($ap) + $LD $a2,$SIZE_T*3($ap) + $LDU $a3,$SIZE_T*4($ap) + addc $acc0,$acc0,$t0 + adde $acc1,$acc1,$t1 + adde $acc2,$acc2,$t2 + adde $acc3,$acc3,$t3 + #addze $carry,$carry + + $LD $m0,$SIZE_T*1($np) # n[4..7] + $LD $m1,$SIZE_T*2($np) + $LD $m2,$SIZE_T*3($np) + $LDU $m3,$SIZE_T*4($np) + b .Loop_mul4x_tail + +.align 5 +.Loop_mul4x_break: + $POP $t2,$SIZE_T*6($sp) # pull rp and &b[num-4] + $POP $t3,$SIZE_T*7($sp) + addc $a0,$acc0,$t0 # accumulate topmost carry + $LD $acc0,$SIZE_T*12($sp) # t[0..3] + addze $a1,$acc1 + $LD $acc1,$SIZE_T*13($sp) + addze $a2,$acc2 + $LD $acc2,$SIZE_T*14($sp) + addze $a3,$acc3 + $LD $acc3,$SIZE_T*15($sp) + addze $carry,$carry # topmost carry + $ST $a0,$SIZE_T*1($tp) # result + sub $ap,$ap_end,$num # rewind ap + $ST $a1,$SIZE_T*2($tp) + $ST $a2,$SIZE_T*3($tp) + $ST $a3,$SIZE_T*4($tp) + $ST $carry,$SIZE_T*5($tp) # store topmost carry + + $LD $m0,$SIZE_T*1($t1) # n[0..3] + $LD $m1,$SIZE_T*2($t1) + $LD $m2,$SIZE_T*3($t1) + $LD $m3,$SIZE_T*4($t1) + addi $np,$t1,$SIZE_T*4 + $UCMP $bp,$t3 # done yet? + beq .Lmul4x_post + + $LDU $bi,$SIZE_T*4($bp) + $LD $a0,$SIZE_T*1($ap) # a[0..3] + $LD $a1,$SIZE_T*2($ap) + $LD $a2,$SIZE_T*3($ap) + $LDU $a3,$SIZE_T*4($ap) + li $carry,0 + addic $tp,$sp,$SIZE_T*7 # &t[-1], clear carry bit + b .Loop_mul4x_reduction + +.align 5 +.Lmul4x_post: + # Final step. We see if result is larger than modulus, and + # if it is, subtract the modulus. But comparison implies + # subtraction. So we subtract modulus, see if it borrowed, + # and conditionally copy original value. + srwi $cnt,$num,`log($SIZE_T)/log(2)+2` + mr $bp,$t2 # &rp[-1] + subi $cnt,$cnt,1 + mr $ap_end,$t2 # &rp[-1] copy + subfc $t0,$m0,$acc0 + addi $tp,$sp,$SIZE_T*15 + subfe $t1,$m1,$acc1 + + mtctr $cnt +.Lmul4x_sub: + $LD $m0,$SIZE_T*1($np) + $LD $acc0,$SIZE_T*1($tp) + subfe $t2,$m2,$acc2 + $LD $m1,$SIZE_T*2($np) + $LD $acc1,$SIZE_T*2($tp) + subfe $t3,$m3,$acc3 + $LD $m2,$SIZE_T*3($np) + $LD $acc2,$SIZE_T*3($tp) + $LDU $m3,$SIZE_T*4($np) + $LDU $acc3,$SIZE_T*4($tp) + $ST $t0,$SIZE_T*1($bp) + $ST $t1,$SIZE_T*2($bp) + subfe $t0,$m0,$acc0 + $ST $t2,$SIZE_T*3($bp) + $STU $t3,$SIZE_T*4($bp) + subfe $t1,$m1,$acc1 + bdnz .Lmul4x_sub + + $LD $a0,$SIZE_T*1($ap_end) + $ST $t0,$SIZE_T*1($bp) + $LD $t0,$SIZE_T*12($sp) + subfe $t2,$m2,$acc2 + $LD $a1,$SIZE_T*2($ap_end) + $ST $t1,$SIZE_T*2($bp) + $LD $t1,$SIZE_T*13($sp) + subfe $t3,$m3,$acc3 + subfe $carry,$zero,$carry # did it borrow? 
+ addi $tp,$sp,$SIZE_T*12 + $LD $a2,$SIZE_T*3($ap_end) + $ST $t2,$SIZE_T*3($bp) + $LD $t2,$SIZE_T*14($sp) + $LD $a3,$SIZE_T*4($ap_end) + $ST $t3,$SIZE_T*4($bp) + $LD $t3,$SIZE_T*15($sp) + + mtctr $cnt +.Lmul4x_cond_copy: + and $t0,$t0,$carry + andc $a0,$a0,$carry + $ST $zero,$SIZE_T*0($tp) # wipe stack clean + and $t1,$t1,$carry + andc $a1,$a1,$carry + $ST $zero,$SIZE_T*1($tp) + and $t2,$t2,$carry + andc $a2,$a2,$carry + $ST $zero,$SIZE_T*2($tp) + and $t3,$t3,$carry + andc $a3,$a3,$carry + $ST $zero,$SIZE_T*3($tp) + or $acc0,$t0,$a0 + $LD $a0,$SIZE_T*5($ap_end) + $LD $t0,$SIZE_T*4($tp) + or $acc1,$t1,$a1 + $LD $a1,$SIZE_T*6($ap_end) + $LD $t1,$SIZE_T*5($tp) + or $acc2,$t2,$a2 + $LD $a2,$SIZE_T*7($ap_end) + $LD $t2,$SIZE_T*6($tp) + or $acc3,$t3,$a3 + $LD $a3,$SIZE_T*8($ap_end) + $LD $t3,$SIZE_T*7($tp) + addi $tp,$tp,$SIZE_T*4 + $ST $acc0,$SIZE_T*1($ap_end) + $ST $acc1,$SIZE_T*2($ap_end) + $ST $acc2,$SIZE_T*3($ap_end) + $STU $acc3,$SIZE_T*4($ap_end) + bdnz .Lmul4x_cond_copy + + $POP $bp,0($sp) # pull saved sp + and $t0,$t0,$carry + andc $a0,$a0,$carry + $ST $zero,$SIZE_T*0($tp) + and $t1,$t1,$carry + andc $a1,$a1,$carry + $ST $zero,$SIZE_T*1($tp) + and $t2,$t2,$carry + andc $a2,$a2,$carry + $ST $zero,$SIZE_T*2($tp) + and $t3,$t3,$carry + andc $a3,$a3,$carry + $ST $zero,$SIZE_T*3($tp) + or $acc0,$t0,$a0 + or $acc1,$t1,$a1 + $ST $zero,$SIZE_T*4($tp) + or $acc2,$t2,$a2 + or $acc3,$t3,$a3 + $ST $acc0,$SIZE_T*1($ap_end) + $ST $acc1,$SIZE_T*2($ap_end) + $ST $acc2,$SIZE_T*3($ap_end) + $ST $acc3,$SIZE_T*4($ap_end) + + b .Lmul4x_done + +.align 4 +.Lmul4x4_post_condition: + $POP $ap,$SIZE_T*6($sp) # pull &rp[-1] + $POP $bp,0($sp) # pull saved sp + addze $carry,$carry # modulo-scheduled + # $acc0-3,$carry hold result, $m0-3 hold modulus + subfc $a0,$m0,$acc0 + subfe $a1,$m1,$acc1 + subfe $a2,$m2,$acc2 + subfe $a3,$m3,$acc3 + subfe $carry,$zero,$carry # did it borrow? + + and $m0,$m0,$carry + and $m1,$m1,$carry + addc $a0,$a0,$m0 + and $m2,$m2,$carry + adde $a1,$a1,$m1 + and $m3,$m3,$carry + adde $a2,$a2,$m2 + adde $a3,$a3,$m3 + + $ST $a0,$SIZE_T*1($ap) # write result + $ST $a1,$SIZE_T*2($ap) + $ST $a2,$SIZE_T*3($ap) + $ST $a3,$SIZE_T*4($ap) + +.Lmul4x_done: + $ST $zero,$SIZE_T*8($sp) # wipe stack clean + $ST $zero,$SIZE_T*9($sp) + $ST $zero,$SIZE_T*10($sp) + $ST $zero,$SIZE_T*11($sp) + li r3,1 # signal "done" + $POP r14,-$SIZE_T*18($bp) + $POP r15,-$SIZE_T*17($bp) + $POP r16,-$SIZE_T*16($bp) + $POP r17,-$SIZE_T*15($bp) + $POP r18,-$SIZE_T*14($bp) + $POP r19,-$SIZE_T*13($bp) + $POP r20,-$SIZE_T*12($bp) + $POP r21,-$SIZE_T*11($bp) + $POP r22,-$SIZE_T*10($bp) + $POP r23,-$SIZE_T*9($bp) + $POP r24,-$SIZE_T*8($bp) + $POP r25,-$SIZE_T*7($bp) + $POP r26,-$SIZE_T*6($bp) + $POP r27,-$SIZE_T*5($bp) + $POP r28,-$SIZE_T*4($bp) + $POP r29,-$SIZE_T*3($bp) + $POP r30,-$SIZE_T*2($bp) + $POP r31,-$SIZE_T*1($bp) + mr $sp,$bp + blr + .long 0 + .byte 0,12,4,0x20,0x80,18,6,0 + .long 0 +.size .bn_mul4x_mont_int,.-.bn_mul4x_mont_int +___ +} + +if (1) { +######################################################################## +# Following is PPC adaptation of sqrx8x_mont from x86_64-mont5 module. + +my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("r$_",(9..12,14..17)); +my ($t0,$t1,$t2,$t3)=map("r$_",(18..21)); +my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("r$_",(22..29)); +my ($cnt,$carry,$zero)=("r30","r31","r0"); +my ($tp,$ap_end,$na0)=($bp,$np,$carry); + +# sp----------->+-------------------------------+ +# | saved sp | +# +-------------------------------+ +# . . 
+# +12*size_t +-------------------------------+ +# | size_t tmp[2*num] | +# . . +# . . +# . . +# +-------------------------------+ +# . . +# -18*size_t +-------------------------------+ +# | 18 saved gpr, r14-r31 | +# . . +# . . +# +-------------------------------+ +$code.=<<___; +.align 5 +__bn_sqr8x_mont: +.Lsqr8x_do: + mr $a0,$sp + slwi $a1,$num,`log($SIZE_T)/log(2)+1` + li $a2,-32*$SIZE_T + sub $a1,$a2,$a1 + slwi $num,$num,`log($SIZE_T)/log(2)` + $STUX $sp,$sp,$a1 # alloca + + $PUSH r14,-$SIZE_T*18($a0) + $PUSH r15,-$SIZE_T*17($a0) + $PUSH r16,-$SIZE_T*16($a0) + $PUSH r17,-$SIZE_T*15($a0) + $PUSH r18,-$SIZE_T*14($a0) + $PUSH r19,-$SIZE_T*13($a0) + $PUSH r20,-$SIZE_T*12($a0) + $PUSH r21,-$SIZE_T*11($a0) + $PUSH r22,-$SIZE_T*10($a0) + $PUSH r23,-$SIZE_T*9($a0) + $PUSH r24,-$SIZE_T*8($a0) + $PUSH r25,-$SIZE_T*7($a0) + $PUSH r26,-$SIZE_T*6($a0) + $PUSH r27,-$SIZE_T*5($a0) + $PUSH r28,-$SIZE_T*4($a0) + $PUSH r29,-$SIZE_T*3($a0) + $PUSH r30,-$SIZE_T*2($a0) + $PUSH r31,-$SIZE_T*1($a0) + + subi $ap,$ap,$SIZE_T # bias by -1 + subi $t0,$np,$SIZE_T # bias by -1 + subi $rp,$rp,$SIZE_T # bias by -1 + $LD $n0,0($n0) # *n0 + li $zero,0 + + add $ap_end,$ap,$num + $LD $a0,$SIZE_T*1($ap) + #li $acc0,0 + $LD $a1,$SIZE_T*2($ap) + li $acc1,0 + $LD $a2,$SIZE_T*3($ap) + li $acc2,0 + $LD $a3,$SIZE_T*4($ap) + li $acc3,0 + $LD $a4,$SIZE_T*5($ap) + li $acc4,0 + $LD $a5,$SIZE_T*6($ap) + li $acc5,0 + $LD $a6,$SIZE_T*7($ap) + li $acc6,0 + $LDU $a7,$SIZE_T*8($ap) + li $acc7,0 + + addi $tp,$sp,$SIZE_T*11 # &tp[-1] + subic. $cnt,$num,$SIZE_T*8 + b .Lsqr8x_zero_start + +.align 5 +.Lsqr8x_zero: + subic. $cnt,$cnt,$SIZE_T*8 + $ST $zero,$SIZE_T*1($tp) + $ST $zero,$SIZE_T*2($tp) + $ST $zero,$SIZE_T*3($tp) + $ST $zero,$SIZE_T*4($tp) + $ST $zero,$SIZE_T*5($tp) + $ST $zero,$SIZE_T*6($tp) + $ST $zero,$SIZE_T*7($tp) + $ST $zero,$SIZE_T*8($tp) +.Lsqr8x_zero_start: + $ST $zero,$SIZE_T*9($tp) + $ST $zero,$SIZE_T*10($tp) + $ST $zero,$SIZE_T*11($tp) + $ST $zero,$SIZE_T*12($tp) + $ST $zero,$SIZE_T*13($tp) + $ST $zero,$SIZE_T*14($tp) + $ST $zero,$SIZE_T*15($tp) + $STU $zero,$SIZE_T*16($tp) + bne .Lsqr8x_zero + + $PUSH $rp,$SIZE_T*6($sp) # offload &rp[-1] + $PUSH $t0,$SIZE_T*7($sp) # offload &np[-1] + $PUSH $n0,$SIZE_T*8($sp) # offload n0 + $PUSH $tp,$SIZE_T*9($sp) # &tp[2*num-1] + $PUSH $zero,$SIZE_T*10($sp) # initial top-most carry + addi $tp,$sp,$SIZE_T*11 # &tp[-1] + + # Multiply everything but a[i]*a[i] +.align 5 +.Lsqr8x_outer_loop: + # a[1]a[0] (i) + # a[2]a[0] + # a[3]a[0] + # a[4]a[0] + # a[5]a[0] + # a[6]a[0] + # a[7]a[0] + # a[2]a[1] (ii) + # a[3]a[1] + # a[4]a[1] + # a[5]a[1] + # a[6]a[1] + # a[7]a[1] + # a[3]a[2] (iii) + # a[4]a[2] + # a[5]a[2] + # a[6]a[2] + # a[7]a[2] + # a[4]a[3] (iv) + # a[5]a[3] + # a[6]a[3] + # a[7]a[3] + # a[5]a[4] (v) + # a[6]a[4] + # a[7]a[4] + # a[6]a[5] (vi) + # a[7]a[5] + # a[7]a[6] (vii) + + $UMULL $t0,$a1,$a0 # lo(a[1..7]*a[0]) (i) + $UMULL $t1,$a2,$a0 + $UMULL $t2,$a3,$a0 + $UMULL $t3,$a4,$a0 + addc $acc1,$acc1,$t0 # t[1]+lo(a[1]*a[0]) + $UMULL $t0,$a5,$a0 + adde $acc2,$acc2,$t1 + $UMULL $t1,$a6,$a0 + adde $acc3,$acc3,$t2 + $UMULL $t2,$a7,$a0 + adde $acc4,$acc4,$t3 + $UMULH $t3,$a1,$a0 # hi(a[1..7]*a[0]) + adde $acc5,$acc5,$t0 + $UMULH $t0,$a2,$a0 + adde $acc6,$acc6,$t1 + $UMULH $t1,$a3,$a0 + adde $acc7,$acc7,$t2 + $UMULH $t2,$a4,$a0 + $ST $acc0,$SIZE_T*1($tp) # t[0] + addze $acc0,$zero # t[8] + $ST $acc1,$SIZE_T*2($tp) # t[1] + addc $acc2,$acc2,$t3 # t[2]+lo(a[1]*a[0]) + $UMULH $t3,$a5,$a0 + adde $acc3,$acc3,$t0 + $UMULH $t0,$a6,$a0 + adde $acc4,$acc4,$t1 + $UMULH $t1,$a7,$a0 + adde 
$acc5,$acc5,$t2 + $UMULL $t2,$a2,$a1 # lo(a[2..7]*a[1]) (ii) + adde $acc6,$acc6,$t3 + $UMULL $t3,$a3,$a1 + adde $acc7,$acc7,$t0 + $UMULL $t0,$a4,$a1 + adde $acc0,$acc0,$t1 + + $UMULL $t1,$a5,$a1 + addc $acc3,$acc3,$t2 + $UMULL $t2,$a6,$a1 + adde $acc4,$acc4,$t3 + $UMULL $t3,$a7,$a1 + adde $acc5,$acc5,$t0 + $UMULH $t0,$a2,$a1 # hi(a[2..7]*a[1]) + adde $acc6,$acc6,$t1 + $UMULH $t1,$a3,$a1 + adde $acc7,$acc7,$t2 + $UMULH $t2,$a4,$a1 + adde $acc0,$acc0,$t3 + $UMULH $t3,$a5,$a1 + $ST $acc2,$SIZE_T*3($tp) # t[2] + addze $acc1,$zero # t[9] + $ST $acc3,$SIZE_T*4($tp) # t[3] + addc $acc4,$acc4,$t0 + $UMULH $t0,$a6,$a1 + adde $acc5,$acc5,$t1 + $UMULH $t1,$a7,$a1 + adde $acc6,$acc6,$t2 + $UMULL $t2,$a3,$a2 # lo(a[3..7]*a[2]) (iii) + adde $acc7,$acc7,$t3 + $UMULL $t3,$a4,$a2 + adde $acc0,$acc0,$t0 + $UMULL $t0,$a5,$a2 + adde $acc1,$acc1,$t1 + + $UMULL $t1,$a6,$a2 + addc $acc5,$acc5,$t2 + $UMULL $t2,$a7,$a2 + adde $acc6,$acc6,$t3 + $UMULH $t3,$a3,$a2 # hi(a[3..7]*a[2]) + adde $acc7,$acc7,$t0 + $UMULH $t0,$a4,$a2 + adde $acc0,$acc0,$t1 + $UMULH $t1,$a5,$a2 + adde $acc1,$acc1,$t2 + $UMULH $t2,$a6,$a2 + $ST $acc4,$SIZE_T*5($tp) # t[4] + addze $acc2,$zero # t[10] + $ST $acc5,$SIZE_T*6($tp) # t[5] + addc $acc6,$acc6,$t3 + $UMULH $t3,$a7,$a2 + adde $acc7,$acc7,$t0 + $UMULL $t0,$a4,$a3 # lo(a[4..7]*a[3]) (iv) + adde $acc0,$acc0,$t1 + $UMULL $t1,$a5,$a3 + adde $acc1,$acc1,$t2 + $UMULL $t2,$a6,$a3 + adde $acc2,$acc2,$t3 + + $UMULL $t3,$a7,$a3 + addc $acc7,$acc7,$t0 + $UMULH $t0,$a4,$a3 # hi(a[4..7]*a[3]) + adde $acc0,$acc0,$t1 + $UMULH $t1,$a5,$a3 + adde $acc1,$acc1,$t2 + $UMULH $t2,$a6,$a3 + adde $acc2,$acc2,$t3 + $UMULH $t3,$a7,$a3 + $ST $acc6,$SIZE_T*7($tp) # t[6] + addze $acc3,$zero # t[11] + $STU $acc7,$SIZE_T*8($tp) # t[7] + addc $acc0,$acc0,$t0 + $UMULL $t0,$a5,$a4 # lo(a[5..7]*a[4]) (v) + adde $acc1,$acc1,$t1 + $UMULL $t1,$a6,$a4 + adde $acc2,$acc2,$t2 + $UMULL $t2,$a7,$a4 + adde $acc3,$acc3,$t3 + + $UMULH $t3,$a5,$a4 # hi(a[5..7]*a[4]) + addc $acc1,$acc1,$t0 + $UMULH $t0,$a6,$a4 + adde $acc2,$acc2,$t1 + $UMULH $t1,$a7,$a4 + adde $acc3,$acc3,$t2 + $UMULL $t2,$a6,$a5 # lo(a[6..7]*a[5]) (vi) + addze $acc4,$zero # t[12] + addc $acc2,$acc2,$t3 + $UMULL $t3,$a7,$a5 + adde $acc3,$acc3,$t0 + $UMULH $t0,$a6,$a5 # hi(a[6..7]*a[5]) + adde $acc4,$acc4,$t1 + + $UMULH $t1,$a7,$a5 + addc $acc3,$acc3,$t2 + $UMULL $t2,$a7,$a6 # lo(a[7]*a[6]) (vii) + adde $acc4,$acc4,$t3 + $UMULH $t3,$a7,$a6 # hi(a[7]*a[6]) + addze $acc5,$zero # t[13] + addc $acc4,$acc4,$t0 + $UCMP $ap_end,$ap # done yet? + adde $acc5,$acc5,$t1 + + addc $acc5,$acc5,$t2 + sub $t0,$ap_end,$num # rewinded ap + addze $acc6,$zero # t[14] + add $acc6,$acc6,$t3 + + beq .Lsqr8x_outer_break + + mr $n0,$a0 + $LD $a0,$SIZE_T*1($tp) + $LD $a1,$SIZE_T*2($tp) + $LD $a2,$SIZE_T*3($tp) + $LD $a3,$SIZE_T*4($tp) + $LD $a4,$SIZE_T*5($tp) + $LD $a5,$SIZE_T*6($tp) + $LD $a6,$SIZE_T*7($tp) + $LD $a7,$SIZE_T*8($tp) + addc $acc0,$acc0,$a0 + $LD $a0,$SIZE_T*1($ap) + adde $acc1,$acc1,$a1 + $LD $a1,$SIZE_T*2($ap) + adde $acc2,$acc2,$a2 + $LD $a2,$SIZE_T*3($ap) + adde $acc3,$acc3,$a3 + $LD $a3,$SIZE_T*4($ap) + adde $acc4,$acc4,$a4 + $LD $a4,$SIZE_T*5($ap) + adde $acc5,$acc5,$a5 + $LD $a5,$SIZE_T*6($ap) + adde $acc6,$acc6,$a6 + $LD $a6,$SIZE_T*7($ap) + subi $rp,$ap,$SIZE_T*7 + addze $acc7,$a7 + $LDU $a7,$SIZE_T*8($ap) + #addze $carry,$zero # moved below + li $cnt,0 + b .Lsqr8x_mul + + # a[8]a[0] + # a[9]a[0] + # a[a]a[0] + # a[b]a[0] + # a[c]a[0] + # a[d]a[0] + # a[e]a[0] + # a[f]a[0] + # a[8]a[1] + # a[f]a[1]........................ 
+ # a[8]a[2] + # a[f]a[2]........................ + # a[8]a[3] + # a[f]a[3]........................ + # a[8]a[4] + # a[f]a[4]........................ + # a[8]a[5] + # a[f]a[5]........................ + # a[8]a[6] + # a[f]a[6]........................ + # a[8]a[7] + # a[f]a[7]........................ +.align 5 +.Lsqr8x_mul: + $UMULL $t0,$a0,$n0 + addze $carry,$zero # carry bit, modulo-scheduled + $UMULL $t1,$a1,$n0 + addi $cnt,$cnt,$SIZE_T + $UMULL $t2,$a2,$n0 + andi. $cnt,$cnt,$SIZE_T*8-1 + $UMULL $t3,$a3,$n0 + addc $acc0,$acc0,$t0 + $UMULL $t0,$a4,$n0 + adde $acc1,$acc1,$t1 + $UMULL $t1,$a5,$n0 + adde $acc2,$acc2,$t2 + $UMULL $t2,$a6,$n0 + adde $acc3,$acc3,$t3 + $UMULL $t3,$a7,$n0 + adde $acc4,$acc4,$t0 + $UMULH $t0,$a0,$n0 + adde $acc5,$acc5,$t1 + $UMULH $t1,$a1,$n0 + adde $acc6,$acc6,$t2 + $UMULH $t2,$a2,$n0 + adde $acc7,$acc7,$t3 + $UMULH $t3,$a3,$n0 + addze $carry,$carry + $STU $acc0,$SIZE_T($tp) + addc $acc0,$acc1,$t0 + $UMULH $t0,$a4,$n0 + adde $acc1,$acc2,$t1 + $UMULH $t1,$a5,$n0 + adde $acc2,$acc3,$t2 + $UMULH $t2,$a6,$n0 + adde $acc3,$acc4,$t3 + $UMULH $t3,$a7,$n0 + $LDX $n0,$rp,$cnt + adde $acc4,$acc5,$t0 + adde $acc5,$acc6,$t1 + adde $acc6,$acc7,$t2 + adde $acc7,$carry,$t3 + #addze $carry,$zero # moved above + bne .Lsqr8x_mul + # note that carry flag is guaranteed + # to be zero at this point + $UCMP $ap,$ap_end # done yet? + beq .Lsqr8x_break + + $LD $a0,$SIZE_T*1($tp) + $LD $a1,$SIZE_T*2($tp) + $LD $a2,$SIZE_T*3($tp) + $LD $a3,$SIZE_T*4($tp) + $LD $a4,$SIZE_T*5($tp) + $LD $a5,$SIZE_T*6($tp) + $LD $a6,$SIZE_T*7($tp) + $LD $a7,$SIZE_T*8($tp) + addc $acc0,$acc0,$a0 + $LD $a0,$SIZE_T*1($ap) + adde $acc1,$acc1,$a1 + $LD $a1,$SIZE_T*2($ap) + adde $acc2,$acc2,$a2 + $LD $a2,$SIZE_T*3($ap) + adde $acc3,$acc3,$a3 + $LD $a3,$SIZE_T*4($ap) + adde $acc4,$acc4,$a4 + $LD $a4,$SIZE_T*5($ap) + adde $acc5,$acc5,$a5 + $LD $a5,$SIZE_T*6($ap) + adde $acc6,$acc6,$a6 + $LD $a6,$SIZE_T*7($ap) + adde $acc7,$acc7,$a7 + $LDU $a7,$SIZE_T*8($ap) + #addze $carry,$zero # moved above + b .Lsqr8x_mul + +.align 5 +.Lsqr8x_break: + $LD $a0,$SIZE_T*8($rp) + addi $ap,$rp,$SIZE_T*15 + $LD $a1,$SIZE_T*9($rp) + sub. $t0,$ap_end,$ap # is it last iteration? 
+ $LD $a2,$SIZE_T*10($rp) + sub $t1,$tp,$t0 + $LD $a3,$SIZE_T*11($rp) + $LD $a4,$SIZE_T*12($rp) + $LD $a5,$SIZE_T*13($rp) + $LD $a6,$SIZE_T*14($rp) + $LD $a7,$SIZE_T*15($rp) + beq .Lsqr8x_outer_loop + + $ST $acc0,$SIZE_T*1($tp) + $LD $acc0,$SIZE_T*1($t1) + $ST $acc1,$SIZE_T*2($tp) + $LD $acc1,$SIZE_T*2($t1) + $ST $acc2,$SIZE_T*3($tp) + $LD $acc2,$SIZE_T*3($t1) + $ST $acc3,$SIZE_T*4($tp) + $LD $acc3,$SIZE_T*4($t1) + $ST $acc4,$SIZE_T*5($tp) + $LD $acc4,$SIZE_T*5($t1) + $ST $acc5,$SIZE_T*6($tp) + $LD $acc5,$SIZE_T*6($t1) + $ST $acc6,$SIZE_T*7($tp) + $LD $acc6,$SIZE_T*7($t1) + $ST $acc7,$SIZE_T*8($tp) + $LD $acc7,$SIZE_T*8($t1) + mr $tp,$t1 + b .Lsqr8x_outer_loop + +.align 5 +.Lsqr8x_outer_break: + #################################################################### + # Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] + $LD $a1,$SIZE_T*1($t0) # recall that $t0 is &a[-1] + $LD $a3,$SIZE_T*2($t0) + $LD $a5,$SIZE_T*3($t0) + $LD $a7,$SIZE_T*4($t0) + addi $ap,$t0,$SIZE_T*4 + # "tp[x]" comments are for num==8 case + $LD $t1,$SIZE_T*13($sp) # =tp[1], t[0] is not interesting + $LD $t2,$SIZE_T*14($sp) + $LD $t3,$SIZE_T*15($sp) + $LD $t0,$SIZE_T*16($sp) + + $ST $acc0,$SIZE_T*1($tp) # tp[8]= + srwi $cnt,$num,`log($SIZE_T)/log(2)+2` + $ST $acc1,$SIZE_T*2($tp) + subi $cnt,$cnt,1 + $ST $acc2,$SIZE_T*3($tp) + $ST $acc3,$SIZE_T*4($tp) + $ST $acc4,$SIZE_T*5($tp) + $ST $acc5,$SIZE_T*6($tp) + $ST $acc6,$SIZE_T*7($tp) + #$ST $acc7,$SIZE_T*8($tp) # tp[15] is not interesting + addi $tp,$sp,$SIZE_T*11 # &tp[-1] + $UMULL $acc0,$a1,$a1 + $UMULH $a1,$a1,$a1 + add $acc1,$t1,$t1 # <<1 + $SHRI $t1,$t1,$BITS-1 + $UMULL $a2,$a3,$a3 + $UMULH $a3,$a3,$a3 + addc $acc1,$acc1,$a1 + add $acc2,$t2,$t2 + $SHRI $t2,$t2,$BITS-1 + add $acc3,$t3,$t3 + $SHRI $t3,$t3,$BITS-1 + or $acc2,$acc2,$t1 + + mtctr $cnt +.Lsqr4x_shift_n_add: + $UMULL $a4,$a5,$a5 + $UMULH $a5,$a5,$a5 + $LD $t1,$SIZE_T*6($tp) # =tp[5] + $LD $a1,$SIZE_T*1($ap) + adde $acc2,$acc2,$a2 + add $acc4,$t0,$t0 + $SHRI $t0,$t0,$BITS-1 + or $acc3,$acc3,$t2 + $LD $t2,$SIZE_T*7($tp) # =tp[6] + adde $acc3,$acc3,$a3 + $LD $a3,$SIZE_T*2($ap) + add $acc5,$t1,$t1 + $SHRI $t1,$t1,$BITS-1 + or $acc4,$acc4,$t3 + $LD $t3,$SIZE_T*8($tp) # =tp[7] + $UMULL $a6,$a7,$a7 + $UMULH $a7,$a7,$a7 + adde $acc4,$acc4,$a4 + add $acc6,$t2,$t2 + $SHRI $t2,$t2,$BITS-1 + or $acc5,$acc5,$t0 + $LD $t0,$SIZE_T*9($tp) # =tp[8] + adde $acc5,$acc5,$a5 + $LD $a5,$SIZE_T*3($ap) + add $acc7,$t3,$t3 + $SHRI $t3,$t3,$BITS-1 + or $acc6,$acc6,$t1 + $LD $t1,$SIZE_T*10($tp) # =tp[9] + $UMULL $a0,$a1,$a1 + $UMULH $a1,$a1,$a1 + adde $acc6,$acc6,$a6 + $ST $acc0,$SIZE_T*1($tp) # tp[0]= + add $acc0,$t0,$t0 + $SHRI $t0,$t0,$BITS-1 + or $acc7,$acc7,$t2 + $LD $t2,$SIZE_T*11($tp) # =tp[10] + adde $acc7,$acc7,$a7 + $LDU $a7,$SIZE_T*4($ap) + $ST $acc1,$SIZE_T*2($tp) # tp[1]= + add $acc1,$t1,$t1 + $SHRI $t1,$t1,$BITS-1 + or $acc0,$acc0,$t3 + $LD $t3,$SIZE_T*12($tp) # =tp[11] + $UMULL $a2,$a3,$a3 + $UMULH $a3,$a3,$a3 + adde $acc0,$acc0,$a0 + $ST $acc2,$SIZE_T*3($tp) # tp[2]= + add $acc2,$t2,$t2 + $SHRI $t2,$t2,$BITS-1 + or $acc1,$acc1,$t0 + $LD $t0,$SIZE_T*13($tp) # =tp[12] + adde $acc1,$acc1,$a1 + $ST $acc3,$SIZE_T*4($tp) # tp[3]= + $ST $acc4,$SIZE_T*5($tp) # tp[4]= + $ST $acc5,$SIZE_T*6($tp) # tp[5]= + $ST $acc6,$SIZE_T*7($tp) # tp[6]= + $STU $acc7,$SIZE_T*8($tp) # tp[7]= + add $acc3,$t3,$t3 + $SHRI $t3,$t3,$BITS-1 + or $acc2,$acc2,$t1 + bdnz .Lsqr4x_shift_n_add +___ +my ($np,$np_end)=($ap,$ap_end); +$code.=<<___; + $POP $np,$SIZE_T*7($sp) # pull &np[-1] and n0 + $POP $n0,$SIZE_T*8($sp) + $UMULL $a4,$a5,$a5 + 
$UMULH $a5,$a5,$a5 + $ST $acc0,$SIZE_T*1($tp) # tp[8]= + $LD $acc0,$SIZE_T*12($sp) # =tp[0] + $LD $t1,$SIZE_T*6($tp) # =tp[13] + adde $acc2,$acc2,$a2 + add $acc4,$t0,$t0 + $SHRI $t0,$t0,$BITS-1 + or $acc3,$acc3,$t2 + $LD $t2,$SIZE_T*7($tp) # =tp[14] + adde $acc3,$acc3,$a3 + add $acc5,$t1,$t1 + $SHRI $t1,$t1,$BITS-1 + or $acc4,$acc4,$t3 + $UMULL $a6,$a7,$a7 + $UMULH $a7,$a7,$a7 + adde $acc4,$acc4,$a4 + add $acc6,$t2,$t2 + $SHRI $t2,$t2,$BITS-1 + or $acc5,$acc5,$t0 + $ST $acc1,$SIZE_T*2($tp) # tp[9]= + $LD $acc1,$SIZE_T*13($sp) # =tp[1] + adde $acc5,$acc5,$a5 + or $acc6,$acc6,$t1 + $LD $a0,$SIZE_T*1($np) + $LD $a1,$SIZE_T*2($np) + adde $acc6,$acc6,$a6 + $LD $a2,$SIZE_T*3($np) + $LD $a3,$SIZE_T*4($np) + adde $acc7,$a7,$t2 + $LD $a4,$SIZE_T*5($np) + $LD $a5,$SIZE_T*6($np) + + ################################################################ + # Reduce by 8 limbs per iteration + $UMULL $na0,$n0,$acc0 # t[0]*n0 + li $cnt,8 + $LD $a6,$SIZE_T*7($np) + add $np_end,$np,$num + $LDU $a7,$SIZE_T*8($np) + $ST $acc2,$SIZE_T*3($tp) # tp[10]= + $LD $acc2,$SIZE_T*14($sp) + $ST $acc3,$SIZE_T*4($tp) # tp[11]= + $LD $acc3,$SIZE_T*15($sp) + $ST $acc4,$SIZE_T*5($tp) # tp[12]= + $LD $acc4,$SIZE_T*16($sp) + $ST $acc5,$SIZE_T*6($tp) # tp[13]= + $LD $acc5,$SIZE_T*17($sp) + $ST $acc6,$SIZE_T*7($tp) # tp[14]= + $LD $acc6,$SIZE_T*18($sp) + $ST $acc7,$SIZE_T*8($tp) # tp[15]= + $LD $acc7,$SIZE_T*19($sp) + addi $tp,$sp,$SIZE_T*11 # &tp[-1] + mtctr $cnt + b .Lsqr8x_reduction + +.align 5 +.Lsqr8x_reduction: + # (*) $UMULL $t0,$a0,$na0 # lo(n[0-7])*lo(t[0]*n0) + $UMULL $t1,$a1,$na0 + $UMULL $t2,$a2,$na0 + $STU $na0,$SIZE_T($tp) # put aside t[0]*n0 for tail processing + $UMULL $t3,$a3,$na0 + # (*) addc $acc0,$acc0,$t0 + addic $acc0,$acc0,-1 # (*) + $UMULL $t0,$a4,$na0 + adde $acc0,$acc1,$t1 + $UMULL $t1,$a5,$na0 + adde $acc1,$acc2,$t2 + $UMULL $t2,$a6,$na0 + adde $acc2,$acc3,$t3 + $UMULL $t3,$a7,$na0 + adde $acc3,$acc4,$t0 + $UMULH $t0,$a0,$na0 # hi(n[0-7])*lo(t[0]*n0) + adde $acc4,$acc5,$t1 + $UMULH $t1,$a1,$na0 + adde $acc5,$acc6,$t2 + $UMULH $t2,$a2,$na0 + adde $acc6,$acc7,$t3 + $UMULH $t3,$a3,$na0 + addze $acc7,$zero + addc $acc0,$acc0,$t0 + $UMULH $t0,$a4,$na0 + adde $acc1,$acc1,$t1 + $UMULH $t1,$a5,$na0 + adde $acc2,$acc2,$t2 + $UMULH $t2,$a6,$na0 + adde $acc3,$acc3,$t3 + $UMULH $t3,$a7,$na0 + $UMULL $na0,$n0,$acc0 # next t[0]*n0 + adde $acc4,$acc4,$t0 + adde $acc5,$acc5,$t1 + adde $acc6,$acc6,$t2 + adde $acc7,$acc7,$t3 + bdnz .Lsqr8x_reduction + + $LD $t0,$SIZE_T*1($tp) + $LD $t1,$SIZE_T*2($tp) + $LD $t2,$SIZE_T*3($tp) + $LD $t3,$SIZE_T*4($tp) + subi $rp,$tp,$SIZE_T*7 + $UCMP $np_end,$np # done yet? + addc $acc0,$acc0,$t0 + $LD $t0,$SIZE_T*5($tp) + adde $acc1,$acc1,$t1 + $LD $t1,$SIZE_T*6($tp) + adde $acc2,$acc2,$t2 + $LD $t2,$SIZE_T*7($tp) + adde $acc3,$acc3,$t3 + $LD $t3,$SIZE_T*8($tp) + adde $acc4,$acc4,$t0 + adde $acc5,$acc5,$t1 + adde $acc6,$acc6,$t2 + adde $acc7,$acc7,$t3 + #addze $carry,$zero # moved below + beq .Lsqr8x8_post_condition + + $LD $n0,$SIZE_T*0($rp) + $LD $a0,$SIZE_T*1($np) + $LD $a1,$SIZE_T*2($np) + $LD $a2,$SIZE_T*3($np) + $LD $a3,$SIZE_T*4($np) + $LD $a4,$SIZE_T*5($np) + $LD $a5,$SIZE_T*6($np) + $LD $a6,$SIZE_T*7($np) + $LDU $a7,$SIZE_T*8($np) + li $cnt,0 + +.align 5 +.Lsqr8x_tail: + $UMULL $t0,$a0,$n0 + addze $carry,$zero # carry bit, modulo-scheduled + $UMULL $t1,$a1,$n0 + addi $cnt,$cnt,$SIZE_T + $UMULL $t2,$a2,$n0 + andi. 
$cnt,$cnt,$SIZE_T*8-1 + $UMULL $t3,$a3,$n0 + addc $acc0,$acc0,$t0 + $UMULL $t0,$a4,$n0 + adde $acc1,$acc1,$t1 + $UMULL $t1,$a5,$n0 + adde $acc2,$acc2,$t2 + $UMULL $t2,$a6,$n0 + adde $acc3,$acc3,$t3 + $UMULL $t3,$a7,$n0 + adde $acc4,$acc4,$t0 + $UMULH $t0,$a0,$n0 + adde $acc5,$acc5,$t1 + $UMULH $t1,$a1,$n0 + adde $acc6,$acc6,$t2 + $UMULH $t2,$a2,$n0 + adde $acc7,$acc7,$t3 + $UMULH $t3,$a3,$n0 + addze $carry,$carry + $STU $acc0,$SIZE_T($tp) + addc $acc0,$acc1,$t0 + $UMULH $t0,$a4,$n0 + adde $acc1,$acc2,$t1 + $UMULH $t1,$a5,$n0 + adde $acc2,$acc3,$t2 + $UMULH $t2,$a6,$n0 + adde $acc3,$acc4,$t3 + $UMULH $t3,$a7,$n0 + $LDX $n0,$rp,$cnt + adde $acc4,$acc5,$t0 + adde $acc5,$acc6,$t1 + adde $acc6,$acc7,$t2 + adde $acc7,$carry,$t3 + #addze $carry,$zero # moved above + bne .Lsqr8x_tail + # note that carry flag is guaranteed + # to be zero at this point + $LD $a0,$SIZE_T*1($tp) + $POP $carry,$SIZE_T*10($sp) # pull top-most carry in case we break + $UCMP $np_end,$np # done yet? + $LD $a1,$SIZE_T*2($tp) + sub $t2,$np_end,$num # rewinded np + $LD $a2,$SIZE_T*3($tp) + $LD $a3,$SIZE_T*4($tp) + $LD $a4,$SIZE_T*5($tp) + $LD $a5,$SIZE_T*6($tp) + $LD $a6,$SIZE_T*7($tp) + $LD $a7,$SIZE_T*8($tp) + beq .Lsqr8x_tail_break + + addc $acc0,$acc0,$a0 + $LD $a0,$SIZE_T*1($np) + adde $acc1,$acc1,$a1 + $LD $a1,$SIZE_T*2($np) + adde $acc2,$acc2,$a2 + $LD $a2,$SIZE_T*3($np) + adde $acc3,$acc3,$a3 + $LD $a3,$SIZE_T*4($np) + adde $acc4,$acc4,$a4 + $LD $a4,$SIZE_T*5($np) + adde $acc5,$acc5,$a5 + $LD $a5,$SIZE_T*6($np) + adde $acc6,$acc6,$a6 + $LD $a6,$SIZE_T*7($np) + adde $acc7,$acc7,$a7 + $LDU $a7,$SIZE_T*8($np) + #addze $carry,$zero # moved above + b .Lsqr8x_tail + +.align 5 +.Lsqr8x_tail_break: + $POP $n0,$SIZE_T*8($sp) # pull n0 + $POP $t3,$SIZE_T*9($sp) # &tp[2*num-1] + addi $cnt,$tp,$SIZE_T*8 # end of current t[num] window + + addic $carry,$carry,-1 # "move" top-most carry to carry bit + adde $t0,$acc0,$a0 + $LD $acc0,$SIZE_T*8($rp) + $LD $a0,$SIZE_T*1($t2) # recall that $t2 is &n[-1] + adde $t1,$acc1,$a1 + $LD $acc1,$SIZE_T*9($rp) + $LD $a1,$SIZE_T*2($t2) + adde $acc2,$acc2,$a2 + $LD $a2,$SIZE_T*3($t2) + adde $acc3,$acc3,$a3 + $LD $a3,$SIZE_T*4($t2) + adde $acc4,$acc4,$a4 + $LD $a4,$SIZE_T*5($t2) + adde $acc5,$acc5,$a5 + $LD $a5,$SIZE_T*6($t2) + adde $acc6,$acc6,$a6 + $LD $a6,$SIZE_T*7($t2) + adde $acc7,$acc7,$a7 + $LD $a7,$SIZE_T*8($t2) + addi $np,$t2,$SIZE_T*8 + addze $t2,$zero # top-most carry + $UMULL $na0,$n0,$acc0 + $ST $t0,$SIZE_T*1($tp) + $UCMP $cnt,$t3 # did we hit the bottom? + $ST $t1,$SIZE_T*2($tp) + li $cnt,8 + $ST $acc2,$SIZE_T*3($tp) + $LD $acc2,$SIZE_T*10($rp) + $ST $acc3,$SIZE_T*4($tp) + $LD $acc3,$SIZE_T*11($rp) + $ST $acc4,$SIZE_T*5($tp) + $LD $acc4,$SIZE_T*12($rp) + $ST $acc5,$SIZE_T*6($tp) + $LD $acc5,$SIZE_T*13($rp) + $ST $acc6,$SIZE_T*7($tp) + $LD $acc6,$SIZE_T*14($rp) + $ST $acc7,$SIZE_T*8($tp) + $LD $acc7,$SIZE_T*15($rp) + $PUSH $t2,$SIZE_T*10($sp) # off-load top-most carry + addi $tp,$rp,$SIZE_T*7 # slide the window + mtctr $cnt + bne .Lsqr8x_reduction + + ################################################################ + # Final step. We see if result is larger than modulus, and + # if it is, subtract the modulus. But comparison implies + # subtraction. So we subtract modulus, see if it borrowed, + # and conditionally copy original value. 
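The "comparison implies subtraction" remark above is the standard constant-time ending for Montgomery code: subtract the modulus unconditionally, turn the borrow into an all-ones/all-zero mask, and select between the difference and the original value without branching, which is what the .Lsqr8x_sub and .Lsqr4x_cond_copy loops do. A minimal sketch of that logic, assuming 64-bit limbs and a 0/1 top_carry word (helper names are made up; the assembly additionally zeroes the scratch area as it goes, which the sketch omits):

#include <stdint.h>

static void reduce_once(uint64_t *rp, const uint64_t *tp,
                        const uint64_t *np, int num, uint64_t top_carry)
{
    uint64_t borrow = 0;

    for (int i = 0; i < num; i++) {          /* rp = tp - np with borrow */
        unsigned __int128 d = (unsigned __int128)tp[i] - np[i] - borrow;
        rp[i]  = (uint64_t)d;
        borrow = (uint64_t)(d >> 127);       /* 1 if we wrapped negative */
    }
    /* keep the unreduced value only if we borrowed and had no top carry */
    uint64_t mask = 0 - (borrow & (top_carry ^ 1));
    for (int i = 0; i < num; i++)
        rp[i] = (tp[i] & mask) | (rp[i] & ~mask);
}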
+ $POP $rp,$SIZE_T*6($sp) # pull &rp[-1] + srwi $cnt,$num,`log($SIZE_T)/log(2)+3` + mr $n0,$tp # put tp aside + addi $tp,$tp,$SIZE_T*8 + subi $cnt,$cnt,1 + subfc $t0,$a0,$acc0 + subfe $t1,$a1,$acc1 + mr $carry,$t2 + mr $ap_end,$rp # $rp copy + + mtctr $cnt + b .Lsqr8x_sub + +.align 5 +.Lsqr8x_sub: + $LD $a0,$SIZE_T*1($np) + $LD $acc0,$SIZE_T*1($tp) + $LD $a1,$SIZE_T*2($np) + $LD $acc1,$SIZE_T*2($tp) + subfe $t2,$a2,$acc2 + $LD $a2,$SIZE_T*3($np) + $LD $acc2,$SIZE_T*3($tp) + subfe $t3,$a3,$acc3 + $LD $a3,$SIZE_T*4($np) + $LD $acc3,$SIZE_T*4($tp) + $ST $t0,$SIZE_T*1($rp) + subfe $t0,$a4,$acc4 + $LD $a4,$SIZE_T*5($np) + $LD $acc4,$SIZE_T*5($tp) + $ST $t1,$SIZE_T*2($rp) + subfe $t1,$a5,$acc5 + $LD $a5,$SIZE_T*6($np) + $LD $acc5,$SIZE_T*6($tp) + $ST $t2,$SIZE_T*3($rp) + subfe $t2,$a6,$acc6 + $LD $a6,$SIZE_T*7($np) + $LD $acc6,$SIZE_T*7($tp) + $ST $t3,$SIZE_T*4($rp) + subfe $t3,$a7,$acc7 + $LDU $a7,$SIZE_T*8($np) + $LDU $acc7,$SIZE_T*8($tp) + $ST $t0,$SIZE_T*5($rp) + subfe $t0,$a0,$acc0 + $ST $t1,$SIZE_T*6($rp) + subfe $t1,$a1,$acc1 + $ST $t2,$SIZE_T*7($rp) + $STU $t3,$SIZE_T*8($rp) + bdnz .Lsqr8x_sub + + srwi $cnt,$num,`log($SIZE_T)/log(2)+2` + $LD $a0,$SIZE_T*1($ap_end) # original $rp + $LD $acc0,$SIZE_T*1($n0) # original $tp + subi $cnt,$cnt,1 + $LD $a1,$SIZE_T*2($ap_end) + $LD $acc1,$SIZE_T*2($n0) + subfe $t2,$a2,$acc2 + $LD $a2,$SIZE_T*3($ap_end) + $LD $acc2,$SIZE_T*3($n0) + subfe $t3,$a3,$acc3 + $LD $a3,$SIZE_T*4($ap_end) + $LDU $acc3,$SIZE_T*4($n0) + $ST $t0,$SIZE_T*1($rp) + subfe $t0,$a4,$acc4 + $ST $t1,$SIZE_T*2($rp) + subfe $t1,$a5,$acc5 + $ST $t2,$SIZE_T*3($rp) + subfe $t2,$a6,$acc6 + $ST $t3,$SIZE_T*4($rp) + subfe $t3,$a7,$acc7 + $ST $t0,$SIZE_T*5($rp) + subfe $carry,$zero,$carry # did it borrow? + $ST $t1,$SIZE_T*6($rp) + $ST $t2,$SIZE_T*7($rp) + $ST $t3,$SIZE_T*8($rp) + + addi $tp,$sp,$SIZE_T*11 + mtctr $cnt + +.Lsqr4x_cond_copy: + andc $a0,$a0,$carry + $ST $zero,-$SIZE_T*3($n0) # wipe stack clean + and $acc0,$acc0,$carry + $ST $zero,-$SIZE_T*2($n0) + andc $a1,$a1,$carry + $ST $zero,-$SIZE_T*1($n0) + and $acc1,$acc1,$carry + $ST $zero,-$SIZE_T*0($n0) + andc $a2,$a2,$carry + $ST $zero,$SIZE_T*1($tp) + and $acc2,$acc2,$carry + $ST $zero,$SIZE_T*2($tp) + andc $a3,$a3,$carry + $ST $zero,$SIZE_T*3($tp) + and $acc3,$acc3,$carry + $STU $zero,$SIZE_T*4($tp) + or $t0,$a0,$acc0 + $LD $a0,$SIZE_T*5($ap_end) + $LD $acc0,$SIZE_T*1($n0) + or $t1,$a1,$acc1 + $LD $a1,$SIZE_T*6($ap_end) + $LD $acc1,$SIZE_T*2($n0) + or $t2,$a2,$acc2 + $LD $a2,$SIZE_T*7($ap_end) + $LD $acc2,$SIZE_T*3($n0) + or $t3,$a3,$acc3 + $LD $a3,$SIZE_T*8($ap_end) + $LDU $acc3,$SIZE_T*4($n0) + $ST $t0,$SIZE_T*1($ap_end) + $ST $t1,$SIZE_T*2($ap_end) + $ST $t2,$SIZE_T*3($ap_end) + $STU $t3,$SIZE_T*4($ap_end) + bdnz .Lsqr4x_cond_copy + + $POP $ap,0($sp) # pull saved sp + andc $a0,$a0,$carry + and $acc0,$acc0,$carry + andc $a1,$a1,$carry + and $acc1,$acc1,$carry + andc $a2,$a2,$carry + and $acc2,$acc2,$carry + andc $a3,$a3,$carry + and $acc3,$acc3,$carry + or $t0,$a0,$acc0 + or $t1,$a1,$acc1 + or $t2,$a2,$acc2 + or $t3,$a3,$acc3 + $ST $t0,$SIZE_T*1($ap_end) + $ST $t1,$SIZE_T*2($ap_end) + $ST $t2,$SIZE_T*3($ap_end) + $ST $t3,$SIZE_T*4($ap_end) + + b .Lsqr8x_done + +.align 5 +.Lsqr8x8_post_condition: + $POP $rp,$SIZE_T*6($sp) # pull rp + $POP $ap,0($sp) # pull saved sp + addze $carry,$zero + + # $acc0-7,$carry hold result, $a0-7 hold modulus + subfc $acc0,$a0,$acc0 + subfe $acc1,$a1,$acc1 + $ST $zero,$SIZE_T*12($sp) # wipe stack clean + $ST $zero,$SIZE_T*13($sp) + subfe $acc2,$a2,$acc2 + $ST $zero,$SIZE_T*14($sp) + $ST 
$zero,$SIZE_T*15($sp) + subfe $acc3,$a3,$acc3 + $ST $zero,$SIZE_T*16($sp) + $ST $zero,$SIZE_T*17($sp) + subfe $acc4,$a4,$acc4 + $ST $zero,$SIZE_T*18($sp) + $ST $zero,$SIZE_T*19($sp) + subfe $acc5,$a5,$acc5 + $ST $zero,$SIZE_T*20($sp) + $ST $zero,$SIZE_T*21($sp) + subfe $acc6,$a6,$acc6 + $ST $zero,$SIZE_T*22($sp) + $ST $zero,$SIZE_T*23($sp) + subfe $acc7,$a7,$acc7 + $ST $zero,$SIZE_T*24($sp) + $ST $zero,$SIZE_T*25($sp) + subfe $carry,$zero,$carry # did it borrow? + $ST $zero,$SIZE_T*26($sp) + $ST $zero,$SIZE_T*27($sp) + + and $a0,$a0,$carry + and $a1,$a1,$carry + addc $acc0,$acc0,$a0 # add modulus back if borrowed + and $a2,$a2,$carry + adde $acc1,$acc1,$a1 + and $a3,$a3,$carry + adde $acc2,$acc2,$a2 + and $a4,$a4,$carry + adde $acc3,$acc3,$a3 + and $a5,$a5,$carry + adde $acc4,$acc4,$a4 + and $a6,$a6,$carry + adde $acc5,$acc5,$a5 + and $a7,$a7,$carry + adde $acc6,$acc6,$a6 + adde $acc7,$acc7,$a7 + $ST $acc0,$SIZE_T*1($rp) + $ST $acc1,$SIZE_T*2($rp) + $ST $acc2,$SIZE_T*3($rp) + $ST $acc3,$SIZE_T*4($rp) + $ST $acc4,$SIZE_T*5($rp) + $ST $acc5,$SIZE_T*6($rp) + $ST $acc6,$SIZE_T*7($rp) + $ST $acc7,$SIZE_T*8($rp) + +.Lsqr8x_done: + $PUSH $zero,$SIZE_T*8($sp) + $PUSH $zero,$SIZE_T*10($sp) + + $POP r14,-$SIZE_T*18($ap) + li r3,1 # signal "done" + $POP r15,-$SIZE_T*17($ap) + $POP r16,-$SIZE_T*16($ap) + $POP r17,-$SIZE_T*15($ap) + $POP r18,-$SIZE_T*14($ap) + $POP r19,-$SIZE_T*13($ap) + $POP r20,-$SIZE_T*12($ap) + $POP r21,-$SIZE_T*11($ap) + $POP r22,-$SIZE_T*10($ap) + $POP r23,-$SIZE_T*9($ap) + $POP r24,-$SIZE_T*8($ap) + $POP r25,-$SIZE_T*7($ap) + $POP r26,-$SIZE_T*6($ap) + $POP r27,-$SIZE_T*5($ap) + $POP r28,-$SIZE_T*4($ap) + $POP r29,-$SIZE_T*3($ap) + $POP r30,-$SIZE_T*2($ap) + $POP r31,-$SIZE_T*1($ap) + mr $sp,$ap + blr + .long 0 + .byte 0,12,4,0x20,0x80,18,6,0 + .long 0 +.size __bn_sqr8x_mont,.-__bn_sqr8x_mont +___ +} +$code.=<<___; .asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>" ___ diff --git a/crypto/bn/asm/ppc.pl b/crypto/bn/asm/ppc.pl index 446d8ba9492b..e37068192f2f 100644 --- a/crypto/bn/asm/ppc.pl +++ b/crypto/bn/asm/ppc.pl @@ -1,5 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2004-2018 The OpenSSL Project Authors. All Rights Reserved. # +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # Implemented as a Perl wrapper as we want to support several different # architectures with single file. We pick up the target based on the # file name we are asked to generate. @@ -32,9 +38,9 @@ #rsa 2048 bits 0.3036s 0.0085s 3.3 117.1 #rsa 4096 bits 2.0040s 0.0299s 0.5 33.4 #dsa 512 bits 0.0087s 0.0106s 114.3 94.5 -#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0 +#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0 # -# Same bechmark with this assembler code: +# Same benchmark with this assembler code: # #rsa 512 bits 0.0056s 0.0005s 178.6 2049.2 #rsa 1024 bits 0.0283s 0.0015s 35.3 674.1 @@ -68,7 +74,7 @@ #rsa 4096 bits 0.3700s 0.0058s 2.7 171.0 #dsa 512 bits 0.0016s 0.0020s 610.7 507.1 #dsa 1024 bits 0.0047s 0.0058s 212.5 173.2 -# +# # Again, performance increases by at about 75% # # Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code) @@ -95,10 +101,7 @@ #dsa 2048 bits 0.0061s 0.0075s 163.5 132.8 # # Performance increase of ~60% -# -# If you have comments or suggestions to improve code send -# me a note at schari@us.ibm.com -# +# Based on submission from Suresh N. 
Chari of IBM $flavour = shift; @@ -119,7 +122,7 @@ if ($flavour =~ /32/) { $CNTLZ= "cntlzw"; # count leading zeros $SHL= "slw"; # shift left $SHR= "srw"; # unsigned shift right - $SHRI= "srwi"; # unsigned shift right by immediate + $SHRI= "srwi"; # unsigned shift right by immediate $SHLI= "slwi"; # shift left by immediate $CLRU= "clrlwi"; # clear upper bits $INSR= "insrwi"; # insert right @@ -143,10 +146,10 @@ if ($flavour =~ /32/) { $CNTLZ= "cntlzd"; # count leading zeros $SHL= "sld"; # shift left $SHR= "srd"; # unsigned shift right - $SHRI= "srdi"; # unsigned shift right by immediate + $SHRI= "srdi"; # unsigned shift right by immediate $SHLI= "sldi"; # shift left by immediate $CLRU= "clrldi"; # clear upper bits - $INSR= "insrdi"; # insert right + $INSR= "insrdi"; # insert right $ROTL= "rotldi"; # rotate left by immediate $TR= "td"; # conditional trap } else { die "nonsense $flavour"; } @@ -183,7 +186,7 @@ $data=<<EOF; # below. # 12/05/03 Suresh Chari # (with lots of help from) Andy Polyakov -## +## # 1. Initial version 10/20/02 Suresh Chari # # @@ -196,7 +199,7 @@ $data=<<EOF; # be done in the build process. # # Hand optimized assembly code for the following routines -# +# # bn_sqr_comba4 # bn_sqr_comba8 # bn_mul_comba4 @@ -219,10 +222,10 @@ $data=<<EOF; #-------------------------------------------------------------------------- # # Defines to be used in the assembly code. -# +# #.set r0,0 # we use it as storage for value of 0 #.set SP,1 # preserved -#.set RTOC,2 # preserved +#.set RTOC,2 # preserved #.set r3,3 # 1st argument/return value #.set r4,4 # 2nd argument/volatile register #.set r5,5 # 3rd argument/volatile register @@ -240,7 +243,7 @@ $data=<<EOF; # the first . i.e. for example change ".bn_sqr_comba4" # to "bn_sqr_comba4". This should be automatically done # in the build. - + .globl .bn_sqr_comba4 .globl .bn_sqr_comba8 .globl .bn_mul_comba4 @@ -251,9 +254,9 @@ $data=<<EOF; .globl .bn_sqr_words .globl .bn_mul_words .globl .bn_mul_add_words - + # .text section - + .machine "any" # @@ -272,8 +275,8 @@ $data=<<EOF; # r3 contains r # r4 contains a # -# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: -# +# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: +# # r5,r6 are the two BN_ULONGs being multiplied. # r7,r8 are the results of the 32x32 giving 64 bit multiply. # r9,r10, r11 are the equivalents of c1,c2, c3. @@ -282,10 +285,10 @@ $data=<<EOF; # xor r0,r0,r0 # set r0 = 0. Used in the addze # instructions below - + #sqr_add_c(a,0,c1,c2,c3) - $LD r5,`0*$BNSZ`(r4) - $UMULL r9,r5,r5 + $LD r5,`0*$BNSZ`(r4) + $UMULL r9,r5,r5 $UMULH r10,r5,r5 #in first iteration. No need #to add since c1=c2=c3=0. # Note c3(r11) is NOT set to 0 @@ -293,20 +296,20 @@ $data=<<EOF; $ST r9,`0*$BNSZ`(r3) # r[0]=c1; # sqr_add_c2(a,1,0,c2,c3,c1); - $LD r6,`1*$BNSZ`(r4) + $LD r6,`1*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 - + addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8) adde r8,r8,r8 addze r9,r0 # catch carry if any. - # r9= r0(=0) and carry - + # r9= r0(=0) and carry + addc r10,r7,r10 # now add to temp result. 
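The sqr_add_c comments above name the comba convention used throughout this file: (c1,c2,c3) is a three-word column accumulator, and each 64-bit product is folded into it with an explicit one-word carry ripple. A scalar model of that primitive for the 32-bit flavour (illustrative only, standing in for the UMULL/UMULH plus addc/adde/addze sequence in the listing):

#include <stdint.h>

/* (c1,c2,c3) += a[i]*a[i], rippling carries one word at a time. */
static void sqr_add_c(const uint32_t *a, int i,
                      uint32_t *c1, uint32_t *c2, uint32_t *c3)
{
    uint64_t p = (uint64_t)a[i] * a[i];
    uint64_t t = (uint64_t)*c1 + (uint32_t)p;
    *c1 = (uint32_t)t;
    t = (uint64_t)*c2 + (uint32_t)(p >> 32) + (uint32_t)(t >> 32);
    *c2 = (uint32_t)t;
    *c3 += (uint32_t)(t >> 32);
}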
- addze r11,r8 # r8 added to r11 which is 0 + addze r11,r8 # r8 added to r11 which is 0 addze r9,r9 - - $ST r10,`1*$BNSZ`(r3) #r[1]=c2; + + $ST r10,`1*$BNSZ`(r3) #r[1]=c2; #sqr_add_c(a,1,c3,c1,c2) $UMULL r7,r6,r6 $UMULH r8,r6,r6 @@ -317,23 +320,23 @@ $data=<<EOF; $LD r6,`2*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 - + addc r7,r7,r7 adde r8,r8,r8 addze r10,r10 - + addc r11,r7,r11 adde r9,r8,r9 addze r10,r10 - $ST r11,`2*$BNSZ`(r3) #r[2]=c3 + $ST r11,`2*$BNSZ`(r3) #r[2]=c3 #sqr_add_c2(a,3,0,c1,c2,c3); - $LD r6,`3*$BNSZ`(r4) + $LD r6,`3*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 addc r7,r7,r7 adde r8,r8,r8 addze r11,r0 - + addc r9,r7,r9 adde r10,r8,r10 addze r11,r11 @@ -342,7 +345,7 @@ $data=<<EOF; $LD r6,`2*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 - + addc r7,r7,r7 adde r8,r8,r8 addze r11,r11 @@ -357,31 +360,31 @@ $data=<<EOF; adde r11,r8,r11 addze r9,r0 #sqr_add_c2(a,3,1,c2,c3,c1); - $LD r6,`3*$BNSZ`(r4) + $LD r6,`3*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 addc r7,r7,r7 adde r8,r8,r8 addze r9,r9 - + addc r10,r7,r10 adde r11,r8,r11 addze r9,r9 $ST r10,`4*$BNSZ`(r3) #r[4]=c2 #sqr_add_c2(a,3,2,c3,c1,c2); - $LD r5,`2*$BNSZ`(r4) + $LD r5,`2*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 addc r7,r7,r7 adde r8,r8,r8 addze r10,r0 - + addc r11,r7,r11 adde r9,r8,r9 addze r10,r10 $ST r11,`5*$BNSZ`(r3) #r[5] = c3 #sqr_add_c(a,3,c1,c2,c3); - $UMULL r7,r6,r6 + $UMULL r7,r6,r6 $UMULH r8,r6,r6 addc r9,r7,r9 adde r10,r8,r10 @@ -400,7 +403,7 @@ $data=<<EOF; # for the gcc compiler. This should be automatically # done in the build # - + .align 4 .bn_sqr_comba8: # @@ -412,15 +415,15 @@ $data=<<EOF; # r3 contains r # r4 contains a # -# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: -# +# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows: +# # r5,r6 are the two BN_ULONGs being multiplied. # r7,r8 are the results of the 32x32 giving 64 bit multiply. # r9,r10, r11 are the equivalents of c1,c2, c3. # # Possible optimization of loading all 8 longs of a into registers -# doesnt provide any speedup -# +# doesn't provide any speedup +# xor r0,r0,r0 #set r0 = 0.Used in addze #instructions below. @@ -433,18 +436,18 @@ $data=<<EOF; #sqr_add_c2(a,1,0,c2,c3,c1); $LD r6,`1*$BNSZ`(r4) $UMULL r7,r5,r6 - $UMULH r8,r5,r6 - + $UMULH r8,r5,r6 + addc r10,r7,r10 #add the two register number adde r11,r8,r0 # (r8,r7) to the three register addze r9,r0 # number (r9,r11,r10).NOTE:r0=0 - + addc r10,r7,r10 #add the two register number adde r11,r8,r11 # (r8,r7) to the three register addze r9,r9 # number (r9,r11,r10). - + $ST r10,`1*$BNSZ`(r3) # r[1]=c2 - + #sqr_add_c(a,1,c3,c1,c2); $UMULL r7,r6,r6 $UMULH r8,r6,r6 @@ -455,25 +458,25 @@ $data=<<EOF; $LD r6,`2*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 - + addc r11,r7,r11 adde r9,r8,r9 addze r10,r10 - + addc r11,r7,r11 adde r9,r8,r9 addze r10,r10 - + $ST r11,`2*$BNSZ`(r3) #r[2]=c3 #sqr_add_c2(a,3,0,c1,c2,c3); $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0]. 
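sqr_add_c2 above handles the off-diagonal terms, which occur twice in a square: the addc r7,r7,r7 / adde r8,r8,r8 pair doubles the 64-bit product in registers, and the following addze into a zeroed register catches the bit that the doubling can shift out of the top. A scalar equivalent under the same 32-bit-limb assumption:

static void sqr_add_c2(const uint32_t *a, int i, int j,
                       uint32_t *c1, uint32_t *c2, uint32_t *c3)
{
    uint64_t p = (uint64_t)a[i] * a[j];
    uint32_t top = (uint32_t)(p >> 63);      /* bit lost by the doubling */
    p += p;                                  /* 2*a[i]*a[j], low 64 bits */
    uint64_t t = (uint64_t)*c1 + (uint32_t)p;
    *c1 = (uint32_t)t;
    t = (uint64_t)*c2 + (uint32_t)(p >> 32) + (uint32_t)(t >> 32);
    *c2 = (uint32_t)t;
    *c3 += top + (uint32_t)(t >> 32);
}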
$UMULL r7,r5,r6 $UMULH r8,r5,r6 - + addc r9,r7,r9 adde r10,r8,r10 addze r11,r0 - + addc r9,r7,r9 adde r10,r8,r10 addze r11,r11 @@ -482,20 +485,20 @@ $data=<<EOF; $LD r6,`2*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 - + addc r9,r7,r9 adde r10,r8,r10 addze r11,r11 - + addc r9,r7,r9 adde r10,r8,r10 addze r11,r11 - + $ST r9,`3*$BNSZ`(r3) #r[3]=c1; #sqr_add_c(a,2,c2,c3,c1); $UMULL r7,r6,r6 $UMULH r8,r6,r6 - + addc r10,r7,r10 adde r11,r8,r11 addze r9,r0 @@ -503,11 +506,11 @@ $data=<<EOF; $LD r6,`3*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 - + addc r10,r7,r10 adde r11,r8,r11 addze r9,r9 - + addc r10,r7,r10 adde r11,r8,r11 addze r9,r9 @@ -516,11 +519,11 @@ $data=<<EOF; $LD r6,`4*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 - + addc r10,r7,r10 adde r11,r8,r11 addze r9,r9 - + addc r10,r7,r10 adde r11,r8,r11 addze r9,r9 @@ -529,11 +532,11 @@ $data=<<EOF; $LD r6,`5*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 - + addc r11,r7,r11 adde r9,r8,r9 addze r10,r0 - + addc r11,r7,r11 adde r9,r8,r9 addze r10,r10 @@ -542,11 +545,11 @@ $data=<<EOF; $LD r6,`4*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 - + addc r11,r7,r11 adde r9,r8,r9 addze r10,r10 - + addc r11,r7,r11 adde r9,r8,r9 addze r10,r10 @@ -555,11 +558,11 @@ $data=<<EOF; $LD r6,`3*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 - + addc r11,r7,r11 adde r9,r8,r9 addze r10,r10 - + addc r11,r7,r11 adde r9,r8,r9 addze r10,r10 @@ -574,11 +577,11 @@ $data=<<EOF; $LD r6,`4*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 - + addc r9,r7,r9 adde r10,r8,r10 addze r11,r11 - + addc r9,r7,r9 adde r10,r8,r10 addze r11,r11 @@ -587,11 +590,11 @@ $data=<<EOF; $LD r6,`5*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 - + addc r9,r7,r9 adde r10,r8,r10 addze r11,r11 - + addc r9,r7,r9 adde r10,r8,r10 addze r11,r11 @@ -611,7 +614,7 @@ $data=<<EOF; $LD r6,`7*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 - + addc r10,r7,r10 adde r11,r8,r11 addze r9,r0 @@ -623,7 +626,7 @@ $data=<<EOF; $LD r6,`6*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 - + addc r10,r7,r10 adde r11,r8,r11 addze r9,r9 @@ -646,7 +649,7 @@ $data=<<EOF; $LD r6,`4*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 - + addc r10,r7,r10 adde r11,r8,r11 addze r9,r9 @@ -678,7 +681,7 @@ $data=<<EOF; addc r11,r7,r11 adde r9,r8,r9 addze r10,r10 - + addc r11,r7,r11 adde r9,r8,r9 addze r10,r10 @@ -698,7 +701,7 @@ $data=<<EOF; $LD r5,`2*$BNSZ`(r4) $UMULL r7,r5,r6 $UMULH r8,r5,r6 - + addc r9,r7,r9 adde r10,r8,r10 addze r11,r0 @@ -795,7 +798,7 @@ $data=<<EOF; adde r10,r8,r10 addze r11,r11 $ST r9,`12*$BNSZ`(r3) #r[12]=c1; - + #sqr_add_c2(a,7,6,c2,c3,c1) $LD r5,`6*$BNSZ`(r4) $UMULL r7,r5,r6 @@ -844,21 +847,21 @@ $data=<<EOF; # xor r0,r0,r0 #r0=0. Used in addze below. 
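Note that bn_sqr_comba8 above takes the other standard route for the doubled terms: instead of shifting the product left first, it adds the undoubled (r8,r7) pair into the three-word accumulator twice, which needs no separate lost-bit fixup. In the same scalar model:

static void sqr_add_c2_twice(const uint32_t *a, int i, int j,
                             uint32_t *c1, uint32_t *c2, uint32_t *c3)
{
    for (int k = 0; k < 2; k++) {
        uint64_t p = (uint64_t)a[i] * a[j];
        uint64_t t = (uint64_t)*c1 + (uint32_t)p;
        *c1 = (uint32_t)t;
        t = (uint64_t)*c2 + (uint32_t)(p >> 32) + (uint32_t)(t >> 32);
        *c2 = (uint32_t)t;
        *c3 += (uint32_t)(t >> 32);
    }
}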
#mul_add_c(a[0],b[0],c1,c2,c3); - $LD r6,`0*$BNSZ`(r4) - $LD r7,`0*$BNSZ`(r5) - $UMULL r10,r6,r7 - $UMULH r11,r6,r7 + $LD r6,`0*$BNSZ`(r4) + $LD r7,`0*$BNSZ`(r5) + $UMULL r10,r6,r7 + $UMULH r11,r6,r7 $ST r10,`0*$BNSZ`(r3) #r[0]=c1 #mul_add_c(a[0],b[1],c2,c3,c1); - $LD r7,`1*$BNSZ`(r5) + $LD r7,`1*$BNSZ`(r5) $UMULL r8,r6,r7 $UMULH r9,r6,r7 addc r11,r8,r11 adde r12,r9,r0 addze r10,r0 #mul_add_c(a[1],b[0],c2,c3,c1); - $LD r6, `1*$BNSZ`(r4) - $LD r7, `0*$BNSZ`(r5) + $LD r6, `1*$BNSZ`(r4) + $LD r7, `0*$BNSZ`(r5) $UMULL r8,r6,r7 $UMULH r9,r6,r7 addc r11,r8,r11 @@ -866,23 +869,23 @@ $data=<<EOF; addze r10,r10 $ST r11,`1*$BNSZ`(r3) #r[1]=c2 #mul_add_c(a[2],b[0],c3,c1,c2); - $LD r6,`2*$BNSZ`(r4) + $LD r6,`2*$BNSZ`(r4) $UMULL r8,r6,r7 $UMULH r9,r6,r7 addc r12,r8,r12 adde r10,r9,r10 addze r11,r0 #mul_add_c(a[1],b[1],c3,c1,c2); - $LD r6,`1*$BNSZ`(r4) - $LD r7,`1*$BNSZ`(r5) + $LD r6,`1*$BNSZ`(r4) + $LD r7,`1*$BNSZ`(r5) $UMULL r8,r6,r7 $UMULH r9,r6,r7 addc r12,r8,r12 adde r10,r9,r10 addze r11,r11 #mul_add_c(a[0],b[2],c3,c1,c2); - $LD r6,`0*$BNSZ`(r4) - $LD r7,`2*$BNSZ`(r5) + $LD r6,`0*$BNSZ`(r4) + $LD r7,`2*$BNSZ`(r5) $UMULL r8,r6,r7 $UMULH r9,r6,r7 addc r12,r8,r12 @@ -890,7 +893,7 @@ $data=<<EOF; addze r11,r11 $ST r12,`2*$BNSZ`(r3) #r[2]=c3 #mul_add_c(a[0],b[3],c1,c2,c3); - $LD r7,`3*$BNSZ`(r5) + $LD r7,`3*$BNSZ`(r5) $UMULL r8,r6,r7 $UMULH r9,r6,r7 addc r10,r8,r10 @@ -922,7 +925,7 @@ $data=<<EOF; addze r12,r12 $ST r10,`3*$BNSZ`(r3) #r[3]=c1 #mul_add_c(a[3],b[1],c2,c3,c1); - $LD r7,`1*$BNSZ`(r5) + $LD r7,`1*$BNSZ`(r5) $UMULL r8,r6,r7 $UMULH r9,r6,r7 addc r11,r8,r11 @@ -946,7 +949,7 @@ $data=<<EOF; addze r10,r10 $ST r11,`4*$BNSZ`(r3) #r[4]=c2 #mul_add_c(a[2],b[3],c3,c1,c2); - $LD r6,`2*$BNSZ`(r4) + $LD r6,`2*$BNSZ`(r4) $UMULL r8,r6,r7 $UMULH r9,r6,r7 addc r12,r8,r12 @@ -962,7 +965,7 @@ $data=<<EOF; addze r11,r11 $ST r12,`5*$BNSZ`(r3) #r[5]=c3 #mul_add_c(a[3],b[3],c1,c2,c3); - $LD r7,`3*$BNSZ`(r5) + $LD r7,`3*$BNSZ`(r5) $UMULL r8,r6,r7 $UMULH r9,r6,r7 addc r10,r8,r10 @@ -982,7 +985,7 @@ $data=<<EOF; # for the gcc compiler. This should be automatically # done in the build # - + .align 4 .bn_mul_comba8: # @@ -997,7 +1000,7 @@ $data=<<EOF; # r10, r11, r12 are the equivalents of c1, c2, and c3. # xor r0,r0,r0 #r0=0. Used in addze below. - + #mul_add_c(a[0],b[0],c1,c2,c3); $LD r6,`0*$BNSZ`(r4) #a[0] $LD r7,`0*$BNSZ`(r5) #b[0] @@ -1009,7 +1012,7 @@ $data=<<EOF; $UMULL r8,r6,r7 $UMULH r9,r6,r7 addc r11,r11,r8 - addze r12,r9 # since we didnt set r12 to zero before. + addze r12,r9 # since we didn't set r12 to zero before. 
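The mul_add_c chains above implement schoolbook comba multiplication: result word r[k] collects every product a[i]*b[j] with i+j == k before the accumulator window rotates to the next column. A compact host-side model of what bn_mul_comba4 computes, with wide host arithmetic standing in for the addc/adde/addze ripples (a sketch, not the generated code):

#include <stdint.h>

static void bn_mul_comba4_model(uint32_t r[8],
                                const uint32_t a[4], const uint32_t b[4])
{
    uint64_t carry = 0;                 /* rotating c2:c3 between columns */
    for (int k = 0; k < 7; k++) {
        unsigned __int128 col = carry;
        for (int i = 0; i <= k; i++)
            if (i < 4 && k - i < 4)
                col += (unsigned __int128)a[i] * b[k - i];
        r[k]  = (uint32_t)col;
        carry = (uint64_t)(col >> 32);
    }
    r[7] = (uint32_t)carry;             /* top word cannot overflow */
}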
addze r10,r0 #mul_add_c(a[1],b[0],c2,c3,c1); $LD r6,`1*$BNSZ`(r4) @@ -1059,7 +1062,7 @@ $data=<<EOF; addc r10,r10,r8 adde r11,r11,r9 addze r12,r12 - + #mul_add_c(a[2],b[1],c1,c2,c3); $LD r6,`2*$BNSZ`(r4) $LD r7,`1*$BNSZ`(r5) @@ -1125,7 +1128,7 @@ $data=<<EOF; adde r10,r10,r9 addze r11,r0 #mul_add_c(a[1],b[4],c3,c1,c2); - $LD r6,`1*$BNSZ`(r4) + $LD r6,`1*$BNSZ`(r4) $LD r7,`4*$BNSZ`(r5) $UMULL r8,r6,r7 $UMULH r9,r6,r7 @@ -1133,7 +1136,7 @@ $data=<<EOF; adde r10,r10,r9 addze r11,r11 #mul_add_c(a[2],b[3],c3,c1,c2); - $LD r6,`2*$BNSZ`(r4) + $LD r6,`2*$BNSZ`(r4) $LD r7,`3*$BNSZ`(r5) $UMULL r8,r6,r7 $UMULH r9,r6,r7 @@ -1141,7 +1144,7 @@ $data=<<EOF; adde r10,r10,r9 addze r11,r11 #mul_add_c(a[3],b[2],c3,c1,c2); - $LD r6,`3*$BNSZ`(r4) + $LD r6,`3*$BNSZ`(r4) $LD r7,`2*$BNSZ`(r5) $UMULL r8,r6,r7 $UMULH r9,r6,r7 @@ -1149,7 +1152,7 @@ $data=<<EOF; adde r10,r10,r9 addze r11,r11 #mul_add_c(a[4],b[1],c3,c1,c2); - $LD r6,`4*$BNSZ`(r4) + $LD r6,`4*$BNSZ`(r4) $LD r7,`1*$BNSZ`(r5) $UMULL r8,r6,r7 $UMULH r9,r6,r7 @@ -1157,7 +1160,7 @@ $data=<<EOF; adde r10,r10,r9 addze r11,r11 #mul_add_c(a[5],b[0],c3,c1,c2); - $LD r6,`5*$BNSZ`(r4) + $LD r6,`5*$BNSZ`(r4) $LD r7,`0*$BNSZ`(r5) $UMULL r8,r6,r7 $UMULH r9,r6,r7 @@ -1549,7 +1552,7 @@ $data=<<EOF; addi r3,r3,-$BNSZ addi r5,r5,-$BNSZ mtctr r6 -Lppcasm_sub_mainloop: +Lppcasm_sub_mainloop: $LDU r7,$BNSZ(r4) $LDU r8,$BNSZ(r5) subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8) @@ -1557,7 +1560,7 @@ Lppcasm_sub_mainloop: # is r7-r8 -1 as we need. $STU r6,$BNSZ(r3) bdnz Lppcasm_sub_mainloop -Lppcasm_sub_adios: +Lppcasm_sub_adios: subfze r3,r0 # if carry bit is set then r3 = 0 else -1 andi. r3,r3,1 # keep only last bit. blr @@ -1598,13 +1601,13 @@ Lppcasm_sub_adios: addi r3,r3,-$BNSZ addi r5,r5,-$BNSZ mtctr r6 -Lppcasm_add_mainloop: +Lppcasm_add_mainloop: $LDU r7,$BNSZ(r4) $LDU r8,$BNSZ(r5) adde r8,r7,r8 $STU r8,$BNSZ(r3) bdnz Lppcasm_add_mainloop -Lppcasm_add_adios: +Lppcasm_add_adios: addze r3,r0 #return carry bit. blr .long 0 @@ -1627,11 +1630,11 @@ Lppcasm_add_adios: # the PPC instruction to count leading zeros instead # of call to num_bits_word. Since this was compiled # only at level -O2 we can possibly squeeze it more? -# +# # r3 = h # r4 = l # r5 = d - + $UCMPI 0,r5,0 # compare r5 and 0 bne Lppcasm_div1 # proceed if d!=0 li r3,-1 # d=0 return -1 @@ -1647,7 +1650,7 @@ Lppcasm_div1: Lppcasm_div2: $UCMP 0,r3,r5 #h>=d? blt Lppcasm_div3 #goto Lppcasm_div3 if not - subf r3,r5,r3 #h-=d ; + subf r3,r5,r3 #h-=d ; Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i cmpi 0,0,r7,0 # is (i == 0)? beq Lppcasm_div4 @@ -1662,7 +1665,7 @@ Lppcasm_div4: # as it saves registers. li r6,2 #r6=2 mtctr r6 #counter will be in count. -Lppcasm_divouterloop: +Lppcasm_divouterloop: $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4) $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4 # compute here for innerloop. @@ -1670,7 +1673,7 @@ Lppcasm_divouterloop: bne Lppcasm_div5 # goto Lppcasm_div5 if not li r8,-1 - $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l + $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l b Lppcasm_div6 Lppcasm_div5: $UDIV r8,r3,r9 #q = h/dh @@ -1678,7 +1681,7 @@ Lppcasm_div6: $UMULL r12,r9,r8 #th = q*dh $CLRU r10,r5,`$BITS/2` #r10=dl $UMULL r6,r8,r10 #tl = q*dl - + Lppcasm_divinnerloop: subf r10,r12,r3 #t = h -th $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of... 
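The "onescomplement" comment in Lppcasm_sub_mainloop is the key to the borrow handling: subfe computes r7 + ~r8 + CA, i.e. r7 - r8 - (1 - CA), so PPC's carry flag acts as an inverted borrow, and the closing subfze/andi. pair converts it back into a 0/1 return value. A functional C model, assuming 32-bit words:

#include <stdint.h>

static uint32_t bn_sub_words_model(uint32_t *r, const uint32_t *a,
                                   const uint32_t *b, int n)
{
    uint32_t borrow = 0;
    for (int i = 0; i < n; i++) {
        uint64_t d = (uint64_t)a[i] - b[i] - borrow;
        r[i]   = (uint32_t)d;
        borrow = (uint32_t)(d >> 63);   /* 1 if we wrapped below zero */
    }
    return borrow;
}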
@@ -1755,7 +1758,7 @@ Lppcasm_div9: addi r4,r4,-$BNSZ addi r3,r3,-$BNSZ mtctr r5 -Lppcasm_sqr_mainloop: +Lppcasm_sqr_mainloop: #sqr(r[0],r[1],a[0]); $LDU r6,$BNSZ(r4) $UMULL r7,r6,r6 @@ -1763,7 +1766,7 @@ Lppcasm_sqr_mainloop: $STU r7,$BNSZ(r3) $STU r8,$BNSZ(r3) bdnz Lppcasm_sqr_mainloop -Lppcasm_sqr_adios: +Lppcasm_sqr_adios: blr .long 0 .byte 0,12,0x14,0,0,0,3,0 @@ -1777,7 +1780,7 @@ Lppcasm_sqr_adios: # done in the build # -.align 4 +.align 4 .bn_mul_words: # # BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) @@ -1791,7 +1794,7 @@ Lppcasm_sqr_adios: rlwinm. r7,r5,30,2,31 # num >> 2 beq Lppcasm_mw_REM mtctr r7 -Lppcasm_mw_LOOP: +Lppcasm_mw_LOOP: #mul(rp[0],ap[0],w,c1); $LD r8,`0*$BNSZ`(r4) $UMULL r9,r6,r8 @@ -1803,7 +1806,7 @@ Lppcasm_mw_LOOP: #using adde. $ST r9,`0*$BNSZ`(r3) #mul(rp[1],ap[1],w,c1); - $LD r8,`1*$BNSZ`(r4) + $LD r8,`1*$BNSZ`(r4) $UMULL r11,r6,r8 $UMULH r12,r6,r8 adde r11,r11,r10 @@ -1824,7 +1827,7 @@ Lppcasm_mw_LOOP: addze r12,r12 #this spin we collect carry into #r12 $ST r11,`3*$BNSZ`(r3) - + addi r3,r3,`4*$BNSZ` addi r4,r4,`4*$BNSZ` bdnz Lppcasm_mw_LOOP @@ -1840,25 +1843,25 @@ Lppcasm_mw_REM: addze r10,r10 $ST r9,`0*$BNSZ`(r3) addi r12,r10,0 - + addi r5,r5,-1 cmpli 0,0,r5,0 beq Lppcasm_mw_OVER - + #mul(rp[1],ap[1],w,c1); - $LD r8,`1*$BNSZ`(r4) + $LD r8,`1*$BNSZ`(r4) $UMULL r9,r6,r8 $UMULH r10,r6,r8 addc r9,r9,r12 addze r10,r10 $ST r9,`1*$BNSZ`(r3) addi r12,r10,0 - + addi r5,r5,-1 cmpli 0,0,r5,0 beq Lppcasm_mw_OVER - + #mul_add(rp[2],ap[2],w,c1); $LD r8,`2*$BNSZ`(r4) $UMULL r9,r6,r8 @@ -1867,14 +1870,14 @@ Lppcasm_mw_REM: addze r10,r10 $ST r9,`2*$BNSZ`(r3) addi r12,r10,0 - -Lppcasm_mw_OVER: + +Lppcasm_mw_OVER: addi r3,r12,0 blr .long 0 .byte 0,12,0x14,0,0,0,4,0 .long 0 -.size bn_mul_words,.-bn_mul_words +.size .bn_mul_words,.-.bn_mul_words # # NOTE: The following label name should be changed to @@ -1896,11 +1899,11 @@ Lppcasm_mw_OVER: # empirical evidence suggests that unrolled version performs best!! # xor r0,r0,r0 #r0 = 0 - xor r12,r12,r12 #r12 = 0 . used for carry + xor r12,r12,r12 #r12 = 0 . used for carry rlwinm. r7,r5,30,2,31 # num >> 2 beq Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover mtctr r7 -Lppcasm_maw_mainloop: +Lppcasm_maw_mainloop: #mul_add(rp[0],ap[0],w,c1); $LD r8,`0*$BNSZ`(r4) $LD r11,`0*$BNSZ`(r3) @@ -1916,9 +1919,9 @@ Lppcasm_maw_mainloop: #by multiply and will be collected #in the next spin $ST r9,`0*$BNSZ`(r3) - + #mul_add(rp[1],ap[1],w,c1); - $LD r8,`1*$BNSZ`(r4) + $LD r8,`1*$BNSZ`(r4) $LD r9,`1*$BNSZ`(r3) $UMULL r11,r6,r8 $UMULH r12,r6,r8 @@ -1927,7 +1930,7 @@ Lppcasm_maw_mainloop: addc r11,r11,r9 #addze r12,r12 $ST r11,`1*$BNSZ`(r3) - + #mul_add(rp[2],ap[2],w,c1); $LD r8,`2*$BNSZ`(r4) $UMULL r9,r6,r8 @@ -1938,7 +1941,7 @@ Lppcasm_maw_mainloop: addc r9,r9,r11 #addze r10,r10 $ST r9,`2*$BNSZ`(r3) - + #mul_add(rp[3],ap[3],w,c1); $LD r8,`3*$BNSZ`(r4) $UMULL r11,r6,r8 @@ -1952,7 +1955,7 @@ Lppcasm_maw_mainloop: addi r3,r3,`4*$BNSZ` addi r4,r4,`4*$BNSZ` bdnz Lppcasm_maw_mainloop - + Lppcasm_maw_leftover: andi. 
r5,r5,0x3 beq Lppcasm_maw_adios @@ -1969,10 +1972,10 @@ Lppcasm_maw_leftover: addc r9,r9,r12 addze r12,r10 $ST r9,0(r3) - + bdz Lppcasm_maw_adios #mul_add(rp[1],ap[1],w,c1); - $LDU r8,$BNSZ(r4) + $LDU r8,$BNSZ(r4) $UMULL r9,r6,r8 $UMULH r10,r6,r8 $LDU r11,$BNSZ(r3) @@ -1981,7 +1984,7 @@ Lppcasm_maw_leftover: addc r9,r9,r12 addze r12,r10 $ST r9,0(r3) - + bdz Lppcasm_maw_adios #mul_add(rp[2],ap[2],w,c1); $LDU r8,$BNSZ(r4) @@ -1993,8 +1996,8 @@ Lppcasm_maw_leftover: addc r9,r9,r12 addze r12,r10 $ST r9,0(r3) - -Lppcasm_maw_adios: + +Lppcasm_maw_adios: addi r3,r12,0 blr .long 0 diff --git a/crypto/bn/asm/ppc64-mont.pl b/crypto/bn/asm/ppc64-mont.pl index 595fc6d31f60..c41b620bc23e 100755 --- a/crypto/bn/asm/ppc64-mont.pl +++ b/crypto/bn/asm/ppc64-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -28,7 +35,7 @@ # key lengths. As it's obviously inappropriate as "best all-round" # alternative, it has to be complemented with run-time CPU family # detection. Oh! It should also be noted that unlike other PowerPC -# implementation IALU ppc-mont.pl module performs *suboptimaly* on +# implementation IALU ppc-mont.pl module performs *suboptimally* on # >=1024-bit key lengths on Power 6. It should also be noted that # *everything* said so far applies to 64-bit builds! As far as 32-bit # application executed on 64-bit CPU goes, this module is likely to @@ -1346,7 +1353,7 @@ $code.=<<___; std $t3,-16($tp) ; tp[j-1] std $t5,-8($tp) ; tp[j] - add $carry,$carry,$ovf ; comsume upmost overflow + add $carry,$carry,$ovf ; consume upmost overflow add $t6,$t6,$carry ; can not overflow srdi $carry,$t6,16 add $t7,$t7,$carry @@ -1494,16 +1501,14 @@ Lsub: ldx $t0,$tp,$i li $i,0 subfe $ovf,$i,$ovf ; handle upmost overflow bit - and $ap,$tp,$ovf - andc $np,$rp,$ovf - or $ap,$ap,$np ; ap=borrow?tp:rp - addi $t7,$ap,8 mtctr $j .align 4 -Lcopy: ; copy or in-place refresh - ldx $t0,$ap,$i - ldx $t1,$t7,$i +Lcopy: ; conditional copy + ldx $t0,$tp,$i + ldx $t1,$t4,$i + ldx $t2,$rp,$i + ldx $t3,$t6,$i std $i,8($nap_d) ; zap nap_d std $i,16($nap_d) std $i,24($nap_d) @@ -1512,6 +1517,12 @@ Lcopy: ; copy or in-place refresh std $i,48($nap_d) std $i,56($nap_d) stdu $i,64($nap_d) + and $t0,$t0,$ovf + and $t1,$t1,$ovf + andc $t2,$t2,$ovf + andc $t3,$t3,$ovf + or $t0,$t0,$t2 + or $t1,$t1,$t3 stdx $t0,$rp,$i stdx $t1,$t6,$i stdx $i,$tp,$i ; zap tp at once @@ -1554,20 +1565,21 @@ Lsub: lwz $t0,12($tp) ; load tp[j..j+3] in 64-bit word order li $i,0 subfe $ovf,$i,$ovf ; handle upmost overflow bit - addi $tp,$sp,`$FRAME+$TRANSFER+4` + addi $ap,$sp,`$FRAME+$TRANSFER+4` subf $rp,$num,$rp ; rewind rp - and $ap,$tp,$ovf - andc $np,$rp,$ovf - or $ap,$ap,$np ; ap=borrow?tp:rp addi $tp,$sp,`$FRAME+$TRANSFER` mtctr $j .align 4 -Lcopy: ; copy or in-place refresh +Lcopy: ; conditional copy lwz $t0,4($ap) lwz $t1,8($ap) lwz $t2,12($ap) lwzu $t3,16($ap) + lwz $t4,4($rp) + lwz $t5,8($rp) + lwz $t6,12($rp) + lwz $t7,16($rp) std $i,8($nap_d) ; zap nap_d std $i,16($nap_d) std $i,24($nap_d) @@ -1576,6 +1588,18 @@ Lcopy: ; copy or in-place refresh std $i,48($nap_d) std $i,56($nap_d) stdu 
$i,64($nap_d) + and $t0,$t0,$ovf + and $t1,$t1,$ovf + and $t2,$t2,$ovf + and $t3,$t3,$ovf + andc $t4,$t4,$ovf + andc $t5,$t5,$ovf + andc $t6,$t6,$ovf + andc $t7,$t7,$ovf + or $t0,$t0,$t4 + or $t1,$t1,$t5 + or $t2,$t2,$t6 + or $t3,$t3,$t7 stw $t0,4($rp) stw $t1,8($rp) stw $t2,12($rp) diff --git a/crypto/bn/asm/rsaz-avx2.pl b/crypto/bn/asm/rsaz-avx2.pl index 2b3f8b0e21ec..f1292cc75cfb 100755 --- a/crypto/bn/asm/rsaz-avx2.pl +++ b/crypto/bn/asm/rsaz-avx2.pl @@ -1,61 +1,30 @@ -#!/usr/bin/env perl - -############################################################################## -# # -# Copyright (c) 2012, Intel Corporation # -# # -# All rights reserved. # -# # -# Redistribution and use in source and binary forms, with or without # -# modification, are permitted provided that the following conditions are # -# met: # -# # -# * Redistributions of source code must retain the above copyright # -# notice, this list of conditions and the following disclaimer. # -# # -# * Redistributions in binary form must reproduce the above copyright # -# notice, this list of conditions and the following disclaimer in the # -# documentation and/or other materials provided with the # -# distribution. # -# # -# * Neither the name of the Intel Corporation nor the names of its # -# contributors may be used to endorse or promote products derived from # -# this software without specific prior written permission. # -# # -# # -# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY # -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR # -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # -# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -# # -############################################################################## -# Developers and authors: # -# Shay Gueron (1, 2), and Vlad Krasnov (1) # -# (1) Intel Corporation, Israel Development Center, Haifa, Israel # -# (2) University of Haifa, Israel # -############################################################################## -# Reference: # -# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular # -# Exponentiation, Using Advanced Vector Instructions Architectures", # -# F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369, # -# pp. 119?135, 2012. Springer-Verlag Berlin Heidelberg 2012 # -# [2] S. Gueron: "Efficient Software Implementations of Modular # -# Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012). # -# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE # -# Proceedings of 9th International Conference on Information Technology: # -# New Generations (ITNG 2012), pp.821-823 (2012) # -# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis # -# resistant 1024-bit modular exponentiation, for optimizing RSA2048 # -# on AVX2 capable x86_64 platforms", # -# http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest# -############################################################################## +#! 
/usr/bin/env perl +# Copyright 2013-2018 The OpenSSL Project Authors. All Rights Reserved. +# Copyright (c) 2012, Intel Corporation. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) +# (1) Intel Corporation, Israel Development Center, Haifa, Israel +# (2) University of Haifa, Israel +# +# References: +# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular +# Exponentiation, Using Advanced Vector Instructions Architectures", +# F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369, +# pp. 119?135, 2012. Springer-Verlag Berlin Heidelberg 2012 +# [2] S. Gueron: "Efficient Software Implementations of Modular +# Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012). +# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE +# Proceedings of 9th International Conference on Information Technology: +# New Generations (ITNG 2012), pp.821-823 (2012) +# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis +# resistant 1024-bit modular exponentiation, for optimizing RSA2048 +# on AVX2 capable x86_64 platforms", +# http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest # # +13% improvement over original submission by <appro@openssl.org> # @@ -97,13 +66,13 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && $addx = ($1>=11); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) { my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 $avx = ($ver>=3.0) + ($ver>=3.01); $addx = ($ver>=3.03); } -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT = *OUT; if ($avx>1) {{{ @@ -161,13 +130,21 @@ $code.=<<___; .type rsaz_1024_sqr_avx2,\@function,5 .align 64 rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2 +.cfi_startproc lea (%rsp), %rax +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 vzeroupper ___ $code.=<<___ if ($win64); @@ -186,6 +163,7 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov %rax,%rbp +.cfi_def_cfa_register %rbp mov %rdx, $np # reassigned argument sub \$$FrameSize, %rsp mov $np, $tmp @@ -375,7 +353,7 @@ $code.=<<___; vpaddq $TEMP1, $ACC1, $ACC1 vpmuludq 32*7-128($aap), $B2, $ACC2 vpbroadcastq 32*5-128($tpa), $B2 - vpaddq 32*11-448($tp1), $ACC2, $ACC2 + vpaddq 32*11-448($tp1), $ACC2, $ACC2 vmovdqu $ACC6, 32*6-192($tp0) vmovdqu $ACC7, 32*7-192($tp0) @@ -434,7 +412,7 @@ $code.=<<___; vmovdqu $ACC7, 32*16-448($tp1) lea 8($tp1), $tp1 - dec $i + dec $i jnz .LOOP_SQR_1024 ___ $ZERO = $ACC9; @@ -779,7 +757,7 @@ $code.=<<___; vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 vpaddq $TEMP3, $ACC7, $ACC7 vpaddq $TEMP4, $ACC8, $ACC8 - + vpsrlq \$29, $ACC4, $TEMP1 vpand $AND_MASK, $ACC4, $ACC4 vpsrlq \$29, $ACC5, $TEMP2 @@ -818,8 +796,10 @@ $code.=<<___; vzeroall mov %rbp, %rax +.cfi_def_cfa_register %rax ___ $code.=<<___ if ($win64); +.Lsqr_1024_in_tail: movaps -0xd8(%rax),%xmm6 movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 @@ -833,14 +813,22 @@ $code.=<<___ if ($win64); ___ 
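Behind the rsaz_1024_sqr_avx2 prologue above sits the representation that makes the AVX2 path work: the 1024-bit operand is repacked into 29-bit digits so that vpmuludq's 32x32->64-bit products can be summed across a whole row before any carry propagation (see the Gueron-Krasnov references in the header). A host-side sketch of such a repacking; to_radix29 and the 36-digit layout are assumptions drawn from the cited papers, not this module's own conversion code:

#include <stdint.h>

#define RSAZ_DIGITS 36                      /* ceil(1024 / 29) */

static void to_radix29(uint32_t d[RSAZ_DIGITS], const uint64_t n[16])
{
    for (int i = 0; i < RSAZ_DIGITS; i++) {
        int bit = 29 * i, w = bit / 64, off = bit % 64;
        uint64_t v = n[w] >> off;
        if (off > 64 - 29 && w + 1 < 16)    /* digit straddles two words */
            v |= n[w + 1] << (64 - off);
        d[i] = (uint32_t)v & ((1u << 29) - 1);
    }
}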
$code.=<<___; mov -48(%rax),%r15 +.cfi_restore %r15 mov -40(%rax),%r14 +.cfi_restore %r14 mov -32(%rax),%r13 +.cfi_restore %r13 mov -24(%rax),%r12 +.cfi_restore %r12 mov -16(%rax),%rbp +.cfi_restore %rbp mov -8(%rax),%rbx +.cfi_restore %rbx lea (%rax),%rsp # restore %rsp +.cfi_def_cfa_register %rsp .Lsqr_1024_epilogue: ret +.cfi_endproc .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 ___ } @@ -893,13 +881,21 @@ $code.=<<___; .type rsaz_1024_mul_avx2,\@function,5 .align 64 rsaz_1024_mul_avx2: +.cfi_startproc lea (%rsp), %rax +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 ___ $code.=<<___ if ($win64); vzeroupper @@ -918,6 +914,7 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov %rax,%rbp +.cfi_def_cfa_register %rbp vzeroall mov %rdx, $bp # reassigned argument sub \$64,%rsp @@ -1443,15 +1440,17 @@ $code.=<<___; vpaddq $TEMP4, $ACC8, $ACC8 vmovdqu $ACC4, 128-128($rp) - vmovdqu $ACC5, 160-128($rp) + vmovdqu $ACC5, 160-128($rp) vmovdqu $ACC6, 192-128($rp) vmovdqu $ACC7, 224-128($rp) vmovdqu $ACC8, 256-128($rp) vzeroupper mov %rbp, %rax +.cfi_def_cfa_register %rax ___ $code.=<<___ if ($win64); +.Lmul_1024_in_tail: movaps -0xd8(%rax),%xmm6 movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 @@ -1465,14 +1464,22 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov -48(%rax),%r15 +.cfi_restore %r15 mov -40(%rax),%r14 +.cfi_restore %r14 mov -32(%rax),%r13 +.cfi_restore %r13 mov -24(%rax),%r12 +.cfi_restore %r12 mov -16(%rax),%rbp +.cfi_restore %rbp mov -8(%rax),%rbx +.cfi_restore %rbx lea (%rax),%rsp # restore %rsp +.cfi_def_cfa_register %rsp .Lmul_1024_epilogue: ret +.cfi_endproc .size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 ___ } @@ -1591,8 +1598,10 @@ rsaz_1024_scatter5_avx2: .type rsaz_1024_gather5_avx2,\@abi-omnipotent .align 32 rsaz_1024_gather5_avx2: +.cfi_startproc vzeroupper mov %rsp,%r11 +.cfi_def_cfa_register %r11 ___ $code.=<<___ if ($win64); lea -0x88(%rsp),%rax @@ -1730,11 +1739,13 @@ $code.=<<___ if ($win64); movaps -0x38(%r11),%xmm13 movaps -0x28(%r11),%xmm14 movaps -0x18(%r11),%xmm15 -.LSEH_end_rsaz_1024_gather5: ___ $code.=<<___; lea (%r11),%rsp +.cfi_def_cfa_register %rsp ret +.cfi_endproc +.LSEH_end_rsaz_1024_gather5: .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 ___ } @@ -1807,14 +1818,17 @@ rsaz_se_handler: cmp %r10,%rbx # context->Rip<prologue label jb .Lcommon_seh_tail - mov 152($context),%rax # pull context->Rsp - mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail - mov 160($context),%rax # pull context->Rbp + mov 160($context),%rbp # pull context->Rbp + + mov 8(%r11),%r10d # HandlerData[2] + lea (%rsi,%r10),%r10 # "in tail" label + cmp %r10,%rbx # context->Rip>="in tail" label + cmovc %rbp,%rax mov -48(%rax),%r15 mov -40(%rax),%r14 @@ -1892,11 +1906,13 @@ rsaz_se_handler: .LSEH_info_rsaz_1024_sqr_avx2: .byte 9,0,0,0 .rva rsaz_se_handler - .rva .Lsqr_1024_body,.Lsqr_1024_epilogue + .rva .Lsqr_1024_body,.Lsqr_1024_epilogue,.Lsqr_1024_in_tail + .long 0 .LSEH_info_rsaz_1024_mul_avx2: .byte 9,0,0,0 .rva rsaz_se_handler - .rva .Lmul_1024_body,.Lmul_1024_epilogue + .rva .Lmul_1024_body,.Lmul_1024_epilogue,.Lmul_1024_in_tail + .long 0 .LSEH_info_rsaz_1024_gather5: .byte 0x01,0x36,0x17,0x0b .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 diff --git a/crypto/bn/asm/rsaz-x86_64.pl b/crypto/bn/asm/rsaz-x86_64.pl index 87ce2c34d90c..b1797b649f00 100755 --- 
a/crypto/bn/asm/rsaz-x86_64.pl +++ b/crypto/bn/asm/rsaz-x86_64.pl @@ -1,61 +1,29 @@ -#!/usr/bin/env perl - -############################################################################## -# # -# Copyright (c) 2012, Intel Corporation # -# # -# All rights reserved. # -# # -# Redistribution and use in source and binary forms, with or without # -# modification, are permitted provided that the following conditions are # -# met: # -# # -# * Redistributions of source code must retain the above copyright # -# notice, this list of conditions and the following disclaimer. # -# # -# * Redistributions in binary form must reproduce the above copyright # -# notice, this list of conditions and the following disclaimer in the # -# documentation and/or other materials provided with the # -# distribution. # -# # -# * Neither the name of the Intel Corporation nor the names of its # -# contributors may be used to endorse or promote products derived from # -# this software without specific prior written permission. # -# # -# # -# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY # -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR # -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # -# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -# # -############################################################################## -# Developers and authors: # -# Shay Gueron (1, 2), and Vlad Krasnov (1) # -# (1) Intel Architecture Group, Microprocessor and Chipset Development, # -# Israel Development Center, Haifa, Israel # -# (2) University of Haifa # -############################################################################## -# Reference: # -# [1] S. Gueron, "Efficient Software Implementations of Modular # -# Exponentiation", http://eprint.iacr.org/2011/239 # -# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". # -# IEEE Proceedings of 9th International Conference on Information # -# Technology: New Generations (ITNG 2012), 821-823 (2012). # -# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation# -# Journal of Cryptographic Engineering 2:31-43 (2012). # -# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis # -# resistant 512-bit and 1024-bit modular exponentiation for optimizing # -# RSA1024 and RSA2048 on x86_64 platforms", # -# http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest# -############################################################################## - +#! /usr/bin/env perl +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# Copyright (c) 2012, Intel Corporation. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) +# (1) Intel Corporation, Israel Development Center, Haifa, Israel +# (2) University of Haifa, Israel +# +# References: +# [1] S. Gueron, "Efficient Software Implementations of Modular +# Exponentiation", http://eprint.iacr.org/2011/239 +# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". +# IEEE Proceedings of 9th International Conference on Information +# Technology: New Generations (ITNG 2012), 821-823 (2012). +# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation +# Journal of Cryptographic Engineering 2:31-43 (2012). +# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis +# resistant 512-bit and 1024-bit modular exponentiation for optimizing +# RSA1024 and RSA2048 on x86_64 platforms", +# http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest +# # While original submission covers 512- and 1024-bit exponentiation, # this module is limited to 512-bit version only (and as such # accelerates RSA1024 sign). This is because improvement for longer @@ -95,7 +63,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` @@ -131,14 +99,22 @@ $code.=<<___; .type rsaz_512_sqr,\@function,5 .align 32 rsaz_512_sqr: # 25-29% faster than rsaz_512_mul +.cfi_startproc push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 subq \$128+24, %rsp +.cfi_adjust_cfa_offset 128+24 .Lsqr_body: movq $mod, %rbp # common argument movq ($inp), %rdx @@ -275,9 +251,9 @@ $code.=<<___; movq %r9, 16(%rsp) movq %r10, 24(%rsp) shrq \$63, %rbx - + #third iteration - movq 16($inp), %r9 + movq 16($inp), %r9 movq 24($inp), %rax mulq %r9 addq %rax, %r12 @@ -525,7 +501,7 @@ $code.=<<___; movl $times,128+8(%rsp) movq $out, %xmm0 # off-load movq %rbp, %xmm1 # off-load -#first iteration +#first iteration mulx %rax, %r8, %r9 mulx 16($inp), %rcx, %r10 @@ -561,7 +537,7 @@ $code.=<<___; mov %rax, (%rsp) mov %r8, 8(%rsp) -#second iteration +#second iteration mulx 16($inp), %rax, %rbx adox %rax, %r10 adcx %rbx, %r11 @@ -600,8 +576,8 @@ $code.=<<___; mov %r9, 16(%rsp) .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp) - -#third iteration + +#third iteration .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9 adox $out, %r12 adcx %r9, %r13 @@ -636,8 +612,8 @@ $code.=<<___; mov %r11, 32(%rsp) .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp) - -#fourth iteration + +#fourth iteration .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx adox %rax, %r14 adcx %rbx, %r15 @@ -669,8 +645,8 @@ $code.=<<___; mov %r13, 48(%rsp) mov %r14, 56(%rsp) - -#fifth iteration + +#fifth iteration .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11 adox $out, %r8 adcx %r11, %r9 @@ -697,8 +673,8 @@ $code.=<<___; mov %r15, 64(%rsp) mov %r8, 72(%rsp) - -#sixth iteration + +#sixth iteration .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx adox %rax, %r10 adcx %rbx, %r11 @@ -793,15 +769,24 @@ ___ 
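The numbered iterations above lean on the ADX extension: mulx produces a 128-bit product without touching the flags, while adcx and adox ripple two independent carry chains through CF and OF, so the squaring can interleave the low and high halves of each product without serializing on a single flag. A sketch of one such step with the compiler intrinsics (requires BMI2 and ADX; the explicit cf/of bytes model the two hardware flag chains):

#include <immintrin.h>

static void mac_step(unsigned long long a, unsigned long long b,
                     unsigned long long *lo_acc, unsigned long long *hi_acc,
                     unsigned char *cf, unsigned char *of)
{
    unsigned long long hi;
    unsigned long long lo = _mulx_u64(a, b, &hi);    /* flags untouched */
    *of = _addcarryx_u64(*of, *lo_acc, lo, lo_acc);  /* adox-style chain */
    *cf = _addcarryx_u64(*cf, *hi_acc, hi, hi_acc);  /* adcx-style chain */
}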
$code.=<<___; leaq 128+24+48(%rsp), %rax +.cfi_def_cfa %rax,8 movq -48(%rax), %r15 +.cfi_restore %r15 movq -40(%rax), %r14 +.cfi_restore %r14 movq -32(%rax), %r13 +.cfi_restore %r13 movq -24(%rax), %r12 +.cfi_restore %r12 movq -16(%rax), %rbp +.cfi_restore %rbp movq -8(%rax), %rbx +.cfi_restore %rbx leaq (%rax), %rsp +.cfi_def_cfa_register %rsp .Lsqr_epilogue: ret +.cfi_endproc .size rsaz_512_sqr,.-rsaz_512_sqr ___ } @@ -812,14 +797,22 @@ $code.=<<___; .type rsaz_512_mul,\@function,5 .align 32 rsaz_512_mul: +.cfi_startproc push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 subq \$128+24, %rsp +.cfi_adjust_cfa_offset 128+24 .Lmul_body: movq $out, %xmm0 # off-load arguments movq $mod, %xmm1 @@ -889,15 +882,24 @@ $code.=<<___; call __rsaz_512_subtract leaq 128+24+48(%rsp), %rax +.cfi_def_cfa %rax,8 movq -48(%rax), %r15 +.cfi_restore %r15 movq -40(%rax), %r14 +.cfi_restore %r14 movq -32(%rax), %r13 +.cfi_restore %r13 movq -24(%rax), %r12 +.cfi_restore %r12 movq -16(%rax), %rbp +.cfi_restore %rbp movq -8(%rax), %rbx +.cfi_restore %rbx leaq (%rax), %rsp +.cfi_def_cfa_register %rsp .Lmul_epilogue: ret +.cfi_endproc .size rsaz_512_mul,.-rsaz_512_mul ___ } @@ -908,14 +910,22 @@ $code.=<<___; .type rsaz_512_mul_gather4,\@function,6 .align 32 rsaz_512_mul_gather4: +.cfi_startproc push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 subq \$`128+24+($win64?0xb0:0)`, %rsp +.cfi_adjust_cfa_offset `128+24+($win64?0xb0:0)` ___ $code.=<<___ if ($win64); movaps %xmm6,0xa0(%rsp) @@ -1041,7 +1051,7 @@ $code.=<<___; movq 56($ap), %rax movq %rdx, %r14 adcq \$0, %r14 - + mulq %rbx addq %rax, %r14 movq ($ap), %rax @@ -1143,7 +1153,7 @@ $code.=<<___; movq ($ap), %rax adcq \$0, %rdx addq %r15, %r14 - movq %rdx, %r15 + movq %rdx, %r15 adcq \$0, %r15 leaq 8(%rdi), %rdi @@ -1205,7 +1215,7 @@ $code.=<<___ if ($addx); mulx 48($ap), %rbx, %r14 adcx %rax, %r12 - + mulx 56($ap), %rax, %r15 adcx %rbx, %r13 adcx %rax, %r14 @@ -1341,15 +1351,24 @@ $code.=<<___ if ($win64); lea 0xb0(%rax),%rax ___ $code.=<<___; +.cfi_def_cfa %rax,8 movq -48(%rax), %r15 +.cfi_restore %r15 movq -40(%rax), %r14 +.cfi_restore %r14 movq -32(%rax), %r13 +.cfi_restore %r13 movq -24(%rax), %r12 +.cfi_restore %r12 movq -16(%rax), %rbp +.cfi_restore %rbp movq -8(%rax), %rbx +.cfi_restore %rbx leaq (%rax), %rsp +.cfi_def_cfa_register %rsp .Lmul_gather4_epilogue: ret +.cfi_endproc .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4 ___ } @@ -1360,15 +1379,23 @@ $code.=<<___; .type rsaz_512_mul_scatter4,\@function,6 .align 32 rsaz_512_mul_scatter4: +.cfi_startproc push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 mov $pwr, $pwr subq \$128+24, %rsp +.cfi_adjust_cfa_offset 128+24 .Lmul_scatter4_body: leaq ($tbl,$pwr,8), $tbl movq $out, %xmm0 # off-load arguments @@ -1404,7 +1431,7 @@ $code.=<<___; ___ $code.=<<___ if ($addx); jmp .Lmul_scatter_tail - + .align 32 .Lmulx_scatter: movq ($out), %rdx # pass b[0] @@ -1451,15 +1478,24 @@ $code.=<<___; movq %r15, 128*7($inp) leaq 128+24+48(%rsp), %rax +.cfi_def_cfa %rax,8 movq -48(%rax), %r15 +.cfi_restore %r15 movq -40(%rax), %r14 +.cfi_restore %r14 movq -32(%rax), %r13 +.cfi_restore %r13 movq -24(%rax), %r12 +.cfi_restore %r12 movq -16(%rax), %rbp +.cfi_restore %rbp movq -8(%rax), %rbx +.cfi_restore %rbx leaq 
(%rax), %rsp +.cfi_def_cfa_register %rsp .Lmul_scatter4_epilogue: ret +.cfi_endproc .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4 ___ } @@ -1470,14 +1506,22 @@ $code.=<<___; .type rsaz_512_mul_by_one,\@function,4 .align 32 rsaz_512_mul_by_one: +.cfi_startproc push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 subq \$128+24, %rsp +.cfi_adjust_cfa_offset 128+24 .Lmul_by_one_body: ___ $code.=<<___ if ($addx); @@ -1532,15 +1576,24 @@ $code.=<<___; movq %r15, 56($out) leaq 128+24+48(%rsp), %rax +.cfi_def_cfa %rax,8 movq -48(%rax), %r15 +.cfi_restore %r15 movq -40(%rax), %r14 +.cfi_restore %r14 movq -32(%rax), %r13 +.cfi_restore %r13 movq -24(%rax), %r12 +.cfi_restore %r12 movq -16(%rax), %rbp +.cfi_restore %rbp movq -8(%rax), %rbx +.cfi_restore %rbx leaq (%rax), %rsp +.cfi_def_cfa_register %rsp .Lmul_by_one_epilogue: ret +.cfi_endproc .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one ___ } @@ -1767,7 +1820,7 @@ ___ { # __rsaz_512_mul # # input: %rsi - ap, %rbp - bp - # ouput: + # output: # clobbers: everything my ($ap,$bp) = ("%rsi","%rbp"); $code.=<<___; @@ -1817,7 +1870,7 @@ __rsaz_512_mul: movq 56($ap), %rax movq %rdx, %r14 adcq \$0, %r14 - + mulq %rbx addq %rax, %r14 movq ($ap), %rax @@ -1894,7 +1947,7 @@ __rsaz_512_mul: movq ($ap), %rax adcq \$0, %rdx addq %r15, %r14 - movq %rdx, %r15 + movq %rdx, %r15 adcq \$0, %r15 leaq 8(%rdi), %rdi @@ -1919,7 +1972,7 @@ if ($addx) { # __rsaz_512_mulx # # input: %rsi - ap, %rbp - bp - # ouput: + # output: # clobbers: everything my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi"); $code.=<<___; diff --git a/crypto/bn/asm/s390x-gf2m.pl b/crypto/bn/asm/s390x-gf2m.pl index 9d18d40e7784..06181bf9b95f 100755 --- a/crypto/bn/asm/s390x-gf2m.pl +++ b/crypto/bn/asm/s390x-gf2m.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -13,7 +20,7 @@ # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for # the time being... gcc 4.3 appeared to generate poor code, therefore # the effort. And indeed, the module delivers 55%-90%(*) improvement -# on haviest ECDSA verify and ECDH benchmarks for 163- and 571-bit +# on heaviest ECDSA verify and ECDH benchmarks for 163- and 571-bit # key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196. # This is for 64-bit build. In 32-bit "highgprs" case improvement is # even higher, for example on z990 it was measured 80%-150%. ECDSA @@ -35,7 +42,7 @@ if ($flavour =~ /3[12]/) { $g="g"; } -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $stdframe=16*$SIZE_T+4*8; @@ -191,7 +198,7 @@ $code.=<<___; xgr $hi,@r[1] xgr $lo,@r[0] xgr $hi,@r[2] - xgr $lo,@r[3] + xgr $lo,@r[3] xgr $hi,@r[3] xgr $lo,$hi stg $hi,16($rp) diff --git a/crypto/bn/asm/s390x-mont.pl b/crypto/bn/asm/s390x-mont.pl index 9fd64e81eef3..c2fc5adffe0d 100755 --- a/crypto/bn/asm/s390x-mont.pl +++ b/crypto/bn/asm/s390x-mont.pl @@ -1,7 +1,14 @@ -#!/usr/bin/env perl +#! 
/usr/bin/env perl +# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -54,7 +61,7 @@ if ($flavour =~ /3[12]/) { $g="g"; } -while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $stdframe=16*$SIZE_T+4*8; @@ -245,16 +252,16 @@ $code.=<<___; brct $count,.Lsub lghi $ahi,0 slbgr $AHI,$ahi # handle upmost carry - - ngr $ap,$AHI - lghi $np,-1 - xgr $np,$AHI - ngr $np,$rp - ogr $ap,$np # ap=borrow?tp:rp + lghi $NHI,-1 + xgr $NHI,$AHI la $j,0(%r0) lgr $count,$num -.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh +.Lcopy: lg $ahi,$stdframe($j,$sp) # conditional copy + lg $alo,0($j,$rp) + ngr $ahi,$AHI + ngr $alo,$NHI + ogr $alo,$ahi _dswap $alo stg $j,$stdframe($j,$sp) # zap tp stg $alo,0($j,$rp) diff --git a/crypto/bn/asm/s390x.S b/crypto/bn/asm/s390x.S index f5eebe413a28..292a7a9998bd 100755 --- a/crypto/bn/asm/s390x.S +++ b/crypto/bn/asm/s390x.S @@ -1,11 +1,11 @@ .ident "s390x.S, version 1.1" // ==================================================================== -// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL -// project. +// Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. // -// Rights for redistribution and usage in source and binary forms are -// granted according to the OpenSSL license. Warranty of any kind is -// disclaimed. +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html // ==================================================================== .text diff --git a/crypto/bn/asm/sparct4-mont.pl b/crypto/bn/asm/sparct4-mont.pl index 71b45002a42f..fcae9cfc5b44 100755 --- a/crypto/bn/asm/sparct4-mont.pl +++ b/crypto/bn/asm/sparct4-mont.pl @@ -1,9 +1,16 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2012-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== -# Written by David S. Miller <davem@devemloft.net> and Andy Polyakov -# <appro@openssl.org>. The module is licensed under 2-clause BSD -# license. November 2012. All rights reserved. +# Written by David S. Miller and Andy Polyakov +# The module is licensed under 2-clause BSD license. +# November 2012. All rights reserved. 
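The ".Lcopy: conditional copy" hunks (s390x-mont above, and the same change in every other Montgomery module in this patch) replace a branch-free select of the source *pointer* with a masked merge of both candidate words. The old code never branched either, but it read from whichever of tp/rp won the select, so the addresses touched by the copy leaked the secret borrow to a cache-timing observer. A minimal C sketch of the new pattern (illustrative names, not OpenSSL's internal API):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t BN_ULONG;

    /* borrow is 1 if the final tp - np subtraction underflowed, else 0;
     * both vectors are always read and written, so the memory trace is
     * independent of the secret borrow */
    static void cond_copy(BN_ULONG *rp, BN_ULONG *tp, size_t num,
                          BN_ULONG borrow)
    {
        BN_ULONG mask = 0 - borrow;        /* all-ones if borrow, else 0 */
        size_t i;

        for (i = 0; i < num; i++) {
            rp[i] = (tp[i] & mask) | (rp[i] & ~mask);
            tp[i] = 0;                     /* zap the temporary vector */
        }
    }

The SPARC variants below reach the same end with movcs, the x86 ones with cmov or pand/por.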
# ==================================================================== ###################################################################### @@ -76,6 +83,9 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "sparcv9_modes.pl"; +$output = pop; +open STDOUT,">$output"; + $code.=<<___; #include "sparc_arch.h" @@ -878,19 +888,17 @@ $code.=<<___; sub $tp, $num, $tp sub $rp, $num, $rp - subc $ovf, %g0, $ovf ! handle upmost overflow bit - and $tp, $ovf, $ap - andn $rp, $ovf, $np - or $np, $ap, $ap ! ap=borrow?tp:rp + subccc $ovf, %g0, $ovf ! handle upmost overflow bit ba .Lcopy sub $num, 8, $cnt .align 16 -.Lcopy: ! copy or in-place refresh - ldx [$ap+0], $t2 - add $ap, 8, $ap +.Lcopy: ! conditional copy + ldx [$tp], $tj + ldx [$rp+0], $t2 stx %g0, [$tp] ! zap add $tp, 8, $tp + movcs %icc, $tj, $t2 stx $t2, [$rp+0] add $rp, 8, $rp brnz $cnt, .Lcopy @@ -1126,19 +1134,17 @@ $code.=<<___; sub $tp, $num, $tp sub $rp, $num, $rp - subc $ovf, %g0, $ovf ! handle upmost overflow bit - and $tp, $ovf, $ap - andn $rp, $ovf, $np - or $np, $ap, $ap ! ap=borrow?tp:rp + subccc $ovf, %g0, $ovf ! handle upmost overflow bit ba .Lcopy_g5 sub $num, 8, $cnt .align 16 -.Lcopy_g5: ! copy or in-place refresh - ldx [$ap+0], $t2 - add $ap, 8, $ap +.Lcopy_g5: ! conditional copy + ldx [$tp], $tj + ldx [$rp+0], $t2 stx %g0, [$tp] ! zap add $tp, 8, $tp + movcs %icc, $tj, $t2 stx $t2, [$rp+0] add $rp, 8, $rp brnz $cnt, .Lcopy_g5 diff --git a/crypto/bn/asm/sparcv8.S b/crypto/bn/asm/sparcv8.S index 88c5dc480a76..75d72eb92c74 100644 --- a/crypto/bn/asm/sparcv8.S +++ b/crypto/bn/asm/sparcv8.S @@ -1,19 +1,19 @@ .ident "sparcv8.s, Version 1.4" -.ident "SPARC v8 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" +.ident "SPARC v8 ISA artwork by Andy Polyakov <appro@openssl.org>" /* * ==================================================================== - * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL - * project. + * Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. * - * Rights for redistribution and usage in source and binary forms are - * granted according to the OpenSSL license. Warranty of any kind is - * disclaimed. + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html * ==================================================================== */ /* - * This is my modest contributon to OpenSSL project (see + * This is my modest contribution to OpenSSL project (see * http://www.openssl.org/ for more information about it) and is * a drop-in SuperSPARC ISA replacement for crypto/bn/bn_asm.c * module. For updates see http://fy.chalmers.se/~appro/hpe/. 
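The label fixes below touch bn_mul_words and bn_sqr_words, two of the bn_asm.c primitives these .S drop-ins implement. For orientation, the contract of bn_mul_words in portable C — a sketch with 64-bit limbs and a gcc-style unsigned __int128, whereas sparcv8 itself is the 32-bit flavour:

    #include <stdint.h>

    typedef uint64_t BN_ULONG;

    /* rp[] = ap[] * w with carry propagation; returns the final carry */
    BN_ULONG bn_mul_words_ref(BN_ULONG *rp, const BN_ULONG *ap, int num,
                              BN_ULONG w)
    {
        BN_ULONG carry = 0;

        while (num-- > 0) {
            unsigned __int128 t = (unsigned __int128)*ap++ * w + carry;
            *rp++ = (BN_ULONG)t;           /* low word of the product */
            carry = (BN_ULONG)(t >> 64);   /* high word rides along */
        }
        return carry;
    }

bn_sqr_words is simpler still: rp[2i] and rp[2i+1] take the low and high words of ap[i] squared, with no cross terms and no carries.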
@@ -159,12 +159,12 @@ bn_mul_add_words: */ bn_mul_words: cmp %o2,0 - bg,a .L_bn_mul_words_proceeed + bg,a .L_bn_mul_words_proceed ld [%o1],%g2 retl clr %o0 -.L_bn_mul_words_proceeed: +.L_bn_mul_words_proceed: andcc %o2,-4,%g0 bz .L_bn_mul_words_tail clr %o5 @@ -251,12 +251,12 @@ bn_mul_words: */ bn_sqr_words: cmp %o2,0 - bg,a .L_bn_sqr_words_proceeed + bg,a .L_bn_sqr_words_proceed ld [%o1],%g2 retl clr %o0 -.L_bn_sqr_words_proceeed: +.L_bn_sqr_words_proceed: andcc %o2,-4,%g0 bz .L_bn_sqr_words_tail clr %o5 diff --git a/crypto/bn/asm/sparcv8plus.S b/crypto/bn/asm/sparcv8plus.S index 63de1860f285..fe4699b2bdd1 100644 --- a/crypto/bn/asm/sparcv8plus.S +++ b/crypto/bn/asm/sparcv8plus.S @@ -1,19 +1,19 @@ .ident "sparcv8plus.s, Version 1.4" -.ident "SPARC v9 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>" +.ident "SPARC v9 ISA artwork by Andy Polyakov <appro@openssl.org>" /* * ==================================================================== - * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL - * project. + * Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. * - * Rights for redistribution and usage in source and binary forms are - * granted according to the OpenSSL license. Warranty of any kind is - * disclaimed. + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html * ==================================================================== */ /* - * This is my modest contributon to OpenSSL project (see + * This is my modest contribution to OpenSSL project (see * http://www.openssl.org/ for more information about it) and is * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c * module. For updates see http://fy.chalmers.se/~appro/hpe/. @@ -52,7 +52,7 @@ * # cd ../.. * # make; make test * - * Q. V8plus achitecture? What kind of beast is that? + * Q. V8plus architecture? What kind of beast is that? * A. Well, it's rather a programming model than an architecture... * It's actually v9-compliant, i.e. *any* UltraSPARC, CPU under * special conditions, namely when kernel doesn't preserve upper @@ -71,7 +71,7 @@ * * Q. 64-bit registers under 32-bit kernels? Didn't you just say it * doesn't work? - * A. You can't adress *all* registers as 64-bit wide:-( The catch is + * A. You can't address *all* registers as 64-bit wide:-( The catch is * that you actually may rely upon %o0-%o5 and %g1-%g4 being fully * preserved if you're in a leaf function, i.e. such never calling * any other functions. All functions in this module are leaf and @@ -278,7 +278,7 @@ bn_mul_add_words: */ bn_mul_words: sra %o2,%g0,%o2 ! signx %o2 - brgz,a %o2,.L_bn_mul_words_proceeed + brgz,a %o2,.L_bn_mul_words_proceed lduw [%o1],%g2 retl clr %o0 @@ -286,7 +286,7 @@ bn_mul_words: nop nop -.L_bn_mul_words_proceeed: +.L_bn_mul_words_proceed: srl %o3,%g0,%o3 ! clruw %o3 andcc %o2,-4,%g0 bz,pn %icc,.L_bn_mul_words_tail @@ -366,7 +366,7 @@ bn_mul_words: */ bn_sqr_words: sra %o2,%g0,%o2 ! 
signx %o2 - brgz,a %o2,.L_bn_sqr_words_proceeed + brgz,a %o2,.L_bn_sqr_words_proceed lduw [%o1],%g2 retl clr %o0 @@ -374,7 +374,7 @@ bn_sqr_words: nop nop -.L_bn_sqr_words_proceeed: +.L_bn_sqr_words_proceed: andcc %o2,-4,%g0 nop bz,pn %icc,.L_bn_sqr_words_tail diff --git a/crypto/bn/asm/sparcv9-gf2m.pl b/crypto/bn/asm/sparcv9-gf2m.pl index ab94cd917c41..dcf11a87a18e 100755 --- a/crypto/bn/asm/sparcv9-gf2m.pl +++ b/crypto/bn/asm/sparcv9-gf2m.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -18,6 +25,9 @@ # ~100-230% faster than gcc-generated code and ~35-90% faster than # the pure SPARCv9 code path. +$output = pop; +open STDOUT,">$output"; + $locals=16*8; $tab="%l0"; diff --git a/crypto/bn/asm/sparcv9-mont.pl b/crypto/bn/asm/sparcv9-mont.pl index d8662878006e..b41903af985f 100755 --- a/crypto/bn/asm/sparcv9-mont.pl +++ b/crypto/bn/asm/sparcv9-mont.pl @@ -1,7 +1,14 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2005-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -13,7 +20,7 @@ # for undertaken effort are multiple. First of all, UltraSPARC is not # the whole SPARCv9 universe and other VIS-free implementations deserve # optimized code as much. Secondly, newly introduced UltraSPARC T1, -# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive pathes, +# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths, # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with # several integrated RSA/DSA accelerator circuits accessible through # kernel driver [only(*)], but having decent user-land software @@ -23,7 +30,7 @@ # instructions... # (*) Engine accessing the driver in question is on my TODO list. -# For reference, acceleator is estimated to give 6 to 10 times +# For reference, accelerator is estimated to give 6 to 10 times # improvement on single-threaded RSA sign. It should be noted # that 6-10x improvement coefficient does not actually mean # something extraordinary in terms of absolute [single-threaded] @@ -42,6 +49,9 @@ # module still have hidden potential [see TODO list there], which is # estimated to be larger than 20%... 
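Every bn_mul_mont flavour in this directory computes the same thing: rp = ap*bp*R^-1 mod np, with R = 2^(word-bits*num) and n0 = -np[0]^-1 mod 2^word-bits precomputed by the caller. A compact word-serial (CIOS) sketch — the same arithmetic as the tuned code below, assuming 64-bit limbs, C99 VLAs, unsigned __int128, num >= 1 and ap, bp < np:

    #include <stdint.h>
    #include <string.h>

    typedef uint64_t BN_ULONG;
    typedef unsigned __int128 u128;

    static void mul_mont_ref(BN_ULONG *rp, const BN_ULONG *ap,
                             const BN_ULONG *bp, const BN_ULONG *np,
                             BN_ULONG n0, int num)
    {
        BN_ULONG t[num + 2];               /* VLA, fine for a sketch */
        int i, j;

        memset(t, 0, sizeof(t));
        for (i = 0; i < num; i++) {
            BN_ULONG c = 0, m;
            u128 p;

            for (j = 0; j < num; j++) {    /* t += ap * bp[i] */
                p = (u128)ap[j] * bp[i] + t[j] + c;
                t[j] = (BN_ULONG)p;
                c = (BN_ULONG)(p >> 64);
            }
            p = (u128)t[num] + c;
            t[num] = (BN_ULONG)p;
            t[num + 1] = (BN_ULONG)(p >> 64);

            m = t[0] * n0;                 /* chosen so t[0] cancels */
            c = 0;
            for (j = 0; j < num; j++) {    /* t += np * m */
                p = (u128)np[j] * m + t[j] + c;
                t[j] = (BN_ULONG)p;
                c = (BN_ULONG)(p >> 64);
            }
            p = (u128)t[num] + c;
            t[num] = (BN_ULONG)p;
            t[num + 1] += (BN_ULONG)(p >> 64);

            for (j = 0; j <= num; j++)     /* exact division by 2^64 */
                t[j] = t[j + 1];
            t[num + 1] = 0;
        }

        /* t[0..num] < 2*np: subtract np, keep t on borrow -- this is the
         * step the constant-time conditional copy above protects */
        {
            BN_ULONG borrow = 0, mask;

            for (j = 0; j < num; j++) {
                u128 d = (u128)t[j] - np[j] - borrow;
                rp[j] = (BN_ULONG)d;
                borrow = (BN_ULONG)(d >> 64) & 1;
            }
            borrow = (borrow > t[num]);    /* net borrow past top word */
            mask = 0 - borrow;
            for (j = 0; j < num; j++)
                rp[j] = (t[j] & mask) | (rp[j] & ~mask);
        }
    }

The point of m = t[0]*n0 is that adding m*np makes the low word of t vanish, so the one-word shift is an exact division by 2^64; after num rounds the accumulated factor of R^-1 is what turns the loop into a modular multiplication.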
+$output = pop; +open STDOUT,">$output"; + # int bn_mul_mont( $rp="%i0"; # BN_ULONG *rp, $ap="%i1"; # const BN_ULONG *ap, @@ -50,10 +60,8 @@ $np="%i3"; # const BN_ULONG *np, $n0="%i4"; # const BN_ULONG *n0, $num="%i5"; # int num); -$bits=32; -for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } -if ($bits==64) { $bias=2047; $frame=192; } -else { $bias=0; $frame=128; } +$frame="STACK_FRAME"; +$bias="STACK_BIAS"; $car0="%o0"; $car1="%o1"; @@ -76,6 +84,8 @@ $tpj="%l7"; $fname="bn_mul_mont_int"; $code=<<___; +#include "sparc_arch.h" + .section ".text",#alloc,#execinstr .global $fname @@ -105,7 +115,7 @@ $fname: ld [$np],$car1 ! np[0] sub %o7,$bias,%sp ! alloca ld [$np+4],$npj ! np[1] - be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont + be,pt SIZE_T_CC,.Lbn_sqr_mont mov 12,$j mulx $car0,$mul0,$car0 ! ap[0]*bp[0] @@ -255,7 +265,6 @@ $fname: .Ltail: add $np,$num,$np add $rp,$num,$rp - mov $tp,$ap sub %g0,$num,%o7 ! k=-num ba .Lsub subcc %g0,%g0,%g0 ! clear %icc.c @@ -268,15 +277,14 @@ $fname: add %o7,4,%o7 brnz %o7,.Lsub st %o1,[$i] - subc $car2,0,$car2 ! handle upmost overflow bit - and $tp,$car2,$ap - andn $rp,$car2,$np - or $ap,$np,$ap + subccc $car2,0,$car2 ! handle upmost overflow bit sub %g0,$num,%o7 .Lcopy: - ld [$ap+%o7],%o0 ! copy or in-place refresh + ld [$tp+%o7],%o1 ! conditional copy + ld [$rp+%o7],%o0 st %g0,[$tp+%o7] ! zap tp + movcs %icc,%o1,%o0 st %o0,[$rp+%o7] add %o7,4,%o7 brnz %o7,.Lcopy @@ -485,6 +493,9 @@ $code.=<<___; mulx $npj,$mul1,$acc1 add $tpj,$car1,$car1 ld [$np+$j],$npj ! np[j] + srlx $car1,32,$tmp0 + and $car1,$mask,$car1 + add $tmp0,$sbit,$sbit add $acc0,$car1,$car1 ld [$tp+8],$tpj ! tp[j] add $acc1,$car1,$car1 @@ -601,7 +612,7 @@ $code.=<<___; add $tp,8,$tp .type $fname,#function .size $fname,(.-$fname) -.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" +.asciz "Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" .align 32 ___ $code =~ s/\`([^\`]*)\`/eval($1)/gem; diff --git a/crypto/bn/asm/sparcv9a-mont.pl b/crypto/bn/asm/sparcv9a-mont.pl index a14205f2f006..c8f759df9fbd 100755 --- a/crypto/bn/asm/sparcv9a-mont.pl +++ b/crypto/bn/asm/sparcv9a-mont.pl @@ -1,7 +1,14 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -51,21 +58,17 @@ # # Modulo-scheduled inner loops allow to interleave floating point and # integer instructions and minimize Read-After-Write penalties. This -# results in *further* 20-50% perfromance improvement [depending on +# results in *further* 20-50% performance improvement [depending on # key length, more for longer keys] on USI&II cores and 30-80% - on # USIII&IV. 
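The FPU path works because the digits are kept small enough that every floating-point product is exact: the module loads 16-bit digits (note ASI_FL16_P below, a special ASI for 16-bit FP loads), and a 16x32-bit partial product stays below 2^48, comfortably inside a double's 53-bit mantissa. A toy demonstration of the exactness argument, not the module's actual digit pipeline:

    #include <stdint.h>

    static uint64_t mul32_via_double(uint32_t a, uint32_t b)
    {
        double a_lo = (double)(a & 0xffff);   /* 16-bit digit */
        double a_hi = (double)(a >> 16);      /* 16-bit digit */
        double bd   = (double)b;

        /* both partial products are < 2^48, hence exactly representable */
        uint64_t lo = (uint64_t)(a_lo * bd);
        uint64_t hi = (uint64_t)(a_hi * bd);

        return lo + (hi << 16);               /* equals (uint64_t)a * b */
    }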
+$output = pop; +open STDOUT,">$output"; + $fname="bn_mul_mont_fpu"; -$bits=32; -for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } - -if ($bits==64) { - $bias=2047; - $frame=192; -} else { - $bias=0; - $frame=128; # 96 rounded up to largest known cache-line -} + +$frame="STACK_FRAME"; +$bias="STACK_BIAS"; $locals=64; # In order to provide for 32-/64-bit ABI duality, I keep integers wider @@ -121,6 +124,8 @@ $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62"; $ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load $code=<<___; +#include "sparc_arch.h" + .section ".text",#alloc,#execinstr .global $fname @@ -860,14 +865,14 @@ $fname: restore .type $fname,#function .size $fname,(.-$fname) -.asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>" +.asciz "Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>" .align 32 ___ $code =~ s/\`([^\`]*)\`/eval($1)/gem; # Below substitution makes it possible to compile without demanding -# VIS extentions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I +# VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I # dare to do this, because VIS capability is detected at run-time now # and this routine is not called on CPU not capable to execute it. Do # note that fzeros is not the only VIS dependency! Another dependency diff --git a/crypto/bn/asm/via-mont.pl b/crypto/bn/asm/via-mont.pl index c046a514c873..9cf717e84102 100755 --- a/crypto/bn/asm/via-mont.pl +++ b/crypto/bn/asm/via-mont.pl @@ -1,7 +1,14 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2006-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. 
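The run-time VIS detection mentioned above means this module coexists with the integer-only path as alternative implementations behind one entry point. In sketch form — bn_mul_mont_fpu and bn_mul_mont_int are the real names defined by these two modules, while cpu_has_vis() stands in for whatever capability probe the build actually wires up:

    #include <stdint.h>

    typedef uint64_t BN_ULONG;

    int bn_mul_mont_int(BN_ULONG *, const BN_ULONG *, const BN_ULONG *,
                        const BN_ULONG *, const BN_ULONG *, int);
    int bn_mul_mont_fpu(BN_ULONG *, const BN_ULONG *, const BN_ULONG *,
                        const BN_ULONG *, const BN_ULONG *, int);
    int cpu_has_vis(void);                    /* hypothetical probe */

    int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                    const BN_ULONG *np, const BN_ULONG *n0, int num)
    {
        return cpu_has_vis() ? bn_mul_mont_fpu(rp, ap, bp, np, n0, num)
                             : bn_mul_mont_int(rp, ap, bp, np, n0, num);
    }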
@@ -69,7 +76,7 @@ # dsa 1024 bits 0.001346s 0.001595s 742.7 627.0 # dsa 2048 bits 0.004745s 0.005582s 210.7 179.1 # -# Conclusions: +# Conclusions: # - VIA SDK leaves a *lot* of room for improvement (which this # implementation successfully fills:-); # - 'rep montmul' gives up to >3x performance improvement depending on @@ -81,7 +88,10 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; -&asm_init($ARGV[0],"via-mont.pl"); +$output = pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0]); # int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); $func="bn_mul_mont_padlock"; @@ -203,18 +213,15 @@ $sp=&DWP(28,"esp"); &mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit &sbb ("eax",0); - &and ("esi","eax"); - ¬ ("eax"); - &mov ("ebp","edi"); - &and ("ebp","eax"); - &or ("esi","ebp"); # tp=carry?tp:rp &mov ("ecx","edx"); # num - &xor ("edx","edx"); # i=0 + &mov ("edx",0); # i=0 &set_label("copy",8); - &mov ("eax",&DWP(0,"esi","edx",4)); - &mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp + &mov ("ebx",&DWP(0,"esi","edx",4)); + &mov ("eax",&DWP(0,"edi","edx",4)); + &mov (&DWP(0,"esi","edx",4),"ecx"); # zap tp + &cmovc ("eax","ebx"); &mov (&DWP(0,"edi","edx",4),"eax"); &lea ("edx",&DWP(1,"edx")); # i++ &loop (&label("copy")); @@ -240,3 +247,5 @@ $sp=&DWP(28,"esp"); &asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>"); &asm_finish(); + +close STDOUT; diff --git a/crypto/bn/asm/vis3-mont.pl b/crypto/bn/asm/vis3-mont.pl index 263ac02b6f45..04833a0c876d 100755 --- a/crypto/bn/asm/vis3-mont.pl +++ b/crypto/bn/asm/vis3-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2012-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -9,7 +16,7 @@ # October 2012. # -# SPARCv9 VIS3 Montgomery multiplicaion procedure suitable for T3 and +# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and # onward. There are three new instructions used here: umulxhi, # addxc[cc] and initializing store. On T3 RSA private key operations # are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key @@ -18,16 +25,20 @@ # for reference purposes, because T4 has dedicated Montgomery # multiplication and squaring *instructions* that deliver even more. -$bits=32; -for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } -if ($bits==64) { $bias=2047; $frame=192; } -else { $bias=0; $frame=112; } +$output = pop; +open STDOUT,">$output"; + +$frame = "STACK_FRAME"; +$bias = "STACK_BIAS"; + +$code.=<<___; +#include "sparc_arch.h" -$code.=<<___ if ($bits==64); +#ifdef __arch64__ .register %g2,#scratch .register %g3,#scratch -___ -$code.=<<___; +#endif + .section ".text",#alloc,#execinstr ___ @@ -299,23 +310,23 @@ $code.=<<___; sub $anp, $num, $anp sub $rp, $num, $rp - subc $ovf, %g0, $ovf ! handle upmost overflow bit - and $tp, $ovf, $ap - andn $rp, $ovf, $np - or $np, $ap, $ap ! ap=borrow?tp:rp + subccc $ovf, %g0, $ovf ! handle upmost overflow bit ba .Lcopy sub $num, 8, $cnt .align 16 -.Lcopy: ! 
copy or in-place refresh - ld [$ap+0], $t2 - ld [$ap+4], $t3 - add $ap, 8, $ap +.Lcopy: ! conditional copy + ld [$tp+0], $t0 + ld [$tp+4], $t1 + ld [$rp+0], $t2 + ld [$rp+4], $t3 stx %g0, [$tp] ! zap add $tp, 8, $tp stx %g0, [$anp] ! zap stx %g0, [$anp+8] add $anp, 16, $anp + movcs %icc, $t0, $t2 + movcs %icc, $t1, $t3 st $t3, [$rp+0] ! flip order st $t2, [$rp+4] add $rp, 8, $rp @@ -333,7 +344,7 @@ ___ # Purpose of these subroutines is to explicitly encode VIS instructions, # so that one can compile the module without having to specify VIS -# extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. +# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. # Idea is to reserve for option to produce "universal" binary and let # programmer detect if current CPU is VIS capable at run-time. sub unvis3 { diff --git a/crypto/bn/asm/x86-gf2m.pl b/crypto/bn/asm/x86-gf2m.pl index b57953027298..d03efcc75023 100755 --- a/crypto/bn/asm/x86-gf2m.pl +++ b/crypto/bn/asm/x86-gf2m.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -36,7 +43,10 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; -&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386"); +$output = pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386"); $sse2=0; for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } @@ -142,7 +152,7 @@ $R="mm0"; &xor ($a4,$a2); # a2=a4^a2^a4 &mov (&DWP(5*4,"esp"),$a1); # a1^a4 &xor ($a4,$a1); # a1^a2^a4 - &sar (@i[1],31); # broardcast 30th bit + &sar (@i[1],31); # broadcast 30th bit &and ($lo,$b); &mov (&DWP(6*4,"esp"),$a2); # a2^a4 &and (@i[1],$b); @@ -311,3 +321,5 @@ if ($sse2) { &asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>"); &asm_finish(); + +close STDOUT; diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl index 1c4003efc20a..7ba2133ac9c3 100755 --- a/crypto/bn/asm/x86-mont.pl +++ b/crypto/bn/asm/x86-mont.pl @@ -1,7 +1,14 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2005-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. 
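The *-gf2m modules above all share one kernel: a carry-less GF(2)[x] multiply driven by a small table of shifted copies of a, with the top bits of a — which the shifts would push out of the word — patched back in through sign-broadcast masks, the very "broadcast" comments being corrected in these hunks. A condensed 64-bit C rendition for comparison with the generic code in crypto/bn/bn_gf2m.c (note the table lookups are data-indexed in the assembly as well):

    #include <stdint.h>

    static void gf2m_mul_1x1(uint64_t *hi, uint64_t *lo,
                             uint64_t a, uint64_t b)
    {
        uint64_t top3 = a >> 61;              /* bits that a<<3 would lose */
        uint64_t a1 = a & 0x1fffffffffffffffULL;
        uint64_t tab[16], s, l, h, m;
        int k;

        tab[0] = 0;                 tab[1] = a1;
        tab[2] = a1 << 1;           tab[3] = tab[2] ^ a1;
        tab[4] = a1 << 2;           tab[5] = tab[4] ^ a1;
        tab[6] = tab[4] ^ tab[2];   tab[7] = tab[6] ^ a1;
        tab[8] = a1 << 3;           tab[9] = tab[8] ^ a1;
        tab[10] = tab[8] ^ tab[2];  tab[11] = tab[10] ^ a1;
        tab[12] = tab[8] ^ tab[4];  tab[13] = tab[12] ^ a1;
        tab[14] = tab[12] ^ tab[2]; tab[15] = tab[14] ^ a1;

        l = tab[b & 0xf];
        h = 0;
        for (k = 4; k < 64; k += 4) {         /* one nibble of b at a time */
            s = tab[(b >> k) & 0xf];
            l ^= s << k;
            h ^= s >> (64 - k);
        }

        /* compensate for bits 61..63 of a, branch-free: each mask is the
         * corresponding bit broadcast across the word */
        m = 0 - ((top3 >> 0) & 1); l ^= (b << 61) & m; h ^= (b >> 3) & m;
        m = 0 - ((top3 >> 1) & 1); l ^= (b << 62) & m; h ^= (b >> 2) & m;
        m = 0 - ((top3 >> 2) & 1); l ^= (b << 63) & m; h ^= (b >> 1) & m;

        *lo = l;
        *hi = h;
    }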
@@ -30,7 +37,10 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; -&asm_init($ARGV[0],$0); +$output = pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0]); $sse2=0; for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } @@ -68,7 +78,7 @@ $frame=32; # size of above frame rounded up to 16n &lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2)) &neg ("edi"); - # minimize cache contention by arraning 2K window between stack + # minimize cache contention by arranging 2K window between stack # pointer and ap argument [np is also position sensitive vector, # but it's assumed to be near ap, as it's allocated at ~same # time]. @@ -84,7 +94,9 @@ $frame=32; # size of above frame rounded up to 16n &and ("ebp",-64); # align to cache line - # Some OSes, *cough*-dows, insist on stack being "wired" to + # An OS-agnostic version of __chkstk. + # + # Some OSes (Windows) insist on stack being "wired" to # physical memory in strictly sequential manner, i.e. if stack # allocation spans two pages, then reference to farmost one can # be punishable by SEGV. But page walking can do good even on @@ -289,7 +301,7 @@ if (0) { &xor ("eax","eax"); # signal "not fast enough [yet]" &jmp (&label("just_leave")); # While the below code provides competitive performance for - # all key lengthes on modern Intel cores, it's still more + # all key lengths on modern Intel cores, it's still more # than 10% slower for 4096-bit key elsewhere:-( "Competitive" # means compared to the original integer-only assembler. # 512-bit RSA sign is better by ~40%, but that's about all @@ -592,16 +604,18 @@ $sbit=$num; &jge (&label("sub")); &sbb ("eax",0); # handle upmost overflow bit - &and ($tp,"eax"); - ¬ ("eax"); - &mov ($np,$rp); - &and ($np,"eax"); - &or ($tp,$np); # tp=carry?tp:rp - -&set_label("copy",16); # copy or in-place refresh - &mov ("eax",&DWP(0,$tp,$num,4)); - &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i] + &mov ("edx",-1); + &xor ("edx","eax"); + &jmp (&label("copy")); + +&set_label("copy",16); # conditional copy + &mov ($tp,&DWP($frame,"esp",$num,4)); + &mov ($np,&DWP(0,$rp,$num,4)); &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector + &and ($tp,"eax"); + &and ($np,"edx"); + &or ($np,$tp); + &mov (&DWP(0,$rp,$num,4),$np); &dec ($num); &jge (&label("copy")); @@ -613,3 +627,5 @@ $sbit=$num; &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>"); &asm_finish(); + +close STDOUT; diff --git a/crypto/bn/asm/x86.pl b/crypto/bn/asm/x86.pl deleted file mode 100644 index 1bc4f1bb2747..000000000000 --- a/crypto/bn/asm/x86.pl +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/local/bin/perl - -push(@INC,"perlasm","../../perlasm"); -require "x86asm.pl"; - -require("x86/mul_add.pl"); -require("x86/mul.pl"); -require("x86/sqr.pl"); -require("x86/div.pl"); -require("x86/add.pl"); -require("x86/sub.pl"); -require("x86/comba.pl"); - -&asm_init($ARGV[0],$0); - -&bn_mul_add_words("bn_mul_add_words"); -&bn_mul_words("bn_mul_words"); -&bn_sqr_words("bn_sqr_words"); -&bn_div_words("bn_div_words"); -&bn_add_words("bn_add_words"); -&bn_sub_words("bn_sub_words"); -&bn_mul_comba("bn_mul_comba8",8); -&bn_mul_comba("bn_mul_comba4",4); -&bn_sqr_comba("bn_sqr_comba8",8); -&bn_sqr_comba("bn_sqr_comba4",4); - -&asm_finish(); - diff --git a/crypto/bn/asm/x86/add.pl b/crypto/bn/asm/x86/add.pl deleted file mode 100644 index 0b5cf583e37f..000000000000 --- a/crypto/bn/asm/x86/add.pl +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/local/bin/perl -# x86 assember - -sub bn_add_words 
- { - local($name)=@_; - - &function_begin($name,""); - - &comment(""); - $a="esi"; - $b="edi"; - $c="eax"; - $r="ebx"; - $tmp1="ecx"; - $tmp2="edx"; - $num="ebp"; - - &mov($r,&wparam(0)); # get r - &mov($a,&wparam(1)); # get a - &mov($b,&wparam(2)); # get b - &mov($num,&wparam(3)); # get num - &xor($c,$c); # clear carry - &and($num,0xfffffff8); # num / 8 - - &jz(&label("aw_finish")); - - &set_label("aw_loop",0); - for ($i=0; $i<8; $i++) - { - &comment("Round $i"); - - &mov($tmp1,&DWP($i*4,$a,"",0)); # *a - &mov($tmp2,&DWP($i*4,$b,"",0)); # *b - &add($tmp1,$c); - &mov($c,0); - &adc($c,$c); - &add($tmp1,$tmp2); - &adc($c,0); - &mov(&DWP($i*4,$r,"",0),$tmp1); # *r - } - - &comment(""); - &add($a,32); - &add($b,32); - &add($r,32); - &sub($num,8); - &jnz(&label("aw_loop")); - - &set_label("aw_finish",0); - &mov($num,&wparam(3)); # get num - &and($num,7); - &jz(&label("aw_end")); - - for ($i=0; $i<7; $i++) - { - &comment("Tail Round $i"); - &mov($tmp1,&DWP($i*4,$a,"",0)); # *a - &mov($tmp2,&DWP($i*4,$b,"",0));# *b - &add($tmp1,$c); - &mov($c,0); - &adc($c,$c); - &add($tmp1,$tmp2); - &adc($c,0); - &dec($num) if ($i != 6); - &mov(&DWP($i*4,$r,"",0),$tmp1); # *a - &jz(&label("aw_end")) if ($i != 6); - } - &set_label("aw_end",0); - -# &mov("eax",$c); # $c is "eax" - - &function_end($name); - } - -1; diff --git a/crypto/bn/asm/x86/comba.pl b/crypto/bn/asm/x86/comba.pl deleted file mode 100644 index 22912536293d..000000000000 --- a/crypto/bn/asm/x86/comba.pl +++ /dev/null @@ -1,277 +0,0 @@ -#!/usr/local/bin/perl -# x86 assember - -sub mul_add_c - { - local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_; - - # pos == -1 if eax and edx are pre-loaded, 0 to load from next - # words, and 1 if load return value - - &comment("mul a[$ai]*b[$bi]"); - - # "eax" and "edx" will always be pre-loaded. - # &mov("eax",&DWP($ai*4,$a,"",0)) ; - # &mov("edx",&DWP($bi*4,$b,"",0)); - - &mul("edx"); - &add($c0,"eax"); - &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a - &mov("eax",&wparam(0)) if $pos > 0; # load r[] - ### - &adc($c1,"edx"); - &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b - &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b - ### - &adc($c2,0); - # is pos > 1, it means it is the last loop - &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[]; - &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a - } - -sub sqr_add_c - { - local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_; - - # pos == -1 if eax and edx are pre-loaded, 0 to load from next - # words, and 1 if load return value - - &comment("sqr a[$ai]*a[$bi]"); - - # "eax" and "edx" will always be pre-loaded. - # &mov("eax",&DWP($ai*4,$a,"",0)) ; - # &mov("edx",&DWP($bi*4,$b,"",0)); - - if ($ai == $bi) - { &mul("eax");} - else - { &mul("edx");} - &add($c0,"eax"); - &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a - ### - &adc($c1,"edx"); - &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb); - ### - &adc($c2,0); - # is pos > 1, it means it is the last loop - &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[]; - &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b - } - -sub sqr_add_c2 - { - local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_; - - # pos == -1 if eax and edx are pre-loaded, 0 to load from next - # words, and 1 if load return value - - &comment("sqr a[$ai]*a[$bi]"); - - # "eax" and "edx" will always be pre-loaded. 
- # &mov("eax",&DWP($ai*4,$a,"",0)) ; - # &mov("edx",&DWP($bi*4,$a,"",0)); - - if ($ai == $bi) - { &mul("eax");} - else - { &mul("edx");} - &add("eax","eax"); - ### - &adc("edx","edx"); - ### - &adc($c2,0); - &add($c0,"eax"); - &adc($c1,"edx"); - &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a - &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b - &adc($c2,0); - &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[]; - &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb); - ### - } - -sub bn_mul_comba - { - local($name,$num)=@_; - local($a,$b,$c0,$c1,$c2); - local($i,$as,$ae,$bs,$be,$ai,$bi); - local($tot,$end); - - &function_begin_B($name,""); - - $c0="ebx"; - $c1="ecx"; - $c2="ebp"; - $a="esi"; - $b="edi"; - - $as=0; - $ae=0; - $bs=0; - $be=0; - $tot=$num+$num-1; - - &push("esi"); - &mov($a,&wparam(1)); - &push("edi"); - &mov($b,&wparam(2)); - &push("ebp"); - &push("ebx"); - - &xor($c0,$c0); - &mov("eax",&DWP(0,$a,"",0)); # load the first word - &xor($c1,$c1); - &mov("edx",&DWP(0,$b,"",0)); # load the first second - - for ($i=0; $i<$tot; $i++) - { - $ai=$as; - $bi=$bs; - $end=$be+1; - - &comment("################## Calculate word $i"); - - for ($j=$bs; $j<$end; $j++) - { - &xor($c2,$c2) if ($j == $bs); - if (($j+1) == $end) - { - $v=1; - $v=2 if (($i+1) == $tot); - } - else - { $v=0; } - if (($j+1) != $end) - { - $na=($ai-1); - $nb=($bi+1); - } - else - { - $na=$as+($i < ($num-1)); - $nb=$bs+($i >= ($num-1)); - } -#printf STDERR "[$ai,$bi] -> [$na,$nb]\n"; - &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb); - if ($v) - { - &comment("saved r[$i]"); - # &mov("eax",&wparam(0)); - # &mov(&DWP($i*4,"eax","",0),$c0); - ($c0,$c1,$c2)=($c1,$c2,$c0); - } - $ai--; - $bi++; - } - $as++ if ($i < ($num-1)); - $ae++ if ($i >= ($num-1)); - - $bs++ if ($i >= ($num-1)); - $be++ if ($i < ($num-1)); - } - &comment("save r[$i]"); - # &mov("eax",&wparam(0)); - &mov(&DWP($i*4,"eax","",0),$c0); - - &pop("ebx"); - &pop("ebp"); - &pop("edi"); - &pop("esi"); - &ret(); - &function_end_B($name); - } - -sub bn_sqr_comba - { - local($name,$num)=@_; - local($r,$a,$c0,$c1,$c2)=@_; - local($i,$as,$ae,$bs,$be,$ai,$bi); - local($b,$tot,$end,$half); - - &function_begin_B($name,""); - - $c0="ebx"; - $c1="ecx"; - $c2="ebp"; - $a="esi"; - $r="edi"; - - &push("esi"); - &push("edi"); - &push("ebp"); - &push("ebx"); - &mov($r,&wparam(0)); - &mov($a,&wparam(1)); - &xor($c0,$c0); - &xor($c1,$c1); - &mov("eax",&DWP(0,$a,"",0)); # load the first word - - $as=0; - $ae=0; - $bs=0; - $be=0; - $tot=$num+$num-1; - - for ($i=0; $i<$tot; $i++) - { - $ai=$as; - $bi=$bs; - $end=$be+1; - - &comment("############### Calculate word $i"); - for ($j=$bs; $j<$end; $j++) - { - &xor($c2,$c2) if ($j == $bs); - if (($ai-1) < ($bi+1)) - { - $v=1; - $v=2 if ($i+1) == $tot; - } - else - { $v=0; } - if (!$v) - { - $na=$ai-1; - $nb=$bi+1; - } - else - { - $na=$as+($i < ($num-1)); - $nb=$bs+($i >= ($num-1)); - } - if ($ai == $bi) - { - &sqr_add_c($r,$a,$ai,$bi, - $c0,$c1,$c2,$v,$i,$na,$nb); - } - else - { - &sqr_add_c2($r,$a,$ai,$bi, - $c0,$c1,$c2,$v,$i,$na,$nb); - } - if ($v) - { - &comment("saved r[$i]"); - #&mov(&DWP($i*4,$r,"",0),$c0); - ($c0,$c1,$c2)=($c1,$c2,$c0); - last; - } - $ai--; - $bi++; - } - $as++ if ($i < ($num-1)); - $ae++ if ($i >= ($num-1)); - - $bs++ if ($i >= ($num-1)); - $be++ if ($i < ($num-1)); - } - &mov(&DWP($i*4,$r,"",0),$c0); - &pop("ebx"); - &pop("ebp"); - &pop("edi"); - &pop("esi"); - &ret(); - &function_end_B($name); - } - -1; diff --git a/crypto/bn/asm/x86/div.pl b/crypto/bn/asm/x86/div.pl 
deleted file mode 100644 index 0e90152caa95..000000000000 --- a/crypto/bn/asm/x86/div.pl +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/local/bin/perl -# x86 assember - -sub bn_div_words - { - local($name)=@_; - - &function_begin($name,""); - &mov("edx",&wparam(0)); # - &mov("eax",&wparam(1)); # - &mov("ebx",&wparam(2)); # - &div("ebx"); - &function_end($name); - } -1; diff --git a/crypto/bn/asm/x86/f b/crypto/bn/asm/x86/f deleted file mode 100644 index 22e411222431..000000000000 --- a/crypto/bn/asm/x86/f +++ /dev/null @@ -1,3 +0,0 @@ -#!/usr/local/bin/perl -# x86 assember - diff --git a/crypto/bn/asm/x86/mul.pl b/crypto/bn/asm/x86/mul.pl deleted file mode 100644 index 674cb9b05512..000000000000 --- a/crypto/bn/asm/x86/mul.pl +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/local/bin/perl -# x86 assember - -sub bn_mul_words - { - local($name)=@_; - - &function_begin($name,""); - - &comment(""); - $Low="eax"; - $High="edx"; - $a="ebx"; - $w="ecx"; - $r="edi"; - $c="esi"; - $num="ebp"; - - &xor($c,$c); # clear carry - &mov($r,&wparam(0)); # - &mov($a,&wparam(1)); # - &mov($num,&wparam(2)); # - &mov($w,&wparam(3)); # - - &and($num,0xfffffff8); # num / 8 - &jz(&label("mw_finish")); - - &set_label("mw_loop",0); - for ($i=0; $i<32; $i+=4) - { - &comment("Round $i"); - - &mov("eax",&DWP($i,$a,"",0)); # *a - &mul($w); # *a * w - &add("eax",$c); # L(t)+=c - # XXX - - &adc("edx",0); # H(t)+=carry - &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t); - - &mov($c,"edx"); # c= H(t); - } - - &comment(""); - &add($a,32); - &add($r,32); - &sub($num,8); - &jz(&label("mw_finish")); - &jmp(&label("mw_loop")); - - &set_label("mw_finish",0); - &mov($num,&wparam(2)); # get num - &and($num,7); - &jnz(&label("mw_finish2")); - &jmp(&label("mw_end")); - - &set_label("mw_finish2",1); - for ($i=0; $i<7; $i++) - { - &comment("Tail Round $i"); - &mov("eax",&DWP($i*4,$a,"",0));# *a - &mul($w); # *a * w - &add("eax",$c); # L(t)+=c - # XXX - &adc("edx",0); # H(t)+=carry - &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t); - &mov($c,"edx"); # c= H(t); - &dec($num) if ($i != 7-1); - &jz(&label("mw_end")) if ($i != 7-1); - } - &set_label("mw_end",0); - &mov("eax",$c); - - &function_end($name); - } - -1; diff --git a/crypto/bn/asm/x86/mul_add.pl b/crypto/bn/asm/x86/mul_add.pl deleted file mode 100644 index 61830d3a906a..000000000000 --- a/crypto/bn/asm/x86/mul_add.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/local/bin/perl -# x86 assember - -sub bn_mul_add_words - { - local($name)=@_; - - &function_begin($name,""); - - &comment(""); - $Low="eax"; - $High="edx"; - $a="ebx"; - $w="ebp"; - $r="edi"; - $c="esi"; - - &xor($c,$c); # clear carry - &mov($r,&wparam(0)); # - - &mov("ecx",&wparam(2)); # - &mov($a,&wparam(1)); # - - &and("ecx",0xfffffff8); # num / 8 - &mov($w,&wparam(3)); # - - &push("ecx"); # Up the stack for a tmp variable - - &jz(&label("maw_finish")); - - &set_label("maw_loop",0); - - &mov(&swtmp(0),"ecx"); # - - for ($i=0; $i<32; $i+=4) - { - &comment("Round $i"); - - &mov("eax",&DWP($i,$a,"",0)); # *a - &mul($w); # *a * w - &add("eax",$c); # L(t)+= *r - &mov($c,&DWP($i,$r,"",0)); # L(t)+= *r - &adc("edx",0); # H(t)+=carry - &add("eax",$c); # L(t)+=c - &adc("edx",0); # H(t)+=carry - &mov(&DWP($i,$r,"",0),"eax"); # *r= L(t); - &mov($c,"edx"); # c= H(t); - } - - &comment(""); - &mov("ecx",&swtmp(0)); # - &add($a,32); - &add($r,32); - &sub("ecx",8); - &jnz(&label("maw_loop")); - - &set_label("maw_finish",0); - &mov("ecx",&wparam(2)); # get num - &and("ecx",7); - &jnz(&label("maw_finish2")); # helps branch prediction - &jmp(&label("maw_end")); - - 
&set_label("maw_finish2",1); - for ($i=0; $i<7; $i++) - { - &comment("Tail Round $i"); - &mov("eax",&DWP($i*4,$a,"",0));# *a - &mul($w); # *a * w - &add("eax",$c); # L(t)+=c - &mov($c,&DWP($i*4,$r,"",0)); # L(t)+= *r - &adc("edx",0); # H(t)+=carry - &add("eax",$c); - &adc("edx",0); # H(t)+=carry - &dec("ecx") if ($i != 7-1); - &mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t); - &mov($c,"edx"); # c= H(t); - &jz(&label("maw_end")) if ($i != 7-1); - } - &set_label("maw_end",0); - &mov("eax",$c); - - &pop("ecx"); # clear variable from - - &function_end($name); - } - -1; diff --git a/crypto/bn/asm/x86/sqr.pl b/crypto/bn/asm/x86/sqr.pl deleted file mode 100644 index 1f90993cf689..000000000000 --- a/crypto/bn/asm/x86/sqr.pl +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/local/bin/perl -# x86 assember - -sub bn_sqr_words - { - local($name)=@_; - - &function_begin($name,""); - - &comment(""); - $r="esi"; - $a="edi"; - $num="ebx"; - - &mov($r,&wparam(0)); # - &mov($a,&wparam(1)); # - &mov($num,&wparam(2)); # - - &and($num,0xfffffff8); # num / 8 - &jz(&label("sw_finish")); - - &set_label("sw_loop",0); - for ($i=0; $i<32; $i+=4) - { - &comment("Round $i"); - &mov("eax",&DWP($i,$a,"",0)); # *a - # XXX - &mul("eax"); # *a * *a - &mov(&DWP($i*2,$r,"",0),"eax"); # - &mov(&DWP($i*2+4,$r,"",0),"edx");# - } - - &comment(""); - &add($a,32); - &add($r,64); - &sub($num,8); - &jnz(&label("sw_loop")); - - &set_label("sw_finish",0); - &mov($num,&wparam(2)); # get num - &and($num,7); - &jz(&label("sw_end")); - - for ($i=0; $i<7; $i++) - { - &comment("Tail Round $i"); - &mov("eax",&DWP($i*4,$a,"",0)); # *a - # XXX - &mul("eax"); # *a * *a - &mov(&DWP($i*8,$r,"",0),"eax"); # - &dec($num) if ($i != 7-1); - &mov(&DWP($i*8+4,$r,"",0),"edx"); - &jz(&label("sw_end")) if ($i != 7-1); - } - &set_label("sw_end",0); - - &function_end($name); - } - -1; diff --git a/crypto/bn/asm/x86/sub.pl b/crypto/bn/asm/x86/sub.pl deleted file mode 100644 index 837b0e1b078d..000000000000 --- a/crypto/bn/asm/x86/sub.pl +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/local/bin/perl -# x86 assember - -sub bn_sub_words - { - local($name)=@_; - - &function_begin($name,""); - - &comment(""); - $a="esi"; - $b="edi"; - $c="eax"; - $r="ebx"; - $tmp1="ecx"; - $tmp2="edx"; - $num="ebp"; - - &mov($r,&wparam(0)); # get r - &mov($a,&wparam(1)); # get a - &mov($b,&wparam(2)); # get b - &mov($num,&wparam(3)); # get num - &xor($c,$c); # clear carry - &and($num,0xfffffff8); # num / 8 - - &jz(&label("aw_finish")); - - &set_label("aw_loop",0); - for ($i=0; $i<8; $i++) - { - &comment("Round $i"); - - &mov($tmp1,&DWP($i*4,$a,"",0)); # *a - &mov($tmp2,&DWP($i*4,$b,"",0)); # *b - &sub($tmp1,$c); - &mov($c,0); - &adc($c,$c); - &sub($tmp1,$tmp2); - &adc($c,0); - &mov(&DWP($i*4,$r,"",0),$tmp1); # *r - } - - &comment(""); - &add($a,32); - &add($b,32); - &add($r,32); - &sub($num,8); - &jnz(&label("aw_loop")); - - &set_label("aw_finish",0); - &mov($num,&wparam(3)); # get num - &and($num,7); - &jz(&label("aw_end")); - - for ($i=0; $i<7; $i++) - { - &comment("Tail Round $i"); - &mov($tmp1,&DWP($i*4,$a,"",0)); # *a - &mov($tmp2,&DWP($i*4,$b,"",0));# *b - &sub($tmp1,$c); - &mov($c,0); - &adc($c,$c); - &sub($tmp1,$tmp2); - &adc($c,0); - &dec($num) if ($i != 6); - &mov(&DWP($i*4,$r,"",0),$tmp1); # *a - &jz(&label("aw_end")) if ($i != 6); - } - &set_label("aw_end",0); - -# &mov("eax",$c); # $c is "eax" - - &function_end($name); - } - -1; diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c index 1729b479d43e..d38f33716477 100644 --- a/crypto/bn/asm/x86_64-gcc.c +++ 
b/crypto/bn/asm/x86_64-gcc.c @@ -1,3 +1,12 @@ +/* + * Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + #include "../bn_lcl.h" #if !(defined(__GNUC__) && __GNUC__>=2) # include "../bn_asm.c" /* kind of dirty hack for Sun Studio */ @@ -5,7 +14,7 @@ /*- * x86_64 BIGNUM accelerator version 0.1, December 2002. * - * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL + * Implemented by Andy Polyakov <appro@openssl.org> for the OpenSSL * project. * * Rights for redistribution and usage in source and binary forms are @@ -111,7 +120,7 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG c1 = 0; if (num <= 0) - return (c1); + return c1; while (num & ~3) { mul_add(rp[0], ap[0], w, c1); @@ -133,7 +142,7 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, return c1; } - return (c1); + return c1; } BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) @@ -141,7 +150,7 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) BN_ULONG c1 = 0; if (num <= 0) - return (c1); + return c1; while (num & ~3) { mul(rp[0], ap[0], w, c1); @@ -161,7 +170,7 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) return c1; mul(rp[2], ap[2], w, c1); } - return (c1); + return c1; } void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) @@ -216,9 +225,10 @@ BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, " adcq (%5,%2,8),%0 \n" " movq %0,(%3,%2,8) \n" " lea 1(%2),%2 \n" - " loop 1b \n" - " sbbq %0,%0 \n":"=&r" (ret), "+c"(n), - "+r"(i) + " dec %1 \n" + " jnz 1b \n" + " sbbq %0,%0 \n" + :"=&r" (ret), "+c"(n), "+r"(i) :"r"(rp), "r"(ap), "r"(bp) :"cc", "memory"); @@ -242,9 +252,10 @@ BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, " sbbq (%5,%2,8),%0 \n" " movq %0,(%3,%2,8) \n" " lea 1(%2),%2 \n" - " loop 1b \n" - " sbbq %0,%0 \n":"=&r" (ret), "+c"(n), - "+r"(i) + " dec %1 \n" + " jnz 1b \n" + " sbbq %0,%0 \n" + :"=&r" (ret), "+c"(n), "+r"(i) :"r"(rp), "r"(ap), "r"(bp) :"cc", "memory"); @@ -259,7 +270,7 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) int c = 0; if (n <= 0) - return ((BN_ULONG)0); + return (BN_ULONG)0; for (;;) { t1 = a[0]; @@ -298,7 +309,7 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) b += 4; r += 4; } - return (c); + return c; } # endif diff --git a/crypto/bn/asm/x86_64-gf2m.pl b/crypto/bn/asm/x86_64-gf2m.pl index 42bbec2fb7ef..0fd6e985d7b0 100755 --- a/crypto/bn/asm/x86_64-gf2m.pl +++ b/crypto/bn/asm/x86_64-gf2m.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -31,7 +38,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; ($lo,$hi)=("%rax","%rdx"); $a=$lo; @@ -47,7 +54,9 @@ $code.=<<___; .type _mul_1x1,\@abi-omnipotent .align 16 _mul_1x1: +.cfi_startproc sub \$128+8,%rsp +.cfi_adjust_cfa_offset 128+8 mov \$-1,$a1 lea ($a,$a),$i0 shr \$3,$a1 @@ -59,7 +68,7 @@ _mul_1x1: sar \$63,$i0 # broadcast 62nd bit lea (,$a1,4),$a4 and $b,$a - sar \$63,$i1 # boardcast 61st bit + sar \$63,$i1 # broadcast 61st bit mov $a,$hi # $a is $lo shl \$63,$lo and $b,$i0 @@ -153,8 +162,10 @@ $code.=<<___; xor $i1,$hi add \$128+8,%rsp +.cfi_adjust_cfa_offset -128-8 ret .Lend_mul_1x1: +.cfi_endproc .size _mul_1x1,.-_mul_1x1 ___ @@ -167,8 +178,10 @@ $code.=<<___; .type bn_GF2m_mul_2x2,\@abi-omnipotent .align 16 bn_GF2m_mul_2x2: - mov OPENSSL_ia32cap_P(%rip),%rax - bt \$33,%rax +.cfi_startproc + mov %rsp,%rax + mov OPENSSL_ia32cap_P(%rip),%r10 + bt \$33,%r10 jnc .Lvanilla_mul_2x2 movq $a1,%xmm0 @@ -203,6 +216,7 @@ $code.=<<___; .align 16 .Lvanilla_mul_2x2: lea -8*17(%rsp),%rsp +.cfi_adjust_cfa_offset 8*17 ___ $code.=<<___ if ($win64); mov `8*17+40`(%rsp),$b0 @@ -211,10 +225,15 @@ $code.=<<___ if ($win64); ___ $code.=<<___; mov %r14,8*10(%rsp) +.cfi_rel_offset %r14,8*10 mov %r13,8*11(%rsp) +.cfi_rel_offset %r13,8*11 mov %r12,8*12(%rsp) +.cfi_rel_offset %r12,8*12 mov %rbp,8*13(%rsp) +.cfi_rel_offset %rbp,8*13 mov %rbx,8*14(%rsp) +.cfi_rel_offset %rbx,8*14 .Lbody_mul_2x2: mov $rp,32(%rsp) # save the arguments mov $a1,40(%rsp) @@ -262,10 +281,15 @@ $code.=<<___; mov $lo,8(%rbp) mov 8*10(%rsp),%r14 +.cfi_restore %r14 mov 8*11(%rsp),%r13 +.cfi_restore %r13 mov 8*12(%rsp),%r12 +.cfi_restore %r12 mov 8*13(%rsp),%rbp +.cfi_restore %rbp mov 8*14(%rsp),%rbx +.cfi_restore %rbx ___ $code.=<<___ if ($win64); mov 8*15(%rsp),%rdi @@ -273,8 +297,11 @@ $code.=<<___ if ($win64); ___ $code.=<<___; lea 8*17(%rsp),%rsp +.cfi_adjust_cfa_offset -8*17 +.Lepilogue_mul_2x2: ret .Lend_mul_2x2: +.cfi_endproc .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 .asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>" .align 16 @@ -305,13 +332,19 @@ se_handler: pushfq sub \$64,%rsp - mov 152($context),%rax # pull context->Rsp + mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip lea .Lbody_mul_2x2(%rip),%r10 cmp %r10,%rbx # context->Rip<"prologue" label jb .Lin_prologue + mov 152($context),%rax # pull context->Rsp + + lea .Lepilogue_mul_2x2(%rip),%r10 + cmp %r10,%rbx # context->Rip>="epilogue" label + jae .Lin_prologue + mov 8*10(%rax),%r14 # mimic epilogue mov 8*11(%rax),%r13 mov 8*12(%rax),%r12 @@ -328,8 +361,9 @@ se_handler: mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 -.Lin_prologue: lea 8*17(%rax),%rax + +.Lin_prologue: mov %rax,152($context) # restore context->Rsp mov 40($disp),%rdi # disp->ContextRecord diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl index 80492d8e6381..c051135e30dd 100755 --- a/crypto/bn/asm/x86_64-mont.pl +++ b/crypto/bn/asm/x86_64-mont.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! 
/usr/bin/env perl +# Copyright 2005-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -50,7 +57,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` @@ -97,8 +104,10 @@ $code=<<___; .type bn_mul_mont,\@function,6 .align 16 bn_mul_mont: +.cfi_startproc mov ${num}d,${num}d mov %rsp,%rax +.cfi_def_cfa_register %rax test \$3,${num}d jnz .Lmul_enter cmp \$8,${num}d @@ -117,11 +126,17 @@ $code.=<<___; .align 16 .Lmul_enter: push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 neg $num mov %rsp,%r11 @@ -129,7 +144,9 @@ $code.=<<___; neg $num # restore $num and \$-1024,%r10 # minimize TLB usage - # Some OSes, *cough*-dows, insist on stack being "wired" to + # An OS-agnostic version of __chkstk. + # + # Some OSes (Windows) insist on stack being "wired" to # physical memory in strictly sequential manner, i.e. if stack # allocation spans two pages, then reference to farmost one can # be punishable by SEGV. But page walking can do good even on @@ -152,6 +169,7 @@ $code.=<<___; .Lmul_page_walk_done: mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 .Lmul_body: mov $bp,%r12 # reassign $bp ___ @@ -293,45 +311,54 @@ $code.=<<___; xor $i,$i # i=0 and clear CF! mov (%rsp),%rax # tp[0] - lea (%rsp),$ap # borrow ap for tp mov $num,$j # j=num - jmp .Lsub + .align 16 .Lsub: sbb ($np,$i,8),%rax mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] - mov 8($ap,$i,8),%rax # tp[i+1] + mov 8(%rsp,$i,8),%rax # tp[i+1] lea 1($i),$i # i++ - dec $j # doesnn't affect CF! + dec $j # doesn't affect CF! 
jnz .Lsub sbb \$0,%rax # handle upmost overflow bit + mov \$-1,%rbx + xor %rax,%rbx # not %rax xor $i,$i - and %rax,$ap - not %rax - mov $rp,$np - and %rax,$np mov $num,$j # j=num - or $np,$ap # ap=borrow?tp:rp -.align 16 -.Lcopy: # copy or in-place refresh - mov ($ap,$i,8),%rax - mov $i,(%rsp,$i,8) # zap temporary vector - mov %rax,($rp,$i,8) # rp[i]=tp[i] + +.Lcopy: # conditional copy + mov ($rp,$i,8),%rcx + mov (%rsp,$i,8),%rdx + and %rbx,%rcx + and %rax,%rdx + mov $num,(%rsp,$i,8) # zap temporary vector + or %rcx,%rdx + mov %rdx,($rp,$i,8) # rp[i]=tp[i] lea 1($i),$i sub \$1,$j jnz .Lcopy mov 8(%rsp,$num,8),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 +.cfi_restore %r15 mov -40(%rsi),%r14 +.cfi_restore %r14 mov -32(%rsi),%r13 +.cfi_restore %r13 mov -24(%rsi),%r12 +.cfi_restore %r12 mov -16(%rsi),%rbp +.cfi_restore %rbp mov -8(%rsi),%rbx +.cfi_restore %rbx lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmul_epilogue: ret +.cfi_endproc .size bn_mul_mont,.-bn_mul_mont ___ {{{ @@ -341,8 +368,10 @@ $code.=<<___; .type bn_mul4x_mont,\@function,6 .align 16 bn_mul4x_mont: +.cfi_startproc mov ${num}d,${num}d mov %rsp,%rax +.cfi_def_cfa_register %rax .Lmul4x_enter: ___ $code.=<<___ if ($addx); @@ -352,11 +381,17 @@ $code.=<<___ if ($addx); ___ $code.=<<___; push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 neg $num mov %rsp,%r11 @@ -380,6 +415,7 @@ $code.=<<___; .Lmul4x_page_walk_done: mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 .Lmul4x_body: mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp mov %rdx,%r12 # reassign $bp @@ -686,10 +722,10 @@ ___ my @ri=("%rax","%rdx",$m0,$m1); $code.=<<___; mov 16(%rsp,$num,8),$rp # restore $rp + lea -4($num),$j mov 0(%rsp),@ri[0] # tp[0] - pxor %xmm0,%xmm0 mov 8(%rsp),@ri[1] # tp[1] - shr \$2,$num # num/=4 + shr \$2,$j # j=num/4-1 lea (%rsp),$ap # borrow ap for tp xor $i,$i # i=0 and clear CF! @@ -697,9 +733,7 @@ $code.=<<___; mov 16($ap),@ri[2] # tp[2] mov 24($ap),@ri[3] # tp[3] sbb 8($np),@ri[1] - lea -1($num),$j # j=num/4-1 - jmp .Lsub4x -.align 16 + .Lsub4x: mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] @@ -714,7 +748,7 @@ $code.=<<___; mov 56($ap,$i,8),@ri[3] sbb 40($np,$i,8),@ri[1] lea 4($i),$i # i++ - dec $j # doesnn't affect CF! + dec $j # doesn't affect CF! 
jnz .Lsub4x mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] @@ -726,48 +760,58 @@ $code.=<<___; sbb \$0,@ri[0] # handle upmost overflow bit mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] - xor $i,$i # i=0 - and @ri[0],$ap - not @ri[0] - mov $rp,$np - and @ri[0],$np - lea -1($num),$j - or $np,$ap # ap=borrow?tp:rp - - movdqu ($ap),%xmm1 - movdqa %xmm0,(%rsp) - movdqu %xmm1,($rp) + pxor %xmm0,%xmm0 + movq @ri[0],%xmm4 + pcmpeqd %xmm5,%xmm5 + pshufd \$0,%xmm4,%xmm4 + mov $num,$j + pxor %xmm4,%xmm5 + shr \$2,$j # j=num/4 + xor %eax,%eax # i=0 + jmp .Lcopy4x .align 16 -.Lcopy4x: # copy or in-place refresh - movdqu 16($ap,$i),%xmm2 - movdqu 32($ap,$i),%xmm1 - movdqa %xmm0,16(%rsp,$i) - movdqu %xmm2,16($rp,$i) - movdqa %xmm0,32(%rsp,$i) - movdqu %xmm1,32($rp,$i) - lea 32($i),$i +.Lcopy4x: # conditional copy + movdqa (%rsp,%rax),%xmm1 + movdqu ($rp,%rax),%xmm2 + pand %xmm4,%xmm1 + pand %xmm5,%xmm2 + movdqa 16(%rsp,%rax),%xmm3 + movdqa %xmm0,(%rsp,%rax) + por %xmm2,%xmm1 + movdqu 16($rp,%rax),%xmm2 + movdqu %xmm1,($rp,%rax) + pand %xmm4,%xmm3 + pand %xmm5,%xmm2 + movdqa %xmm0,16(%rsp,%rax) + por %xmm2,%xmm3 + movdqu %xmm3,16($rp,%rax) + lea 32(%rax),%rax dec $j jnz .Lcopy4x - - shl \$2,$num - movdqu 16($ap,$i),%xmm2 - movdqa %xmm0,16(%rsp,$i) - movdqu %xmm2,16($rp,$i) ___ } $code.=<<___; mov 8(%rsp,$num,8),%rsi # restore %rsp +.cfi_def_cfa %rsi, 8 mov \$1,%rax mov -48(%rsi),%r15 +.cfi_restore %r15 mov -40(%rsi),%r14 +.cfi_restore %r14 mov -32(%rsi),%r13 +.cfi_restore %r13 mov -24(%rsi),%r12 +.cfi_restore %r12 mov -16(%rsi),%rbp +.cfi_restore %rbp mov -8(%rsi),%rbx +.cfi_restore %rbx lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmul4x_epilogue: ret +.cfi_endproc .size bn_mul4x_mont,.-bn_mul4x_mont ___ }}} @@ -795,14 +839,22 @@ $code.=<<___; .type bn_sqr8x_mont,\@function,6 .align 32 bn_sqr8x_mont: +.cfi_startproc mov %rsp,%rax +.cfi_def_cfa_register %rax .Lsqr8x_enter: push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 .Lsqr8x_prologue: mov ${num}d,%r10d @@ -858,6 +910,7 @@ bn_sqr8x_mont: mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 .Lsqr8x_body: movq $nptr, %xmm2 # save pointer to modulus @@ -927,6 +980,7 @@ $code.=<<___; pxor %xmm0,%xmm0 pshufd \$0,%xmm1,%xmm1 mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 jmp .Lsqr8x_cond_copy .align 32 @@ -956,14 +1010,22 @@ $code.=<<___; mov \$1,%rax mov -48(%rsi),%r15 +.cfi_restore %r15 mov -40(%rsi),%r14 +.cfi_restore %r14 mov -32(%rsi),%r13 +.cfi_restore %r13 mov -24(%rsi),%r12 +.cfi_restore %r12 mov -16(%rsi),%rbp +.cfi_restore %rbp mov -8(%rsi),%rbx +.cfi_restore %rbx lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lsqr8x_epilogue: ret +.cfi_endproc .size bn_sqr8x_mont,.-bn_sqr8x_mont ___ }}} @@ -975,14 +1037,22 @@ $code.=<<___; .type bn_mulx4x_mont,\@function,6 .align 32 bn_mulx4x_mont: +.cfi_startproc mov %rsp,%rax +.cfi_def_cfa_register %rax .Lmulx4x_enter: push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 .Lmulx4x_prologue: shl \$3,${num}d # convert $num to bytes @@ -1028,6 +1098,7 @@ bn_mulx4x_mont: mov $n0, 24(%rsp) # save *n0 mov $rp, 32(%rsp) # save $rp mov %rax,40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 mov $num,48(%rsp) # inner counter jmp .Lmulx4x_body @@ -1277,6 +1348,7 @@ $code.=<<___; pxor %xmm0,%xmm0 pshufd \$0,%xmm1,%xmm1 mov 40(%rsp),%rsi # restore %rsp 
@@ -1306,14 +1378,22 @@ $code.=<<___;
 	mov	\$1,%rax
 	mov	-48(%rsi),%r15
+.cfi_restore	%r15
 	mov	-40(%rsi),%r14
+.cfi_restore	%r14
 	mov	-32(%rsi),%r13
+.cfi_restore	%r13
 	mov	-24(%rsi),%r12
+.cfi_restore	%r12
 	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lmulx4x_epilogue:
 	ret
+.cfi_endproc
 .size	bn_mulx4x_mont,.-bn_mulx4x_mont
 ___
 }}}
@@ -1392,12 +1472,12 @@ sqr_handler:
 	mov	0(%r11),%r10d		# HandlerData[0]
 	lea	(%rsi,%r10),%r10	# end of prologue label
-	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
+	cmp	%r10,%rbx		# context->Rip<.Lsqr_prologue
 	jb	.Lcommon_seh_tail

 	mov	4(%r11),%r10d		# HandlerData[1]
 	lea	(%rsi,%r10),%r10	# body label
-	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
+	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
 	jb	.Lcommon_pop_regs

 	mov	152($context),%rax	# pull context->Rsp
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 42178e455a98..ad6e8ada3ce7 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+

 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -35,7 +42,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";

-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 *STDOUT=*OUT;

 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
@@ -86,8 +93,10 @@ $code=<<___;
 .type	bn_mul_mont_gather5,\@function,6
 .align	64
 bn_mul_mont_gather5:
+.cfi_startproc
 	mov	${num}d,${num}d
 	mov	%rsp,%rax
+.cfi_def_cfa_register	%rax
 	test	\$7,${num}d
 	jnz	.Lmul_enter
 ___
@@ -101,11 +110,17 @@ $code.=<<___;
 .Lmul_enter:
 	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15

 	neg	$num
 	mov	%rsp,%r11
@@ -113,7 +128,9 @@ $code.=<<___;
 	neg	$num			# restore $num
 	and	\$-1024,%r10		# minimize TLB usage

-	# Some OSes, *cough*-dows, insist on stack being "wired" to
+	# An OS-agnostic version of __chkstk.
+	#
+	# Some OSes (Windows) insist on stack being "wired" to
 	# physical memory in strictly sequential manner, i.e. if stack
 	# allocation spans two pages, then reference to farmost one can
 	# be punishable by SEGV. But page walking can do good even on
@@ -136,6 +153,7 @@ $code.=<<___;
 	lea	.Linc(%rip),%r10

 	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
+.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
 .Lmul_body:

 	lea	128($bp),%r12		# reassign $bp (+size optimization)
@@ -401,38 +419,48 @@ $code.=<<___;
 	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
 	mov	8($ap,$i,8),%rax	# tp[i+1]
 	lea	1($i),$i		# i++
-	dec	$j			# doesnn't affect CF!
+	dec	$j			# doesn't affect CF!
 	jnz	.Lsub
 	sbb	\$0,%rax		# handle upmost overflow bit
+	mov	\$-1,%rbx
+	xor	%rax,%rbx
 	xor	$i,$i
-	and	%rax,$ap
-	not	%rax
-	mov	$rp,$np
-	and	%rax,$np
 	mov	$num,$j			# j=num
-	or	$np,$ap			# ap=borrow?tp:rp
-.align	16
-.Lcopy:					# copy or in-place refresh
-	mov	($ap,$i,8),%rax
+
+.Lcopy:					# conditional copy
+	mov	($rp,$i,8),%rcx
+	mov	(%rsp,$i,8),%rdx
+	and	%rbx,%rcx
+	and	%rax,%rdx
 	mov	$i,(%rsp,$i,8)		# zap temporary vector
-	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
+	or	%rcx,%rdx
+	mov	%rdx,($rp,$i,8)		# rp[i]=tp[i]
 	lea	1($i),$i
 	sub	\$1,$j
 	jnz	.Lcopy

 	mov	8(%rsp,$num,8),%rsi	# restore %rsp
+.cfi_def_cfa	%rsi,8
 	mov	\$1,%rax
 	mov	-48(%rsi),%r15
+.cfi_restore	%r15
 	mov	-40(%rsi),%r14
+.cfi_restore	%r14
 	mov	-32(%rsi),%r13
+.cfi_restore	%r13
 	mov	-24(%rsi),%r12
+.cfi_restore	%r12
 	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lmul_epilogue:
 	ret
+.cfi_endproc
 .size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
 ___
 {{{
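The reworded comment above explains the .L*_page_walk loops that recur throughout both files: before the frame pointer is dropped far below the current stack pointer, the prologue touches every intervening 4KB page in order, much as MSVC's __chkstk does on Windows. A hypothetical C rendering of the probe loop (the real probing is done inline in assembly):

    #include <stddef.h>

    #define PAGE 4096

    /* Touch each page between the current stack pointer and the new,
     * much lower one, top down, so guard pages are faulted in one at a
     * time instead of being skipped over. */
    static void page_walk(volatile char *sp, ptrdiff_t frame_size)
    {
        while (frame_size > PAGE) {
            sp -= PAGE;
            (void)*sp;                /* probe one page */
            frame_size -= PAGE;
        }
        (void)*(sp - frame_size);     /* probe the final page */
    }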
@@ -442,8 +470,10 @@ $code.=<<___;
 .type	bn_mul4x_mont_gather5,\@function,6
 .align	32
 bn_mul4x_mont_gather5:
+.cfi_startproc
 	.byte	0x67
 	mov	%rsp,%rax
+.cfi_def_cfa_register	%rax
 .Lmul4x_enter:
 ___
 $code.=<<___ if ($addx);
@@ -453,11 +483,17 @@ $code.=<<___ if ($addx);
 ___
 $code.=<<___;
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 .Lmul4x_prologue:

 	.byte	0x67
@@ -513,22 +549,32 @@ $code.=<<___;
 	neg	$num

 	mov	%rax,40(%rsp)
+.cfi_cfa_expression	%rsp+40,deref,+8
 .Lmul4x_body:

 	call	mul4x_internal

 	mov	40(%rsp),%rsi		# restore %rsp
+.cfi_def_cfa	%rsi,8
 	mov	\$1,%rax
 	mov	-48(%rsi),%r15
+.cfi_restore	%r15
 	mov	-40(%rsi),%r14
+.cfi_restore	%r14
 	mov	-32(%rsi),%r13
+.cfi_restore	%r13
 	mov	-24(%rsi),%r12
+.cfi_restore	%r12
 	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lmul4x_epilogue:
 	ret
+.cfi_endproc
 .size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5

 .type	mul4x_internal,\@abi-omnipotent
@@ -1040,7 +1086,7 @@ my $bptr="%rdx";	# const void *table,
 my $nptr="%rcx";	# const BN_ULONG *nptr,
 my $n0  ="%r8";		# const BN_ULONG *n0);
 my $num ="%r9";		# int num, has to be divisible by 8
-			# int pwr 
+			# int pwr

 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
 my @A0=("%r10","%r11");
@@ -1052,7 +1098,9 @@ $code.=<<___;
 .type	bn_power5,\@function,6
 .align	32
 bn_power5:
+.cfi_startproc
 	mov	%rsp,%rax
+.cfi_def_cfa_register	%rax
 ___
 $code.=<<___ if ($addx);
 	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
@@ -1062,11 +1110,17 @@ $code.=<<___ if ($addx);
 ___
 $code.=<<___;
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 .Lpower5_prologue:

 	shl	\$3,${num}d		# convert $num to bytes
@@ -1117,7 +1171,7 @@ $code.=<<___;
 	ja	.Lpwr_page_walk
 .Lpwr_page_walk_done:

-	mov	$num,%r10	
+	mov	$num,%r10
 	neg	$num

 ##############################################################
@@ -1131,6 +1185,7 @@ $code.=<<___;
 	#
 	mov	$n0, 32(%rsp)
 	mov	%rax, 40(%rsp)		# save original %rsp
+.cfi_cfa_expression	%rsp+40,deref,+8
 .Lpower5_body:
 	movq	$rptr,%xmm1		# save $rptr, used in sqr8x
 	movq	$nptr,%xmm2		# save $nptr
@@ -1157,16 +1212,25 @@ $code.=<<___;
 	call	mul4x_internal

 	mov	40(%rsp),%rsi		# restore %rsp
+.cfi_def_cfa	%rsi,8
 	mov	\$1,%rax
 	mov	-48(%rsi),%r15
+.cfi_restore	%r15
 	mov	-40(%rsi),%r14
+.cfi_restore	%r14
 	mov	-32(%rsi),%r13
+.cfi_restore	%r13
 	mov	-24(%rsi),%r12
+.cfi_restore	%r12
 	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lpower5_epilogue:
 	ret
+.cfi_endproc
 .size	bn_power5,.-bn_power5

 .globl	bn_sqr8x_internal
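bn_power5, whose prologue and epilogue are annotated above, packages one window of the constant-time exponentiation so control never leaves assembly between operations: several Montgomery squarings back to back (five, matching the 5-bit window the "5" in the file name refers to), then a single multiplication by a power of the base fetched from the scattered table; that is why $rptr and $nptr are parked in %xmm registers that survive the internal calls. In outline, with stand-in names that are not OpenSSL's:

    #include <stddef.h>
    #include <stdint.h>

    /* stand-ins for the assembly internals, prototypes only */
    void mont_sqr(uint64_t *r, size_t num);                     /* r = r^2 mod n */
    void mont_mul(uint64_t *r, const uint64_t *b, size_t num);  /* r = r*b mod n */
    void gather(uint64_t *out, const uint64_t *tbl, unsigned pwr, size_t num);

    /* one 5-bit window step: r = r^(2^5) * tbl[pwr], Montgomery domain */
    void power5_step(uint64_t *r, uint64_t *tmp, const uint64_t *tbl,
                     unsigned pwr, size_t num)
    {
        for (int i = 0; i < 5; i++)
            mont_sqr(r, num);
        gather(tmp, tbl, pwr, num);   /* constant-time table fetch */
        mont_mul(r, tmp, num);
    }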
@@ -2026,7 +2090,7 @@ __bn_post4x_internal:
 	jnz	.Lsqr4x_sub

 	mov	$num,%r10		# prepare for back-to-back call
-	neg	$num			# restore $num 
+	neg	$num			# restore $num
 	ret
 .size	__bn_post4x_internal,.-__bn_post4x_internal
 ___
@@ -2046,14 +2110,22 @@ bn_from_montgomery:
 .type	bn_from_mont8x,\@function,6
 .align	32
 bn_from_mont8x:
+.cfi_startproc
 	.byte	0x67
 	mov	%rsp,%rax
+.cfi_def_cfa_register	%rax
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 .Lfrom_prologue:

 	shl	\$3,${num}d		# convert $num to bytes
@@ -2118,6 +2190,7 @@ bn_from_mont8x:
 	#
 	mov	$n0, 32(%rsp)
 	mov	%rax, 40(%rsp)		# save original %rsp
+.cfi_cfa_expression	%rsp+40,deref,+8
 .Lfrom_body:
 	mov	$num,%r11
 	lea	48(%rsp),%rax
@@ -2161,7 +2234,6 @@ $code.=<<___ if ($addx);
 	pxor	%xmm0,%xmm0
 	lea	48(%rsp),%rax
-	mov	40(%rsp),%rsi		# restore %rsp
 	jmp	.Lfrom_mont_zero

 .align	32
@@ -2173,11 +2245,12 @@ $code.=<<___;
 	pxor	%xmm0,%xmm0
 	lea	48(%rsp),%rax
-	mov	40(%rsp),%rsi		# restore %rsp
 	jmp	.Lfrom_mont_zero

 .align	32
 .Lfrom_mont_zero:
+	mov	40(%rsp),%rsi		# restore %rsp
+.cfi_def_cfa	%rsi,8
 	movdqa	%xmm0,16*0(%rax)
 	movdqa	%xmm0,16*1(%rax)
 	movdqa	%xmm0,16*2(%rax)
@@ -2188,14 +2261,22 @@ $code.=<<___;
 	mov	\$1,%rax
 	mov	-48(%rsi),%r15
+.cfi_restore	%r15
 	mov	-40(%rsi),%r14
+.cfi_restore	%r14
 	mov	-32(%rsi),%r13
+.cfi_restore	%r13
 	mov	-24(%rsi),%r12
+.cfi_restore	%r12
 	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lfrom_epilogue:
 	ret
+.cfi_endproc
 .size	bn_from_mont8x,.-bn_from_mont8x
 ___
 }
@@ -2208,14 +2289,22 @@ $code.=<<___;
 .type	bn_mulx4x_mont_gather5,\@function,6
 .align	32
 bn_mulx4x_mont_gather5:
+.cfi_startproc
 	mov	%rsp,%rax
+.cfi_def_cfa_register	%rax
 .Lmulx4x_enter:
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 .Lmulx4x_prologue:

 	shl	\$3,${num}d		# convert $num to bytes
@@ -2249,7 +2338,7 @@ bn_mulx4x_mont_gather5:
 	mov	\$0,%r10
 	cmovc	%r10,%r11
 	sub	%r11,%rbp
-.Lmulx4xsp_done:	
+.Lmulx4xsp_done:
 	and	\$-64,%rbp		# ensure alignment
 	mov	%rsp,%r11
 	sub	%rbp,%r11
@@ -2281,21 +2370,31 @@ bn_mulx4x_mont_gather5:
 	#
 	mov	$n0, 32(%rsp)		# save *n0
 	mov	%rax,40(%rsp)		# save original %rsp
+.cfi_cfa_expression	%rsp+40,deref,+8
 .Lmulx4x_body:

 	call	mulx4x_internal

 	mov	40(%rsp),%rsi		# restore %rsp
+.cfi_def_cfa	%rsi,8
 	mov	\$1,%rax
 	mov	-48(%rsi),%r15
+.cfi_restore	%r15
 	mov	-40(%rsi),%r14
+.cfi_restore	%r14
 	mov	-32(%rsi),%r13
+.cfi_restore	%r13
 	mov	-24(%rsi),%r12
+.cfi_restore	%r12
 	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lmulx4x_epilogue:
 	ret
+.cfi_endproc
 .size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5

 .type	mulx4x_internal,\@abi-omnipotent
@@ -2323,7 +2422,7 @@ my $N=$STRIDE/4;		# should match cache line size
 $code.=<<___;
 	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
 	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
-	lea	88-112(%rsp,%r10),%r10	# place the mask after tp[num+1] (+ICache optimizaton)
+	lea	88-112(%rsp,%r10),%r10	# place the mask after tp[num+1] (+ICache optimization)
 	lea	128($bp),$bptr		# size optimization

 	pshufd	\$0,%xmm5,%xmm5		# broadcast index
@@ -2673,14 +2772,22 @@ $code.=<<___;
 .type	bn_powerx5,\@function,6
 .align	32
 bn_powerx5:
+.cfi_startproc
 	mov	%rsp,%rax
+.cfi_def_cfa_register	%rax
 .Lpowerx5_enter:
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 .Lpowerx5_prologue:

 	shl	\$3,${num}d		# convert $num to bytes
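The broadcast-and-compare setup above (pshufd \$0,%xmm5,%xmm5 spreading the requested index, with the masks parked just past tp[num+1]) drives the constant-time gather: every one of the 32 entries for the 5-bit window is read, and compare-generated masks keep only the one that matches. Stripped of the cache-line interleaving that $STRIDE describes, the access pattern amounts to this illustrative C (not the generated code):

    #include <stddef.h>
    #include <stdint.h>

    /* read all 32 window entries so the addresses touched never
     * depend on the secret index "pwr" */
    static void gather(uint64_t *out, const uint64_t *tbl,
                       unsigned pwr, size_t num)
    {
        for (size_t i = 0; i < num; i++)
            out[i] = 0;
        for (unsigned k = 0; k < 32; k++) {
            uint64_t mask = 0 - (uint64_t)(k == pwr);  /* pcmpeqd in the asm */
            for (size_t i = 0; i < num; i++)
                out[i] |= tbl[k * num + i] & mask;
        }
    }

The real table additionally interleaves the entries so every pass touches each cache line the same way, which is why the comment notes that $N should match the cache line size.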
@@ -2731,7 +2838,7 @@ bn_powerx5:
 	ja	.Lpwrx_page_walk
 .Lpwrx_page_walk_done:

-	mov	$num,%r10	
+	mov	$num,%r10
 	neg	$num

 ##############################################################
@@ -2752,6 +2859,7 @@ bn_powerx5:
 	movq	$bptr,%xmm4
 	mov	$n0, 32(%rsp)
 	mov	%rax, 40(%rsp)		# save original %rsp
+.cfi_cfa_expression	%rsp+40,deref,+8
 .Lpowerx5_body:

 	call	__bn_sqrx8x_internal
@@ -2774,17 +2882,26 @@ bn_powerx5:
 	call	mulx4x_internal

 	mov	40(%rsp),%rsi		# restore %rsp
+.cfi_def_cfa	%rsi,8
 	mov	\$1,%rax
 	mov	-48(%rsi),%r15
+.cfi_restore	%r15
 	mov	-40(%rsi),%r14
+.cfi_restore	%r14
 	mov	-32(%rsi),%r13
+.cfi_restore	%r13
 	mov	-24(%rsi),%r12
+.cfi_restore	%r12
 	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lpowerx5_epilogue:
 	ret
+.cfi_endproc
 .size	bn_powerx5,.-bn_powerx5

 .globl	bn_sqrx8x_internal
@@ -3668,8 +3785,8 @@ mul_handler:
 	jb	.Lcommon_seh_tail

 	mov	4(%r11),%r10d		# HandlerData[1]
-	lea	(%rsi,%r10),%r10	# epilogue label
-	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	lea	(%rsi,%r10),%r10	# beginning of body label
+	cmp	%r10,%rbx		# context->Rip<body label
 	jb	.Lcommon_pop_regs

 	mov	152($context),%rax	# pull context->Rsp
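The comment corrections above are not cosmetic: mul_handler's two compares split a function into three unwind states, and the old comments described the wrong boundaries. Below the end-of-prologue offset nothing has been pushed yet; below the body offset the six registers are on the stack but the saved-%rsp slot is not valid yet; past it the full frame exists. As C, with field names invented for the sketch:

    #include <stdint.h>

    /* HandlerData[0..1] hold the two label offsets */
    typedef struct { uint32_t prologue_end, body; } handler_data;

    static int unwind_state(uintptr_t rip, uintptr_t func_base,
                            const handler_data *hd)
    {
        if (rip < func_base + hd->prologue_end)
            return 0;  /* nothing saved yet     -> .Lcommon_seh_tail */
        if (rip < func_base + hd->body)
            return 1;  /* registers pushed only -> .Lcommon_pop_regs */
        return 2;      /* full frame set up, unwind via the saved %rsp */
    }

Getting these ranges right matters because an asynchronous exception can land on any instruction, and restoring registers from a frame that does not exist yet corrupts the unwound context.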