aboutsummaryrefslogtreecommitdiff
path: root/crypto/bn/asm
diff options
context:
space:
mode:
authorSimon L. B. Nielsen <simon@FreeBSD.org>2010-02-28 18:49:43 +0000
committerSimon L. B. Nielsen <simon@FreeBSD.org>2010-02-28 18:49:43 +0000
commitf7a1b4761cf3f798e1b42d703d38221b47ce1eec (patch)
tree21770f10e7f26d05fc9b0fa96a7b6d7b107552c5 /crypto/bn/asm
parentf0c2a617dfb432d01bc5a716eb18dae12e6b45e3 (diff)
downloadsrc-f7a1b4761cf3f798e1b42d703d38221b47ce1eec.tar.gz
src-f7a1b4761cf3f798e1b42d703d38221b47ce1eec.zip
Import OpenSSL 0.9.8m.vendor/openssl/0.9.8m
Notes
Notes: svn path=/vendor-crypto/openssl/dist/; revision=204477 svn path=/vendor-crypto/openssl/0.9.8m/; revision=204478; tag=vendor/openssl/0.9.8m
Diffstat (limited to 'crypto/bn/asm')
-rwxr-xr-xcrypto/bn/asm/alpha-mont.pl317
-rwxr-xr-xcrypto/bn/asm/armv4-mont.pl200
-rwxr-xr-xcrypto/bn/asm/mips3-mont.pl327
-rwxr-xr-xcrypto/bn/asm/ppc-mont.pl323
-rwxr-xr-xcrypto/bn/asm/ppc64-mont.pl918
-rwxr-xr-xcrypto/bn/asm/s390x-mont.pl225
-rwxr-xr-xcrypto/bn/asm/s390x.S678
-rwxr-xr-xcrypto/bn/asm/sparcv9-mont.pl606
-rwxr-xr-xcrypto/bn/asm/sparcv9a-mont.pl882
-rwxr-xr-xcrypto/bn/asm/via-mont.pl242
-rwxr-xr-xcrypto/bn/asm/x86-mont.pl591
-rw-r--r--crypto/bn/asm/x86_64-gcc.c18
12 files changed, 11 insertions, 5316 deletions
diff --git a/crypto/bn/asm/alpha-mont.pl b/crypto/bn/asm/alpha-mont.pl
deleted file mode 100755
index 7a2cc3173b0e..000000000000
--- a/crypto/bn/asm/alpha-mont.pl
+++ /dev/null
@@ -1,317 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# On 21264 RSA sign performance improves by 70/35/20/15 percent for
-# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
-# instructed to '-tune host' code with in-line assembler. Other
-# benchmarks improve by 15-20%. To anchor it to something else, the
-# code provides approximately the same performance per GHz as AMD64.
-# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
-# difference.
-
-# int bn_mul_mont(
-$rp="a0"; # BN_ULONG *rp,
-$ap="a1"; # const BN_ULONG *ap,
-$bp="a2"; # const BN_ULONG *bp,
-$np="a3"; # const BN_ULONG *np,
-$n0="a4"; # const BN_ULONG *n0,
-$num="a5"; # int num);
-
-$lo0="t0";
-$hi0="t1";
-$lo1="t2";
-$hi1="t3";
-$aj="t4";
-$bi="t5";
-$nj="t6";
-$tp="t7";
-$alo="t8";
-$ahi="t9";
-$nlo="t10";
-$nhi="t11";
-$tj="t12";
-$i="s3";
-$j="s4";
-$m1="s5";
-
-$code=<<___;
-#include <asm.h>
-#include <regdef.h>
-
-.text
-
-.set noat
-.set noreorder
-
-.globl bn_mul_mont
-.align 5
-.ent bn_mul_mont
-bn_mul_mont:
- lda sp,-40(sp)
- stq ra,0(sp)
- stq s3,8(sp)
- stq s4,16(sp)
- stq s5,24(sp)
- stq fp,32(sp)
- mov sp,fp
- .mask 0x0400f000,-40
- .frame fp,40,ra
- .prologue 0
-
- .align 4
- .set reorder
- sextl $num,$num
- mov 0,v0
- cmplt $num,4,AT
- bne AT,.Lexit
-
- ldq $hi0,0($ap) # ap[0]
- s8addq $num,16,AT
- ldq $aj,8($ap)
- subq sp,AT,sp
- ldq $bi,0($bp) # bp[0]
- mov -4096,AT
- ldq $n0,0($n0)
- and sp,AT,sp
-
- mulq $hi0,$bi,$lo0
- ldq $hi1,0($np) # np[0]
- umulh $hi0,$bi,$hi0
- ldq $nj,8($np)
-
- mulq $lo0,$n0,$m1
-
- mulq $hi1,$m1,$lo1
- umulh $hi1,$m1,$hi1
-
- addq $lo1,$lo0,$lo1
- cmpult $lo1,$lo0,AT
- addq $hi1,AT,$hi1
-
- mulq $aj,$bi,$alo
- mov 2,$j
- umulh $aj,$bi,$ahi
- mov sp,$tp
-
- mulq $nj,$m1,$nlo
- s8addq $j,$ap,$aj
- umulh $nj,$m1,$nhi
- s8addq $j,$np,$nj
-.align 4
-.L1st:
- .set noreorder
- ldq $aj,($aj)
- addl $j,1,$j
- ldq $nj,($nj)
- lda $tp,8($tp)
-
- addq $alo,$hi0,$lo0
- mulq $aj,$bi,$alo
- cmpult $lo0,$hi0,AT
- addq $nlo,$hi1,$lo1
-
- mulq $nj,$m1,$nlo
- addq $ahi,AT,$hi0
- cmpult $lo1,$hi1,v0
- cmplt $j,$num,$tj
-
- umulh $aj,$bi,$ahi
- addq $nhi,v0,$hi1
- addq $lo1,$lo0,$lo1
- s8addq $j,$ap,$aj
-
- umulh $nj,$m1,$nhi
- cmpult $lo1,$lo0,v0
- addq $hi1,v0,$hi1
- s8addq $j,$np,$nj
-
- stq $lo1,-8($tp)
- nop
- unop
- bne $tj,.L1st
- .set reorder
-
- addq $alo,$hi0,$lo0
- addq $nlo,$hi1,$lo1
- cmpult $lo0,$hi0,AT
- cmpult $lo1,$hi1,v0
- addq $ahi,AT,$hi0
- addq $nhi,v0,$hi1
-
- addq $lo1,$lo0,$lo1
- cmpult $lo1,$lo0,v0
- addq $hi1,v0,$hi1
-
- stq $lo1,0($tp)
-
- addq $hi1,$hi0,$hi1
- cmpult $hi1,$hi0,AT
- stq $hi1,8($tp)
- stq AT,16($tp)
-
- mov 1,$i
-.align 4
-.Louter:
- s8addq $i,$bp,$bi
- ldq $hi0,($ap)
- ldq $aj,8($ap)
- ldq $bi,($bi)
- ldq $hi1,($np)
- ldq $nj,8($np)
- ldq $tj,(sp)
-
- mulq $hi0,$bi,$lo0
- umulh $hi0,$bi,$hi0
-
- addq $lo0,$tj,$lo0
- cmpult $lo0,$tj,AT
- addq $hi0,AT,$hi0
-
- mulq $lo0,$n0,$m1
-
- mulq $hi1,$m1,$lo1
- umulh $hi1,$m1,$hi1
-
- addq $lo1,$lo0,$lo1
- cmpult $lo1,$lo0,AT
- mov 2,$j
- addq $hi1,AT,$hi1
-
- mulq $aj,$bi,$alo
- mov sp,$tp
- umulh $aj,$bi,$ahi
-
- mulq $nj,$m1,$nlo
- s8addq $j,$ap,$aj
- umulh $nj,$m1,$nhi
-.align 4
-.Linner:
- .set noreorder
- ldq $tj,8($tp) #L0
- nop #U1
- ldq $aj,($aj) #L1
- s8addq $j,$np,$nj #U0
-
- ldq $nj,($nj) #L0
- nop #U1
- addq $alo,$hi0,$lo0 #L1
- lda $tp,8($tp)
-
- mulq $aj,$bi,$alo #U1
- cmpult $lo0,$hi0,AT #L0
- addq $nlo,$hi1,$lo1 #L1
- addl $j,1,$j
-
- mulq $nj,$m1,$nlo #U1
- addq $ahi,AT,$hi0 #L0
- addq $lo0,$tj,$lo0 #L1
- cmpult $lo1,$hi1,v0 #U0
-
- umulh $aj,$bi,$ahi #U1
- cmpult $lo0,$tj,AT #L0
- addq $lo1,$lo0,$lo1 #L1
- addq $nhi,v0,$hi1 #U0
-
- umulh $nj,$m1,$nhi #U1
- s8addq $j,$ap,$aj #L0
- cmpult $lo1,$lo0,v0 #L1
- cmplt $j,$num,$tj #U0 # borrow $tj
-
- addq $hi0,AT,$hi0 #L0
- addq $hi1,v0,$hi1 #U1
- stq $lo1,-8($tp) #L1
- bne $tj,.Linner #U0
- .set reorder
-
- ldq $tj,8($tp)
- addq $alo,$hi0,$lo0
- addq $nlo,$hi1,$lo1
- cmpult $lo0,$hi0,AT
- cmpult $lo1,$hi1,v0
- addq $ahi,AT,$hi0
- addq $nhi,v0,$hi1
-
- addq $lo0,$tj,$lo0
- cmpult $lo0,$tj,AT
- addq $hi0,AT,$hi0
-
- ldq $tj,16($tp)
- addq $lo1,$lo0,$j
- cmpult $j,$lo0,v0
- addq $hi1,v0,$hi1
-
- addq $hi1,$hi0,$lo1
- stq $j,($tp)
- cmpult $lo1,$hi0,$hi1
- addq $lo1,$tj,$lo1
- cmpult $lo1,$tj,AT
- addl $i,1,$i
- addq $hi1,AT,$hi1
- stq $lo1,8($tp)
- cmplt $i,$num,$tj # borrow $tj
- stq $hi1,16($tp)
- bne $tj,.Louter
-
- s8addq $num,sp,$tj # &tp[num]
- mov $rp,$bp # put rp aside
- mov sp,$tp
- mov sp,$ap
- mov 0,$hi0 # clear borrow bit
-
-.align 4
-.Lsub: ldq $lo0,($tp)
- ldq $lo1,($np)
- lda $tp,8($tp)
- lda $np,8($np)
- subq $lo0,$lo1,$lo1 # tp[i]-np[i]
- cmpult $lo0,$lo1,AT
- subq $lo1,$hi0,$lo0
- cmpult $lo1,$lo0,$hi0
- or $hi0,AT,$hi0
- stq $lo0,($rp)
- cmpult $tp,$tj,v0
- lda $rp,8($rp)
- bne v0,.Lsub
-
- subq $hi1,$hi0,$hi0 # handle upmost overflow bit
- mov sp,$tp
- mov $bp,$rp # restore rp
-
- and sp,$hi0,$ap
- bic $bp,$hi0,$bp
- bis $bp,$ap,$ap # ap=borrow?tp:rp
-
-.align 4
-.Lcopy: ldq $aj,($ap) # copy or in-place refresh
- lda $tp,8($tp)
- lda $rp,8($rp)
- lda $ap,8($ap)
- stq zero,-8($tp) # zap tp
- cmpult $tp,$tj,AT
- stq $aj,-8($rp)
- bne AT,.Lcopy
- mov 1,v0
-
-.Lexit:
- .set noreorder
- mov fp,sp
- /*ldq ra,0(sp)*/
- ldq s3,8(sp)
- ldq s4,16(sp)
- ldq s5,24(sp)
- ldq fp,32(sp)
- lda sp,40(sp)
- ret (ra)
-.end bn_mul_mont
-.rdata
-.asciiz "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
-___
-
-print $code;
-close STDOUT;
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
deleted file mode 100755
index 05d5dc1a4822..000000000000
--- a/crypto/bn/asm/armv4-mont.pl
+++ /dev/null
@@ -1,200 +0,0 @@
-#!/usr/bin/env perl
-
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# January 2007.
-
-# Montgomery multiplication for ARMv4.
-#
-# Performance improvement naturally varies among CPU implementations
-# and compilers. The code was observed to provide +65-35% improvement
-# [depending on key length, less for longer keys] on ARM920T, and
-# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
-# base and compiler generated code with in-lined umull and even umlal
-# instructions. The latter means that this code didn't really have an
-# "advantage" of utilizing some "secret" instruction.
-#
-# The code is interoperable with Thumb ISA and is rather compact, less
-# than 1/2KB. Windows CE port would be trivial, as it's exclusively
-# about decorations, ABI and instruction syntax are identical.
-
-$num="r0"; # starts as num argument, but holds &tp[num-1]
-$ap="r1";
-$bp="r2"; $bi="r2"; $rp="r2";
-$np="r3";
-$tp="r4";
-$aj="r5";
-$nj="r6";
-$tj="r7";
-$n0="r8";
-########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
-$alo="r10"; # sl, gcc uses it to keep @GOT
-$ahi="r11"; # fp
-$nlo="r12"; # ip
-########### # r13 is stack pointer
-$nhi="r14"; # lr
-########### # r15 is program counter
-
-#### argument block layout relative to &tp[num-1], a.k.a. $num
-$_rp="$num,#12*4";
-# ap permanently resides in r1
-$_bp="$num,#13*4";
-# np permanently resides in r3
-$_n0="$num,#14*4";
-$_num="$num,#15*4"; $_bpend=$_num;
-
-$code=<<___;
-.text
-
-.global bn_mul_mont
-.type bn_mul_mont,%function
-
-.align 2
-bn_mul_mont:
- stmdb sp!,{r0,r2} @ sp points at argument block
- ldr $num,[sp,#3*4] @ load num
- cmp $num,#2
- movlt r0,#0
- addlt sp,sp,#2*4
- blt .Labrt
-
- stmdb sp!,{r4-r12,lr} @ save 10 registers
-
- mov $num,$num,lsl#2 @ rescale $num for byte count
- sub sp,sp,$num @ alloca(4*num)
- sub sp,sp,#4 @ +extra dword
- sub $num,$num,#4 @ "num=num-1"
- add $tp,$bp,$num @ &bp[num-1]
-
- add $num,sp,$num @ $num to point at &tp[num-1]
- ldr $n0,[$_n0] @ &n0
- ldr $bi,[$bp] @ bp[0]
- ldr $aj,[$ap],#4 @ ap[0],ap++
- ldr $nj,[$np],#4 @ np[0],np++
- ldr $n0,[$n0] @ *n0
- str $tp,[$_bpend] @ save &bp[num]
-
- umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
- str $n0,[$_n0] @ save n0 value
- mul $n0,$alo,$n0 @ "tp[0]"*n0
- mov $nlo,#0
- umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
- mov $tp,sp
-
-.L1st:
- ldr $aj,[$ap],#4 @ ap[j],ap++
- mov $alo,$ahi
- mov $ahi,#0
- umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
- ldr $nj,[$np],#4 @ np[j],np++
- mov $nhi,#0
- umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
- adds $nlo,$nlo,$alo
- str $nlo,[$tp],#4 @ tp[j-1]=,tp++
- adc $nlo,$nhi,#0
- cmp $tp,$num
- bne .L1st
-
- adds $nlo,$nlo,$ahi
- mov $nhi,#0
- adc $nhi,$nhi,#0
- ldr $tp,[$_bp] @ restore bp
- str $nlo,[$num] @ tp[num-1]=
- ldr $n0,[$_n0] @ restore n0
- str $nhi,[$num,#4] @ tp[num]=
-
-.Louter:
- sub $tj,$num,sp @ "original" $num-1 value
- sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
- sub $np,$np,$tj @ "rewind" np to &np[1]
- ldr $bi,[$tp,#4]! @ *(++bp)
- ldr $aj,[$ap,#-4] @ ap[0]
- ldr $nj,[$np,#-4] @ np[0]
- ldr $alo,[sp] @ tp[0]
- ldr $tj,[sp,#4] @ tp[1]
-
- mov $ahi,#0
- umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
- str $tp,[$_bp] @ save bp
- mul $n0,$alo,$n0
- mov $nlo,#0
- umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
- mov $tp,sp
-
-.Linner:
- ldr $aj,[$ap],#4 @ ap[j],ap++
- adds $alo,$ahi,$tj @ +=tp[j]
- mov $ahi,#0
- umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
- ldr $nj,[$np],#4 @ np[j],np++
- mov $nhi,#0
- umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
- ldr $tj,[$tp,#8] @ tp[j+1]
- adc $ahi,$ahi,#0
- adds $nlo,$nlo,$alo
- str $nlo,[$tp],#4 @ tp[j-1]=,tp++
- adc $nlo,$nhi,#0
- cmp $tp,$num
- bne .Linner
-
- adds $nlo,$nlo,$ahi
- mov $nhi,#0
- adc $nhi,$nhi,#0
- adds $nlo,$nlo,$tj
- adc $nhi,$nhi,#0
- ldr $tp,[$_bp] @ restore bp
- ldr $tj,[$_bpend] @ restore &bp[num]
- str $nlo,[$num] @ tp[num-1]=
- ldr $n0,[$_n0] @ restore n0
- str $nhi,[$num,#4] @ tp[num]=
-
- cmp $tp,$tj
- bne .Louter
-
- ldr $rp,[$_rp] @ pull rp
- add $num,$num,#4 @ $num to point at &tp[num]
- sub $aj,$num,sp @ "original" num value
- mov $tp,sp @ "rewind" $tp
- mov $ap,$tp @ "borrow" $ap
- sub $np,$np,$aj @ "rewind" $np to &np[0]
-
- subs $tj,$tj,$tj @ "clear" carry flag
-.Lsub: ldr $tj,[$tp],#4
- ldr $nj,[$np],#4
- sbcs $tj,$tj,$nj @ tp[j]-np[j]
- str $tj,[$rp],#4 @ rp[j]=
- teq $tp,$num @ preserve carry
- bne .Lsub
- sbcs $nhi,$nhi,#0 @ upmost carry
- mov $tp,sp @ "rewind" $tp
- sub $rp,$rp,$aj @ "rewind" $rp
-
- and $ap,$tp,$nhi
- bic $np,$rp,$nhi
- orr $ap,$ap,$np @ ap=borrow?tp:rp
-
-.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
- str sp,[$tp],#4 @ zap tp
- str $tj,[$rp],#4
- cmp $tp,$num
- bne .Lcopy
-
- add sp,$num,#4 @ skip over tp[num+1]
- ldmia sp!,{r4-r12,lr} @ restore registers
- add sp,sp,#2*4 @ skip over {r0,r2}
- mov r0,#1
-.Labrt: tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-.size bn_mul_mont,.-bn_mul_mont
-.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
-___
-
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
-print $code;
-close STDOUT;
diff --git a/crypto/bn/asm/mips3-mont.pl b/crypto/bn/asm/mips3-mont.pl
deleted file mode 100755
index 8f9156e02af3..000000000000
--- a/crypto/bn/asm/mips3-mont.pl
+++ /dev/null
@@ -1,327 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# This module doesn't present direct interest for OpenSSL, because it
-# doesn't provide better performance for longer keys. While 512-bit
-# RSA private key operations are 40% faster, 1024-bit ones are hardly
-# faster at all, while longer key operations are slower by up to 20%.
-# It might be of interest to embedded system developers though, as
-# it's smaller than 1KB, yet offers ~3x improvement over compiler
-# generated code.
-#
-# The module targets N32 and N64 MIPS ABIs and currently is a bit
-# IRIX-centric, i.e. is likely to require adaptation for other OSes.
-
-# int bn_mul_mont(
-$rp="a0"; # BN_ULONG *rp,
-$ap="a1"; # const BN_ULONG *ap,
-$bp="a2"; # const BN_ULONG *bp,
-$np="a3"; # const BN_ULONG *np,
-$n0="a4"; # const BN_ULONG *n0,
-$num="a5"; # int num);
-
-$lo0="a6";
-$hi0="a7";
-$lo1="v0";
-$hi1="v1";
-$aj="t0";
-$bi="t1";
-$nj="t2";
-$tp="t3";
-$alo="s0";
-$ahi="s1";
-$nlo="s2";
-$nhi="s3";
-$tj="s4";
-$i="s5";
-$j="s6";
-$fp="t8";
-$m1="t9";
-
-$FRAME=8*(2+8);
-
-$code=<<___;
-#include <asm.h>
-#include <regdef.h>
-
-.text
-
-.set noat
-.set reorder
-
-.align 5
-.globl bn_mul_mont
-.ent bn_mul_mont
-bn_mul_mont:
- .set noreorder
- PTR_SUB sp,64
- move $fp,sp
- .frame $fp,64,ra
- slt AT,$num,4
- li v0,0
- beqzl AT,.Lproceed
- nop
- jr ra
- PTR_ADD sp,$fp,64
- .set reorder
-.align 5
-.Lproceed:
- ld $n0,0($n0)
- ld $bi,0($bp) # bp[0]
- ld $aj,0($ap) # ap[0]
- ld $nj,0($np) # np[0]
- PTR_SUB sp,16 # place for two extra words
- sll $num,3
- li AT,-4096
- PTR_SUB sp,$num
- and sp,AT
-
- sd s0,0($fp)
- sd s1,8($fp)
- sd s2,16($fp)
- sd s3,24($fp)
- sd s4,32($fp)
- sd s5,40($fp)
- sd s6,48($fp)
- sd s7,56($fp)
-
- dmultu $aj,$bi
- ld $alo,8($ap)
- ld $nlo,8($np)
- mflo $lo0
- mfhi $hi0
- dmultu $lo0,$n0
- mflo $m1
-
- dmultu $alo,$bi
- mflo $alo
- mfhi $ahi
-
- dmultu $nj,$m1
- mflo $lo1
- mfhi $hi1
- dmultu $nlo,$m1
- daddu $lo1,$lo0
- sltu AT,$lo1,$lo0
- daddu $hi1,AT
- mflo $nlo
- mfhi $nhi
-
- move $tp,sp
- li $j,16
-.align 4
-.L1st:
- .set noreorder
- PTR_ADD $aj,$ap,$j
- ld $aj,($aj)
- PTR_ADD $nj,$np,$j
- ld $nj,($nj)
-
- dmultu $aj,$bi
- daddu $lo0,$alo,$hi0
- daddu $lo1,$nlo,$hi1
- sltu AT,$lo0,$hi0
- sltu s7,$lo1,$hi1
- daddu $hi0,$ahi,AT
- daddu $hi1,$nhi,s7
- mflo $alo
- mfhi $ahi
-
- daddu $lo1,$lo0
- sltu AT,$lo1,$lo0
- dmultu $nj,$m1
- daddu $hi1,AT
- addu $j,8
- sd $lo1,($tp)
- sltu s7,$j,$num
- mflo $nlo
- mfhi $nhi
-
- bnez s7,.L1st
- PTR_ADD $tp,8
- .set reorder
-
- daddu $lo0,$alo,$hi0
- sltu AT,$lo0,$hi0
- daddu $hi0,$ahi,AT
-
- daddu $lo1,$nlo,$hi1
- sltu s7,$lo1,$hi1
- daddu $hi1,$nhi,s7
- daddu $lo1,$lo0
- sltu AT,$lo1,$lo0
- daddu $hi1,AT
-
- sd $lo1,($tp)
-
- daddu $hi1,$hi0
- sltu AT,$hi1,$hi0
- sd $hi1,8($tp)
- sd AT,16($tp)
-
- li $i,8
-.align 4
-.Louter:
- PTR_ADD $bi,$bp,$i
- ld $bi,($bi)
- ld $aj,($ap)
- ld $alo,8($ap)
- ld $tj,(sp)
-
- dmultu $aj,$bi
- ld $nj,($np)
- ld $nlo,8($np)
- mflo $lo0
- mfhi $hi0
- daddu $lo0,$tj
- dmultu $lo0,$n0
- sltu AT,$lo0,$tj
- daddu $hi0,AT
- mflo $m1
-
- dmultu $alo,$bi
- mflo $alo
- mfhi $ahi
-
- dmultu $nj,$m1
- mflo $lo1
- mfhi $hi1
-
- dmultu $nlo,$m1
- daddu $lo1,$lo0
- sltu AT,$lo1,$lo0
- daddu $hi1,AT
- mflo $nlo
- mfhi $nhi
-
- move $tp,sp
- li $j,16
- ld $tj,8($tp)
-.align 4
-.Linner:
- .set noreorder
- PTR_ADD $aj,$ap,$j
- ld $aj,($aj)
- PTR_ADD $nj,$np,$j
- ld $nj,($nj)
-
- dmultu $aj,$bi
- daddu $lo0,$alo,$hi0
- daddu $lo1,$nlo,$hi1
- sltu AT,$lo0,$hi0
- sltu s7,$lo1,$hi1
- daddu $hi0,$ahi,AT
- daddu $hi1,$nhi,s7
- mflo $alo
- mfhi $ahi
-
- daddu $lo0,$tj
- addu $j,8
- dmultu $nj,$m1
- sltu AT,$lo0,$tj
- daddu $lo1,$lo0
- daddu $hi0,AT
- sltu s7,$lo1,$lo0
- ld $tj,16($tp)
- daddu $hi1,s7
- sltu AT,$j,$num
- mflo $nlo
- mfhi $nhi
- sd $lo1,($tp)
- bnez AT,.Linner
- PTR_ADD $tp,8
- .set reorder
-
- daddu $lo0,$alo,$hi0
- sltu AT,$lo0,$hi0
- daddu $hi0,$ahi,AT
- daddu $lo0,$tj
- sltu s7,$lo0,$tj
- daddu $hi0,s7
-
- ld $tj,16($tp)
- daddu $lo1,$nlo,$hi1
- sltu AT,$lo1,$hi1
- daddu $hi1,$nhi,AT
- daddu $lo1,$lo0
- sltu s7,$lo1,$lo0
- daddu $hi1,s7
- sd $lo1,($tp)
-
- daddu $lo1,$hi1,$hi0
- sltu $hi1,$lo1,$hi0
- daddu $lo1,$tj
- sltu AT,$lo1,$tj
- daddu $hi1,AT
- sd $lo1,8($tp)
- sd $hi1,16($tp)
-
- addu $i,8
- sltu s7,$i,$num
- bnez s7,.Louter
-
- .set noreorder
- PTR_ADD $tj,sp,$num # &tp[num]
- move $tp,sp
- move $ap,sp
- li $hi0,0 # clear borrow bit
-
-.align 4
-.Lsub: ld $lo0,($tp)
- ld $lo1,($np)
- PTR_ADD $tp,8
- PTR_ADD $np,8
- dsubu $lo1,$lo0,$lo1 # tp[i]-np[i]
- sgtu AT,$lo1,$lo0
- dsubu $lo0,$lo1,$hi0
- sgtu $hi0,$lo0,$lo1
- sd $lo0,($rp)
- or $hi0,AT
- sltu AT,$tp,$tj
- bnez AT,.Lsub
- PTR_ADD $rp,8
-
- dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit
- move $tp,sp
- PTR_SUB $rp,$num # restore rp
- not $hi1,$hi0
-
- and $ap,$hi0,sp
- and $bp,$hi1,$rp
- or $ap,$ap,$bp # ap=borrow?tp:rp
-
-.align 4
-.Lcopy: ld $aj,($ap)
- PTR_ADD $ap,8
- PTR_ADD $tp,8
- sd zero,-8($tp)
- sltu AT,$tp,$tj
- sd $aj,($rp)
- bnez AT,.Lcopy
- PTR_ADD $rp,8
-
- ld s0,0($fp)
- ld s1,8($fp)
- ld s2,16($fp)
- ld s3,24($fp)
- ld s4,32($fp)
- ld s5,40($fp)
- ld s6,48($fp)
- ld s7,56($fp)
- li v0,1
- jr ra
- PTR_ADD sp,$fp,64
- .set reorder
-END(bn_mul_mont)
-.rdata
-.asciiz "Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>"
-___
-
-print $code;
-close STDOUT;
diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl
deleted file mode 100755
index 7849eae95922..000000000000
--- a/crypto/bn/asm/ppc-mont.pl
+++ /dev/null
@@ -1,323 +0,0 @@
-#!/usr/bin/env perl
-
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# April 2006
-
-# "Teaser" Montgomery multiplication module for PowerPC. It's possible
-# to gain a bit more by modulo-scheduling outer loop, then dedicated
-# squaring procedure should give further 20% and code can be adapted
-# for 32-bit application running on 64-bit CPU. As for the latter.
-# It won't be able to achieve "native" 64-bit performance, because in
-# 32-bit application context every addc instruction will have to be
-# expanded as addc, twice right shift by 32 and finally adde, etc.
-# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
-# for 64-bit application running on PPC970/G5 is:
-#
-# 512-bit +65%
-# 1024-bit +35%
-# 2048-bit +18%
-# 4096-bit +4%
-
-$flavour = shift;
-
-if ($flavour =~ /32/) {
- $BITS= 32;
- $BNSZ= $BITS/8;
- $SIZE_T=4;
- $RZONE= 224;
- $FRAME= $SIZE_T*16;
-
- $LD= "lwz"; # load
- $LDU= "lwzu"; # load and update
- $LDX= "lwzx"; # load indexed
- $ST= "stw"; # store
- $STU= "stwu"; # store and update
- $STX= "stwx"; # store indexed
- $STUX= "stwux"; # store indexed and update
- $UMULL= "mullw"; # unsigned multiply low
- $UMULH= "mulhwu"; # unsigned multiply high
- $UCMP= "cmplw"; # unsigned compare
- $SHRI= "srwi"; # unsigned shift right by immediate
- $PUSH= $ST;
- $POP= $LD;
-} elsif ($flavour =~ /64/) {
- $BITS= 64;
- $BNSZ= $BITS/8;
- $SIZE_T=8;
- $RZONE= 288;
- $FRAME= $SIZE_T*16;
-
- # same as above, but 64-bit mnemonics...
- $LD= "ld"; # load
- $LDU= "ldu"; # load and update
- $LDX= "ldx"; # load indexed
- $ST= "std"; # store
- $STU= "stdu"; # store and update
- $STX= "stdx"; # store indexed
- $STUX= "stdux"; # store indexed and update
- $UMULL= "mulld"; # unsigned multiply low
- $UMULH= "mulhdu"; # unsigned multiply high
- $UCMP= "cmpld"; # unsigned compare
- $SHRI= "srdi"; # unsigned shift right by immediate
- $PUSH= $ST;
- $POP= $LD;
-} else { die "nonsense $flavour"; }
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
-die "can't locate ppc-xlate.pl";
-
-open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
-
-$sp="r1";
-$toc="r2";
-$rp="r3"; $ovf="r3";
-$ap="r4";
-$bp="r5";
-$np="r6";
-$n0="r7";
-$num="r8";
-$rp="r9"; # $rp is reassigned
-$aj="r10";
-$nj="r11";
-$tj="r12";
-# non-volatile registers
-$i="r14";
-$j="r15";
-$tp="r16";
-$m0="r17";
-$m1="r18";
-$lo0="r19";
-$hi0="r20";
-$lo1="r21";
-$hi1="r22";
-$alo="r23";
-$ahi="r24";
-$nlo="r25";
-#
-$nhi="r0";
-
-$code=<<___;
-.machine "any"
-.text
-
-.globl .bn_mul_mont
-.align 4
-.bn_mul_mont:
- cmpwi $num,4
- mr $rp,r3 ; $rp is reassigned
- li r3,0
- bltlr
-
- slwi $num,$num,`log($BNSZ)/log(2)`
- li $tj,-4096
- addi $ovf,$num,`$FRAME+$RZONE`
- subf $ovf,$ovf,$sp ; $sp-$ovf
- and $ovf,$ovf,$tj ; minimize TLB usage
- subf $ovf,$sp,$ovf ; $ovf-$sp
- srwi $num,$num,`log($BNSZ)/log(2)`
- $STUX $sp,$sp,$ovf
-
- $PUSH r14,`4*$SIZE_T`($sp)
- $PUSH r15,`5*$SIZE_T`($sp)
- $PUSH r16,`6*$SIZE_T`($sp)
- $PUSH r17,`7*$SIZE_T`($sp)
- $PUSH r18,`8*$SIZE_T`($sp)
- $PUSH r19,`9*$SIZE_T`($sp)
- $PUSH r20,`10*$SIZE_T`($sp)
- $PUSH r21,`11*$SIZE_T`($sp)
- $PUSH r22,`12*$SIZE_T`($sp)
- $PUSH r23,`13*$SIZE_T`($sp)
- $PUSH r24,`14*$SIZE_T`($sp)
- $PUSH r25,`15*$SIZE_T`($sp)
-
- $LD $n0,0($n0) ; pull n0[0] value
- addi $num,$num,-2 ; adjust $num for counter register
-
- $LD $m0,0($bp) ; m0=bp[0]
- $LD $aj,0($ap) ; ap[0]
- addi $tp,$sp,$FRAME
- $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
- $UMULH $hi0,$aj,$m0
-
- $LD $aj,$BNSZ($ap) ; ap[1]
- $LD $nj,0($np) ; np[0]
-
- $UMULL $m1,$lo0,$n0 ; "tp[0]"*n0
-
- $UMULL $alo,$aj,$m0 ; ap[1]*bp[0]
- $UMULH $ahi,$aj,$m0
-
- $UMULL $lo1,$nj,$m1 ; np[0]*m1
- $UMULH $hi1,$nj,$m1
- $LD $nj,$BNSZ($np) ; np[1]
- addc $lo1,$lo1,$lo0
- addze $hi1,$hi1
-
- $UMULL $nlo,$nj,$m1 ; np[1]*m1
- $UMULH $nhi,$nj,$m1
-
- mtctr $num
- li $j,`2*$BNSZ`
-.align 4
-L1st:
- $LDX $aj,$ap,$j ; ap[j]
- addc $lo0,$alo,$hi0
- $LDX $nj,$np,$j ; np[j]
- addze $hi0,$ahi
- $UMULL $alo,$aj,$m0 ; ap[j]*bp[0]
- addc $lo1,$nlo,$hi1
- $UMULH $ahi,$aj,$m0
- addze $hi1,$nhi
- $UMULL $nlo,$nj,$m1 ; np[j]*m1
- addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
- $UMULH $nhi,$nj,$m1
- addze $hi1,$hi1
- $ST $lo1,0($tp) ; tp[j-1]
-
- addi $j,$j,$BNSZ ; j++
- addi $tp,$tp,$BNSZ ; tp++
- bdnz- L1st
-;L1st
- addc $lo0,$alo,$hi0
- addze $hi0,$ahi
-
- addc $lo1,$nlo,$hi1
- addze $hi1,$nhi
- addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
- addze $hi1,$hi1
- $ST $lo1,0($tp) ; tp[j-1]
-
- li $ovf,0
- addc $hi1,$hi1,$hi0
- addze $ovf,$ovf ; upmost overflow bit
- $ST $hi1,$BNSZ($tp)
-
- li $i,$BNSZ
-.align 4
-Louter:
- $LDX $m0,$bp,$i ; m0=bp[i]
- $LD $aj,0($ap) ; ap[0]
- addi $tp,$sp,$FRAME
- $LD $tj,$FRAME($sp) ; tp[0]
- $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
- $UMULH $hi0,$aj,$m0
- $LD $aj,$BNSZ($ap) ; ap[1]
- $LD $nj,0($np) ; np[0]
- addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0]
- $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
- addze $hi0,$hi0
- $UMULL $m1,$lo0,$n0 ; tp[0]*n0
- $UMULH $ahi,$aj,$m0
- $UMULL $lo1,$nj,$m1 ; np[0]*m1
- $UMULH $hi1,$nj,$m1
- $LD $nj,$BNSZ($np) ; np[1]
- addc $lo1,$lo1,$lo0
- $UMULL $nlo,$nj,$m1 ; np[1]*m1
- addze $hi1,$hi1
- $UMULH $nhi,$nj,$m1
-
- mtctr $num
- li $j,`2*$BNSZ`
-.align 4
-Linner:
- $LDX $aj,$ap,$j ; ap[j]
- addc $lo0,$alo,$hi0
- $LD $tj,$BNSZ($tp) ; tp[j]
- addze $hi0,$ahi
- $LDX $nj,$np,$j ; np[j]
- addc $lo1,$nlo,$hi1
- $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
- addze $hi1,$nhi
- $UMULH $ahi,$aj,$m0
- addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
- $UMULL $nlo,$nj,$m1 ; np[j]*m1
- addze $hi0,$hi0
- $UMULH $nhi,$nj,$m1
- addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
- addi $j,$j,$BNSZ ; j++
- addze $hi1,$hi1
- $ST $lo1,0($tp) ; tp[j-1]
- addi $tp,$tp,$BNSZ ; tp++
- bdnz- Linner
-;Linner
- $LD $tj,$BNSZ($tp) ; tp[j]
- addc $lo0,$alo,$hi0
- addze $hi0,$ahi
- addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
- addze $hi0,$hi0
-
- addc $lo1,$nlo,$hi1
- addze $hi1,$nhi
- addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
- addze $hi1,$hi1
- $ST $lo1,0($tp) ; tp[j-1]
-
- addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA]
- li $ovf,0
- adde $hi1,$hi1,$hi0
- addze $ovf,$ovf
- $ST $hi1,$BNSZ($tp)
-;
- slwi $tj,$num,`log($BNSZ)/log(2)`
- $UCMP $i,$tj
- addi $i,$i,$BNSZ
- ble- Louter
-
- addi $num,$num,2 ; restore $num
- subfc $j,$j,$j ; j=0 and "clear" XER[CA]
- addi $tp,$sp,$FRAME
- mtctr $num
-
-.align 4
-Lsub: $LDX $tj,$tp,$j
- $LDX $nj,$np,$j
- subfe $aj,$nj,$tj ; tp[j]-np[j]
- $STX $aj,$rp,$j
- addi $j,$j,$BNSZ
- bdnz- Lsub
-
- li $j,0
- mtctr $num
- subfe $ovf,$j,$ovf ; handle upmost overflow bit
- and $ap,$tp,$ovf
- andc $np,$rp,$ovf
- or $ap,$ap,$np ; ap=borrow?tp:rp
-
-.align 4
-Lcopy: ; copy or in-place refresh
- $LDX $tj,$ap,$j
- $STX $tj,$rp,$j
- $STX $j,$tp,$j ; zap at once
- addi $j,$j,$BNSZ
- bdnz- Lcopy
-
- $POP r14,`4*$SIZE_T`($sp)
- $POP r15,`5*$SIZE_T`($sp)
- $POP r16,`6*$SIZE_T`($sp)
- $POP r17,`7*$SIZE_T`($sp)
- $POP r18,`8*$SIZE_T`($sp)
- $POP r19,`9*$SIZE_T`($sp)
- $POP r20,`10*$SIZE_T`($sp)
- $POP r21,`11*$SIZE_T`($sp)
- $POP r22,`12*$SIZE_T`($sp)
- $POP r23,`13*$SIZE_T`($sp)
- $POP r24,`14*$SIZE_T`($sp)
- $POP r25,`15*$SIZE_T`($sp)
- $POP $sp,0($sp)
- li r3,1
- blr
- .long 0
-.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
-___
-
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-print $code;
-close STDOUT;
diff --git a/crypto/bn/asm/ppc64-mont.pl b/crypto/bn/asm/ppc64-mont.pl
deleted file mode 100755
index 3449b35855da..000000000000
--- a/crypto/bn/asm/ppc64-mont.pl
+++ /dev/null
@@ -1,918 +0,0 @@
-#!/usr/bin/env perl
-
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# December 2007
-
-# The reason for undertaken effort is basically following. Even though
-# Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI
-# performance was observed to be less than impressive, essentially as
-# fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
-# Well, it's not surprising that IBM had to make some sacrifices to
-# boost the clock frequency that much, but no overall improvement?
-# Having observed how much difference did switching to FPU make on
-# UltraSPARC, playing same stunt on Power 6 appeared appropriate...
-# Unfortunately the resulting performance improvement is not as
-# impressive, ~30%, and in absolute terms is still very far from what
-# one would expect from 4.7GHz CPU. There is a chance that I'm doing
-# something wrong, but in the lack of assembler level micro-profiling
-# data or at least decent platform guide I can't tell... Or better
-# results might be achieved with VMX... Anyway, this module provides
-# *worse* performance on other PowerPC implementations, ~40-15% slower
-# on PPC970 depending on key length and ~40% slower on Power 5 for all
-# key lengths. As it's obviously inappropriate as "best all-round"
-# alternative, it has to be complemented with run-time CPU family
-# detection. Oh! It should also be noted that unlike other PowerPC
-# implementation IALU ppc-mont.pl module performs *suboptimaly* on
-# >=1024-bit key lengths on Power 6. It should also be noted that
-# *everything* said so far applies to 64-bit builds! As far as 32-bit
-# application executed on 64-bit CPU goes, this module is likely to
-# become preferred choice, because it's easy to adapt it for such
-# case and *is* faster than 32-bit ppc-mont.pl on *all* processors.
-
-# February 2008
-
-# Micro-profiling assisted optimization results in ~15% improvement
-# over original ppc64-mont.pl version, or overall ~50% improvement
-# over ppc.pl module on Power 6. If compared to ppc-mont.pl on same
-# Power 6 CPU, this module is 5-150% faster depending on key length,
-# [hereafter] more for longer keys. But if compared to ppc-mont.pl
-# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
-# in absolute terms, but it's apparently the way Power 6 is...
-
-$flavour = shift;
-
-if ($flavour =~ /32/) {
- $SIZE_T=4;
- $RZONE= 224;
- $FRAME= $SIZE_T*12+8*12;
- $fname= "bn_mul_mont_ppc64";
-
- $STUX= "stwux"; # store indexed and update
- $PUSH= "stw";
- $POP= "lwz";
- die "not implemented yet";
-} elsif ($flavour =~ /64/) {
- $SIZE_T=8;
- $RZONE= 288;
- $FRAME= $SIZE_T*12+8*12;
- $fname= "bn_mul_mont";
-
- # same as above, but 64-bit mnemonics...
- $STUX= "stdux"; # store indexed and update
- $PUSH= "std";
- $POP= "ld";
-} else { die "nonsense $flavour"; }
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
-die "can't locate ppc-xlate.pl";
-
-open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
-
-$FRAME=($FRAME+63)&~63;
-$TRANSFER=16*8;
-
-$carry="r0";
-$sp="r1";
-$toc="r2";
-$rp="r3"; $ovf="r3";
-$ap="r4";
-$bp="r5";
-$np="r6";
-$n0="r7";
-$num="r8";
-$rp="r9"; # $rp is reassigned
-$tp="r10";
-$j="r11";
-$i="r12";
-# non-volatile registers
-$nap_d="r14"; # interleaved ap and np in double format
-$a0="r15"; # ap[0]
-$t0="r16"; # temporary registers
-$t1="r17";
-$t2="r18";
-$t3="r19";
-$t4="r20";
-$t5="r21";
-$t6="r22";
-$t7="r23";
-
-# PPC offers enough register bank capacity to unroll inner loops twice
-#
-# ..A3A2A1A0
-# dcba
-# -----------
-# A0a
-# A0b
-# A0c
-# A0d
-# A1a
-# A1b
-# A1c
-# A1d
-# A2a
-# A2b
-# A2c
-# A2d
-# A3a
-# A3b
-# A3c
-# A3d
-# ..a
-# ..b
-#
-$ba="f0"; $bb="f1"; $bc="f2"; $bd="f3";
-$na="f4"; $nb="f5"; $nc="f6"; $nd="f7";
-$dota="f8"; $dotb="f9";
-$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13";
-$N0="f14"; $N1="f15"; $N2="f16"; $N3="f17";
-$T0a="f18"; $T0b="f19";
-$T1a="f20"; $T1b="f21";
-$T2a="f22"; $T2b="f23";
-$T3a="f24"; $T3b="f25";
-
-# sp----------->+-------------------------------+
-# | saved sp |
-# +-------------------------------+
-# | |
-# +-------------------------------+
-# | 10 saved gpr, r14-r23 |
-# . .
-# . .
-# +12*size_t +-------------------------------+
-# | 12 saved fpr, f14-f25 |
-# . .
-# . .
-# +12*8 +-------------------------------+
-# | padding to 64 byte boundary |
-# . .
-# +X +-------------------------------+
-# | 16 gpr<->fpr transfer zone |
-# . .
-# . .
-# +16*8 +-------------------------------+
-# | __int64 tmp[-1] |
-# +-------------------------------+
-# | __int64 tmp[num] |
-# . .
-# . .
-# . .
-# +(num+1)*8 +-------------------------------+
-# | padding to 64 byte boundary |
-# . .
-# +X +-------------------------------+
-# | double nap_d[4*num] |
-# . .
-# . .
-# . .
-# +-------------------------------+
-
-$code=<<___;
-.machine "any"
-.text
-
-.globl .$fname
-.align 5
-.$fname:
- cmpwi $num,4
- mr $rp,r3 ; $rp is reassigned
- li r3,0 ; possible "not handled" return code
- bltlr-
- andi. r0,$num,1 ; $num has to be even
- bnelr-
-
- slwi $num,$num,3 ; num*=8
- li $i,-4096
- slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
- add $tp,$tp,$num ; place for tp[num+1]
- addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
- subf $tp,$tp,$sp ; $sp-$tp
- and $tp,$tp,$i ; minimize TLB usage
- subf $tp,$sp,$tp ; $tp-$sp
- $STUX $sp,$sp,$tp ; alloca
-
- $PUSH r14,`2*$SIZE_T`($sp)
- $PUSH r15,`3*$SIZE_T`($sp)
- $PUSH r16,`4*$SIZE_T`($sp)
- $PUSH r17,`5*$SIZE_T`($sp)
- $PUSH r18,`6*$SIZE_T`($sp)
- $PUSH r19,`7*$SIZE_T`($sp)
- $PUSH r20,`8*$SIZE_T`($sp)
- $PUSH r21,`9*$SIZE_T`($sp)
- $PUSH r22,`10*$SIZE_T`($sp)
- $PUSH r23,`11*$SIZE_T`($sp)
- stfd f14,`12*$SIZE_T+0`($sp)
- stfd f15,`12*$SIZE_T+8`($sp)
- stfd f16,`12*$SIZE_T+16`($sp)
- stfd f17,`12*$SIZE_T+24`($sp)
- stfd f18,`12*$SIZE_T+32`($sp)
- stfd f19,`12*$SIZE_T+40`($sp)
- stfd f20,`12*$SIZE_T+48`($sp)
- stfd f21,`12*$SIZE_T+56`($sp)
- stfd f22,`12*$SIZE_T+64`($sp)
- stfd f23,`12*$SIZE_T+72`($sp)
- stfd f24,`12*$SIZE_T+80`($sp)
- stfd f25,`12*$SIZE_T+88`($sp)
-
- ld $a0,0($ap) ; pull ap[0] value
- ld $n0,0($n0) ; pull n0[0] value
- ld $t3,0($bp) ; bp[0]
-
- addi $tp,$sp,`$FRAME+$TRANSFER+8+64`
- li $i,-64
- add $nap_d,$tp,$num
- and $nap_d,$nap_d,$i ; align to 64 bytes
-
- mulld $t7,$a0,$t3 ; ap[0]*bp[0]
- ; nap_d is off by 1, because it's used with stfdu/lfdu
- addi $nap_d,$nap_d,-8
- srwi $j,$num,`3+1` ; counter register, num/2
- mulld $t7,$t7,$n0 ; tp[0]*n0
- addi $j,$j,-1
- addi $tp,$sp,`$FRAME+$TRANSFER-8`
- li $carry,0
- mtctr $j
-
- ; transfer bp[0] to FPU as 4x16-bit values
- extrdi $t0,$t3,16,48
- extrdi $t1,$t3,16,32
- extrdi $t2,$t3,16,16
- extrdi $t3,$t3,16,0
- std $t0,`$FRAME+0`($sp)
- std $t1,`$FRAME+8`($sp)
- std $t2,`$FRAME+16`($sp)
- std $t3,`$FRAME+24`($sp)
- ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
- extrdi $t4,$t7,16,48
- extrdi $t5,$t7,16,32
- extrdi $t6,$t7,16,16
- extrdi $t7,$t7,16,0
- std $t4,`$FRAME+32`($sp)
- std $t5,`$FRAME+40`($sp)
- std $t6,`$FRAME+48`($sp)
- std $t7,`$FRAME+56`($sp)
- lwz $t0,4($ap) ; load a[j] as 32-bit word pair
- lwz $t1,0($ap)
- lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
- lwz $t3,8($ap)
- lwz $t4,4($np) ; load n[j] as 32-bit word pair
- lwz $t5,0($np)
- lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
- lwz $t7,8($np)
- lfd $ba,`$FRAME+0`($sp)
- lfd $bb,`$FRAME+8`($sp)
- lfd $bc,`$FRAME+16`($sp)
- lfd $bd,`$FRAME+24`($sp)
- lfd $na,`$FRAME+32`($sp)
- lfd $nb,`$FRAME+40`($sp)
- lfd $nc,`$FRAME+48`($sp)
- lfd $nd,`$FRAME+56`($sp)
- std $t0,`$FRAME+64`($sp)
- std $t1,`$FRAME+72`($sp)
- std $t2,`$FRAME+80`($sp)
- std $t3,`$FRAME+88`($sp)
- std $t4,`$FRAME+96`($sp)
- std $t5,`$FRAME+104`($sp)
- std $t6,`$FRAME+112`($sp)
- std $t7,`$FRAME+120`($sp)
- fcfid $ba,$ba
- fcfid $bb,$bb
- fcfid $bc,$bc
- fcfid $bd,$bd
- fcfid $na,$na
- fcfid $nb,$nb
- fcfid $nc,$nc
- fcfid $nd,$nd
-
- lfd $A0,`$FRAME+64`($sp)
- lfd $A1,`$FRAME+72`($sp)
- lfd $A2,`$FRAME+80`($sp)
- lfd $A3,`$FRAME+88`($sp)
- lfd $N0,`$FRAME+96`($sp)
- lfd $N1,`$FRAME+104`($sp)
- lfd $N2,`$FRAME+112`($sp)
- lfd $N3,`$FRAME+120`($sp)
- fcfid $A0,$A0
- fcfid $A1,$A1
- fcfid $A2,$A2
- fcfid $A3,$A3
- fcfid $N0,$N0
- fcfid $N1,$N1
- fcfid $N2,$N2
- fcfid $N3,$N3
- addi $ap,$ap,16
- addi $np,$np,16
-
- fmul $T1a,$A1,$ba
- fmul $T1b,$A1,$bb
- stfd $A0,8($nap_d) ; save a[j] in double format
- stfd $A1,16($nap_d)
- fmul $T2a,$A2,$ba
- fmul $T2b,$A2,$bb
- stfd $A2,24($nap_d) ; save a[j+1] in double format
- stfd $A3,32($nap_d)
- fmul $T3a,$A3,$ba
- fmul $T3b,$A3,$bb
- stfd $N0,40($nap_d) ; save n[j] in double format
- stfd $N1,48($nap_d)
- fmul $T0a,$A0,$ba
- fmul $T0b,$A0,$bb
- stfd $N2,56($nap_d) ; save n[j+1] in double format
- stfdu $N3,64($nap_d)
-
- fmadd $T1a,$A0,$bc,$T1a
- fmadd $T1b,$A0,$bd,$T1b
- fmadd $T2a,$A1,$bc,$T2a
- fmadd $T2b,$A1,$bd,$T2b
- fmadd $T3a,$A2,$bc,$T3a
- fmadd $T3b,$A2,$bd,$T3b
- fmul $dota,$A3,$bc
- fmul $dotb,$A3,$bd
-
- fmadd $T1a,$N1,$na,$T1a
- fmadd $T1b,$N1,$nb,$T1b
- fmadd $T2a,$N2,$na,$T2a
- fmadd $T2b,$N2,$nb,$T2b
- fmadd $T3a,$N3,$na,$T3a
- fmadd $T3b,$N3,$nb,$T3b
- fmadd $T0a,$N0,$na,$T0a
- fmadd $T0b,$N0,$nb,$T0b
-
- fmadd $T1a,$N0,$nc,$T1a
- fmadd $T1b,$N0,$nd,$T1b
- fmadd $T2a,$N1,$nc,$T2a
- fmadd $T2b,$N1,$nd,$T2b
- fmadd $T3a,$N2,$nc,$T3a
- fmadd $T3b,$N2,$nd,$T3b
- fmadd $dota,$N3,$nc,$dota
- fmadd $dotb,$N3,$nd,$dotb
-
- fctid $T0a,$T0a
- fctid $T0b,$T0b
- fctid $T1a,$T1a
- fctid $T1b,$T1b
- fctid $T2a,$T2a
- fctid $T2b,$T2b
- fctid $T3a,$T3a
- fctid $T3b,$T3b
-
- stfd $T0a,`$FRAME+0`($sp)
- stfd $T0b,`$FRAME+8`($sp)
- stfd $T1a,`$FRAME+16`($sp)
- stfd $T1b,`$FRAME+24`($sp)
- stfd $T2a,`$FRAME+32`($sp)
- stfd $T2b,`$FRAME+40`($sp)
- stfd $T3a,`$FRAME+48`($sp)
- stfd $T3b,`$FRAME+56`($sp)
-
-.align 5
-L1st:
- lwz $t0,4($ap) ; load a[j] as 32-bit word pair
- lwz $t1,0($ap)
- lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
- lwz $t3,8($ap)
- lwz $t4,4($np) ; load n[j] as 32-bit word pair
- lwz $t5,0($np)
- lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
- lwz $t7,8($np)
- std $t0,`$FRAME+64`($sp)
- std $t1,`$FRAME+72`($sp)
- std $t2,`$FRAME+80`($sp)
- std $t3,`$FRAME+88`($sp)
- std $t4,`$FRAME+96`($sp)
- std $t5,`$FRAME+104`($sp)
- std $t6,`$FRAME+112`($sp)
- std $t7,`$FRAME+120`($sp)
- ld $t0,`$FRAME+0`($sp)
- ld $t1,`$FRAME+8`($sp)
- ld $t2,`$FRAME+16`($sp)
- ld $t3,`$FRAME+24`($sp)
- ld $t4,`$FRAME+32`($sp)
- ld $t5,`$FRAME+40`($sp)
- ld $t6,`$FRAME+48`($sp)
- ld $t7,`$FRAME+56`($sp)
- lfd $A0,`$FRAME+64`($sp)
- lfd $A1,`$FRAME+72`($sp)
- lfd $A2,`$FRAME+80`($sp)
- lfd $A3,`$FRAME+88`($sp)
- lfd $N0,`$FRAME+96`($sp)
- lfd $N1,`$FRAME+104`($sp)
- lfd $N2,`$FRAME+112`($sp)
- lfd $N3,`$FRAME+120`($sp)
- fcfid $A0,$A0
- fcfid $A1,$A1
- fcfid $A2,$A2
- fcfid $A3,$A3
- fcfid $N0,$N0
- fcfid $N1,$N1
- fcfid $N2,$N2
- fcfid $N3,$N3
- addi $ap,$ap,16
- addi $np,$np,16
-
- fmul $T1a,$A1,$ba
- fmul $T1b,$A1,$bb
- fmul $T2a,$A2,$ba
- fmul $T2b,$A2,$bb
- stfd $A0,8($nap_d) ; save a[j] in double format
- stfd $A1,16($nap_d)
- fmul $T3a,$A3,$ba
- fmul $T3b,$A3,$bb
- fmadd $T0a,$A0,$ba,$dota
- fmadd $T0b,$A0,$bb,$dotb
- stfd $A2,24($nap_d) ; save a[j+1] in double format
- stfd $A3,32($nap_d)
-
- fmadd $T1a,$A0,$bc,$T1a
- fmadd $T1b,$A0,$bd,$T1b
- fmadd $T2a,$A1,$bc,$T2a
- fmadd $T2b,$A1,$bd,$T2b
- stfd $N0,40($nap_d) ; save n[j] in double format
- stfd $N1,48($nap_d)
- fmadd $T3a,$A2,$bc,$T3a
- fmadd $T3b,$A2,$bd,$T3b
- add $t0,$t0,$carry ; can not overflow
- fmul $dota,$A3,$bc
- fmul $dotb,$A3,$bd
- stfd $N2,56($nap_d) ; save n[j+1] in double format
- stfdu $N3,64($nap_d)
- srdi $carry,$t0,16
- add $t1,$t1,$carry
- srdi $carry,$t1,16
-
- fmadd $T1a,$N1,$na,$T1a
- fmadd $T1b,$N1,$nb,$T1b
- insrdi $t0,$t1,16,32
- fmadd $T2a,$N2,$na,$T2a
- fmadd $T2b,$N2,$nb,$T2b
- add $t2,$t2,$carry
- fmadd $T3a,$N3,$na,$T3a
- fmadd $T3b,$N3,$nb,$T3b
- srdi $carry,$t2,16
- fmadd $T0a,$N0,$na,$T0a
- fmadd $T0b,$N0,$nb,$T0b
- insrdi $t0,$t2,16,16
- add $t3,$t3,$carry
- srdi $carry,$t3,16
-
- fmadd $T1a,$N0,$nc,$T1a
- fmadd $T1b,$N0,$nd,$T1b
- insrdi $t0,$t3,16,0 ; 0..63 bits
- fmadd $T2a,$N1,$nc,$T2a
- fmadd $T2b,$N1,$nd,$T2b
- add $t4,$t4,$carry
- fmadd $T3a,$N2,$nc,$T3a
- fmadd $T3b,$N2,$nd,$T3b
- srdi $carry,$t4,16
- fmadd $dota,$N3,$nc,$dota
- fmadd $dotb,$N3,$nd,$dotb
- add $t5,$t5,$carry
- srdi $carry,$t5,16
- insrdi $t4,$t5,16,32
-
- fctid $T0a,$T0a
- fctid $T0b,$T0b
- add $t6,$t6,$carry
- fctid $T1a,$T1a
- fctid $T1b,$T1b
- srdi $carry,$t6,16
- fctid $T2a,$T2a
- fctid $T2b,$T2b
- insrdi $t4,$t6,16,16
- fctid $T3a,$T3a
- fctid $T3b,$T3b
- add $t7,$t7,$carry
- insrdi $t4,$t7,16,0 ; 64..127 bits
- srdi $carry,$t7,16 ; upper 33 bits
-
- stfd $T0a,`$FRAME+0`($sp)
- stfd $T0b,`$FRAME+8`($sp)
- stfd $T1a,`$FRAME+16`($sp)
- stfd $T1b,`$FRAME+24`($sp)
- stfd $T2a,`$FRAME+32`($sp)
- stfd $T2b,`$FRAME+40`($sp)
- stfd $T3a,`$FRAME+48`($sp)
- stfd $T3b,`$FRAME+56`($sp)
- std $t0,8($tp) ; tp[j-1]
- stdu $t4,16($tp) ; tp[j]
- bdnz- L1st
-
- fctid $dota,$dota
- fctid $dotb,$dotb
-
- ld $t0,`$FRAME+0`($sp)
- ld $t1,`$FRAME+8`($sp)
- ld $t2,`$FRAME+16`($sp)
- ld $t3,`$FRAME+24`($sp)
- ld $t4,`$FRAME+32`($sp)
- ld $t5,`$FRAME+40`($sp)
- ld $t6,`$FRAME+48`($sp)
- ld $t7,`$FRAME+56`($sp)
- stfd $dota,`$FRAME+64`($sp)
- stfd $dotb,`$FRAME+72`($sp)
-
- add $t0,$t0,$carry ; can not overflow
- srdi $carry,$t0,16
- add $t1,$t1,$carry
- srdi $carry,$t1,16
- insrdi $t0,$t1,16,32
- add $t2,$t2,$carry
- srdi $carry,$t2,16
- insrdi $t0,$t2,16,16
- add $t3,$t3,$carry
- srdi $carry,$t3,16
- insrdi $t0,$t3,16,0 ; 0..63 bits
- add $t4,$t4,$carry
- srdi $carry,$t4,16
- add $t5,$t5,$carry
- srdi $carry,$t5,16
- insrdi $t4,$t5,16,32
- add $t6,$t6,$carry
- srdi $carry,$t6,16
- insrdi $t4,$t6,16,16
- add $t7,$t7,$carry
- insrdi $t4,$t7,16,0 ; 64..127 bits
- srdi $carry,$t7,16 ; upper 33 bits
- ld $t6,`$FRAME+64`($sp)
- ld $t7,`$FRAME+72`($sp)
-
- std $t0,8($tp) ; tp[j-1]
- stdu $t4,16($tp) ; tp[j]
-
- add $t6,$t6,$carry ; can not overflow
- srdi $carry,$t6,16
- add $t7,$t7,$carry
- insrdi $t6,$t7,48,0
- srdi $ovf,$t7,48
- std $t6,8($tp) ; tp[num-1]
-
- slwi $t7,$num,2
- subf $nap_d,$t7,$nap_d ; rewind pointer
-
- li $i,8 ; i=1
-.align 5
-Louter:
- ldx $t3,$bp,$i ; bp[i]
- ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
- mulld $t7,$a0,$t3 ; ap[0]*bp[i]
-
- addi $tp,$sp,`$FRAME+$TRANSFER`
- add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0]
- li $carry,0
- mulld $t7,$t7,$n0 ; tp[0]*n0
- mtctr $j
-
- ; transfer bp[i] to FPU as 4x16-bit values
- extrdi $t0,$t3,16,48
- extrdi $t1,$t3,16,32
- extrdi $t2,$t3,16,16
- extrdi $t3,$t3,16,0
- std $t0,`$FRAME+0`($sp)
- std $t1,`$FRAME+8`($sp)
- std $t2,`$FRAME+16`($sp)
- std $t3,`$FRAME+24`($sp)
- ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
- extrdi $t4,$t7,16,48
- extrdi $t5,$t7,16,32
- extrdi $t6,$t7,16,16
- extrdi $t7,$t7,16,0
- std $t4,`$FRAME+32`($sp)
- std $t5,`$FRAME+40`($sp)
- std $t6,`$FRAME+48`($sp)
- std $t7,`$FRAME+56`($sp)
-
- lfd $A0,8($nap_d) ; load a[j] in double format
- lfd $A1,16($nap_d)
- lfd $A2,24($nap_d) ; load a[j+1] in double format
- lfd $A3,32($nap_d)
- lfd $N0,40($nap_d) ; load n[j] in double format
- lfd $N1,48($nap_d)
- lfd $N2,56($nap_d) ; load n[j+1] in double format
- lfdu $N3,64($nap_d)
-
- lfd $ba,`$FRAME+0`($sp)
- lfd $bb,`$FRAME+8`($sp)
- lfd $bc,`$FRAME+16`($sp)
- lfd $bd,`$FRAME+24`($sp)
- lfd $na,`$FRAME+32`($sp)
- lfd $nb,`$FRAME+40`($sp)
- lfd $nc,`$FRAME+48`($sp)
- lfd $nd,`$FRAME+56`($sp)
-
- fcfid $ba,$ba
- fcfid $bb,$bb
- fcfid $bc,$bc
- fcfid $bd,$bd
- fcfid $na,$na
- fcfid $nb,$nb
- fcfid $nc,$nc
- fcfid $nd,$nd
-
- fmul $T1a,$A1,$ba
- fmul $T1b,$A1,$bb
- fmul $T2a,$A2,$ba
- fmul $T2b,$A2,$bb
- fmul $T3a,$A3,$ba
- fmul $T3b,$A3,$bb
- fmul $T0a,$A0,$ba
- fmul $T0b,$A0,$bb
-
- fmadd $T1a,$A0,$bc,$T1a
- fmadd $T1b,$A0,$bd,$T1b
- fmadd $T2a,$A1,$bc,$T2a
- fmadd $T2b,$A1,$bd,$T2b
- fmadd $T3a,$A2,$bc,$T3a
- fmadd $T3b,$A2,$bd,$T3b
- fmul $dota,$A3,$bc
- fmul $dotb,$A3,$bd
-
- fmadd $T1a,$N1,$na,$T1a
- fmadd $T1b,$N1,$nb,$T1b
- lfd $A0,8($nap_d) ; load a[j] in double format
- lfd $A1,16($nap_d)
- fmadd $T2a,$N2,$na,$T2a
- fmadd $T2b,$N2,$nb,$T2b
- lfd $A2,24($nap_d) ; load a[j+1] in double format
- lfd $A3,32($nap_d)
- fmadd $T3a,$N3,$na,$T3a
- fmadd $T3b,$N3,$nb,$T3b
- fmadd $T0a,$N0,$na,$T0a
- fmadd $T0b,$N0,$nb,$T0b
-
- fmadd $T1a,$N0,$nc,$T1a
- fmadd $T1b,$N0,$nd,$T1b
- fmadd $T2a,$N1,$nc,$T2a
- fmadd $T2b,$N1,$nd,$T2b
- fmadd $T3a,$N2,$nc,$T3a
- fmadd $T3b,$N2,$nd,$T3b
- fmadd $dota,$N3,$nc,$dota
- fmadd $dotb,$N3,$nd,$dotb
-
- fctid $T0a,$T0a
- fctid $T0b,$T0b
- fctid $T1a,$T1a
- fctid $T1b,$T1b
- fctid $T2a,$T2a
- fctid $T2b,$T2b
- fctid $T3a,$T3a
- fctid $T3b,$T3b
-
- stfd $T0a,`$FRAME+0`($sp)
- stfd $T0b,`$FRAME+8`($sp)
- stfd $T1a,`$FRAME+16`($sp)
- stfd $T1b,`$FRAME+24`($sp)
- stfd $T2a,`$FRAME+32`($sp)
- stfd $T2b,`$FRAME+40`($sp)
- stfd $T3a,`$FRAME+48`($sp)
- stfd $T3b,`$FRAME+56`($sp)
-
-.align 5
-Linner:
- fmul $T1a,$A1,$ba
- fmul $T1b,$A1,$bb
- fmul $T2a,$A2,$ba
- fmul $T2b,$A2,$bb
- lfd $N0,40($nap_d) ; load n[j] in double format
- lfd $N1,48($nap_d)
- fmul $T3a,$A3,$ba
- fmul $T3b,$A3,$bb
- fmadd $T0a,$A0,$ba,$dota
- fmadd $T0b,$A0,$bb,$dotb
- lfd $N2,56($nap_d) ; load n[j+1] in double format
- lfdu $N3,64($nap_d)
-
- fmadd $T1a,$A0,$bc,$T1a
- fmadd $T1b,$A0,$bd,$T1b
- fmadd $T2a,$A1,$bc,$T2a
- fmadd $T2b,$A1,$bd,$T2b
- lfd $A0,8($nap_d) ; load a[j] in double format
- lfd $A1,16($nap_d)
- fmadd $T3a,$A2,$bc,$T3a
- fmadd $T3b,$A2,$bd,$T3b
- fmul $dota,$A3,$bc
- fmul $dotb,$A3,$bd
- lfd $A2,24($nap_d) ; load a[j+1] in double format
- lfd $A3,32($nap_d)
-
- fmadd $T1a,$N1,$na,$T1a
- fmadd $T1b,$N1,$nb,$T1b
- ld $t0,`$FRAME+0`($sp)
- ld $t1,`$FRAME+8`($sp)
- fmadd $T2a,$N2,$na,$T2a
- fmadd $T2b,$N2,$nb,$T2b
- ld $t2,`$FRAME+16`($sp)
- ld $t3,`$FRAME+24`($sp)
- fmadd $T3a,$N3,$na,$T3a
- fmadd $T3b,$N3,$nb,$T3b
- add $t0,$t0,$carry ; can not overflow
- ld $t4,`$FRAME+32`($sp)
- ld $t5,`$FRAME+40`($sp)
- fmadd $T0a,$N0,$na,$T0a
- fmadd $T0b,$N0,$nb,$T0b
- srdi $carry,$t0,16
- add $t1,$t1,$carry
- srdi $carry,$t1,16
- ld $t6,`$FRAME+48`($sp)
- ld $t7,`$FRAME+56`($sp)
-
- fmadd $T1a,$N0,$nc,$T1a
- fmadd $T1b,$N0,$nd,$T1b
- insrdi $t0,$t1,16,32
- ld $t1,8($tp) ; tp[j]
- fmadd $T2a,$N1,$nc,$T2a
- fmadd $T2b,$N1,$nd,$T2b
- add $t2,$t2,$carry
- fmadd $T3a,$N2,$nc,$T3a
- fmadd $T3b,$N2,$nd,$T3b
- srdi $carry,$t2,16
- insrdi $t0,$t2,16,16
- fmadd $dota,$N3,$nc,$dota
- fmadd $dotb,$N3,$nd,$dotb
- add $t3,$t3,$carry
- ldu $t2,16($tp) ; tp[j+1]
- srdi $carry,$t3,16
- insrdi $t0,$t3,16,0 ; 0..63 bits
- add $t4,$t4,$carry
-
- fctid $T0a,$T0a
- fctid $T0b,$T0b
- srdi $carry,$t4,16
- fctid $T1a,$T1a
- fctid $T1b,$T1b
- add $t5,$t5,$carry
- fctid $T2a,$T2a
- fctid $T2b,$T2b
- srdi $carry,$t5,16
- insrdi $t4,$t5,16,32
- fctid $T3a,$T3a
- fctid $T3b,$T3b
- add $t6,$t6,$carry
- srdi $carry,$t6,16
- insrdi $t4,$t6,16,16
-
- stfd $T0a,`$FRAME+0`($sp)
- stfd $T0b,`$FRAME+8`($sp)
- add $t7,$t7,$carry
- addc $t3,$t0,$t1
- stfd $T1a,`$FRAME+16`($sp)
- stfd $T1b,`$FRAME+24`($sp)
- insrdi $t4,$t7,16,0 ; 64..127 bits
- srdi $carry,$t7,16 ; upper 33 bits
- stfd $T2a,`$FRAME+32`($sp)
- stfd $T2b,`$FRAME+40`($sp)
- adde $t5,$t4,$t2
- stfd $T3a,`$FRAME+48`($sp)
- stfd $T3b,`$FRAME+56`($sp)
- addze $carry,$carry
- std $t3,-16($tp) ; tp[j-1]
- std $t5,-8($tp) ; tp[j]
- bdnz- Linner
-
- fctid $dota,$dota
- fctid $dotb,$dotb
- ld $t0,`$FRAME+0`($sp)
- ld $t1,`$FRAME+8`($sp)
- ld $t2,`$FRAME+16`($sp)
- ld $t3,`$FRAME+24`($sp)
- ld $t4,`$FRAME+32`($sp)
- ld $t5,`$FRAME+40`($sp)
- ld $t6,`$FRAME+48`($sp)
- ld $t7,`$FRAME+56`($sp)
- stfd $dota,`$FRAME+64`($sp)
- stfd $dotb,`$FRAME+72`($sp)
-
- add $t0,$t0,$carry ; can not overflow
- srdi $carry,$t0,16
- add $t1,$t1,$carry
- srdi $carry,$t1,16
- insrdi $t0,$t1,16,32
- add $t2,$t2,$carry
- ld $t1,8($tp) ; tp[j]
- srdi $carry,$t2,16
- insrdi $t0,$t2,16,16
- add $t3,$t3,$carry
- ldu $t2,16($tp) ; tp[j+1]
- srdi $carry,$t3,16
- insrdi $t0,$t3,16,0 ; 0..63 bits
- add $t4,$t4,$carry
- srdi $carry,$t4,16
- add $t5,$t5,$carry
- srdi $carry,$t5,16
- insrdi $t4,$t5,16,32
- add $t6,$t6,$carry
- srdi $carry,$t6,16
- insrdi $t4,$t6,16,16
- add $t7,$t7,$carry
- insrdi $t4,$t7,16,0 ; 64..127 bits
- srdi $carry,$t7,16 ; upper 33 bits
- ld $t6,`$FRAME+64`($sp)
- ld $t7,`$FRAME+72`($sp)
-
- addc $t3,$t0,$t1
- adde $t5,$t4,$t2
- addze $carry,$carry
-
- std $t3,-16($tp) ; tp[j-1]
- std $t5,-8($tp) ; tp[j]
-
- add $carry,$carry,$ovf ; comsume upmost overflow
- add $t6,$t6,$carry ; can not overflow
- srdi $carry,$t6,16
- add $t7,$t7,$carry
- insrdi $t6,$t7,48,0
- srdi $ovf,$t7,48
- std $t6,0($tp) ; tp[num-1]
-
- slwi $t7,$num,2
- addi $i,$i,8
- subf $nap_d,$t7,$nap_d ; rewind pointer
- cmpw $i,$num
- blt- Louter
-
- subf $np,$num,$np ; rewind np
- addi $j,$j,1 ; restore counter
- subfc $i,$i,$i ; j=0 and "clear" XER[CA]
- addi $tp,$sp,`$FRAME+$TRANSFER+8`
- addi $t4,$sp,`$FRAME+$TRANSFER+16`
- addi $t5,$np,8
- addi $t6,$rp,8
- mtctr $j
-
-.align 4
-Lsub: ldx $t0,$tp,$i
- ldx $t1,$np,$i
- ldx $t2,$t4,$i
- ldx $t3,$t5,$i
- subfe $t0,$t1,$t0 ; tp[j]-np[j]
- subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1]
- stdx $t0,$rp,$i
- stdx $t2,$t6,$i
- addi $i,$i,16
- bdnz- Lsub
-
- li $i,0
- subfe $ovf,$i,$ovf ; handle upmost overflow bit
- and $ap,$tp,$ovf
- andc $np,$rp,$ovf
- or $ap,$ap,$np ; ap=borrow?tp:rp
- addi $t7,$ap,8
- mtctr $j
-
-.align 4
-Lcopy: ; copy or in-place refresh
- ldx $t0,$ap,$i
- ldx $t1,$t7,$i
- std $i,8($nap_d) ; zap nap_d
- std $i,16($nap_d)
- std $i,24($nap_d)
- std $i,32($nap_d)
- std $i,40($nap_d)
- std $i,48($nap_d)
- std $i,56($nap_d)
- stdu $i,64($nap_d)
- stdx $t0,$rp,$i
- stdx $t1,$t6,$i
- stdx $i,$tp,$i ; zap tp at once
- stdx $i,$t4,$i
- addi $i,$i,16
- bdnz- Lcopy
-
- $POP r14,`2*$SIZE_T`($sp)
- $POP r15,`3*$SIZE_T`($sp)
- $POP r16,`4*$SIZE_T`($sp)
- $POP r17,`5*$SIZE_T`($sp)
- $POP r18,`6*$SIZE_T`($sp)
- $POP r19,`7*$SIZE_T`($sp)
- $POP r20,`8*$SIZE_T`($sp)
- $POP r21,`9*$SIZE_T`($sp)
- $POP r22,`10*$SIZE_T`($sp)
- $POP r23,`11*$SIZE_T`($sp)
- lfd f14,`12*$SIZE_T+0`($sp)
- lfd f15,`12*$SIZE_T+8`($sp)
- lfd f16,`12*$SIZE_T+16`($sp)
- lfd f17,`12*$SIZE_T+24`($sp)
- lfd f18,`12*$SIZE_T+32`($sp)
- lfd f19,`12*$SIZE_T+40`($sp)
- lfd f20,`12*$SIZE_T+48`($sp)
- lfd f21,`12*$SIZE_T+56`($sp)
- lfd f22,`12*$SIZE_T+64`($sp)
- lfd f23,`12*$SIZE_T+72`($sp)
- lfd f24,`12*$SIZE_T+80`($sp)
- lfd f25,`12*$SIZE_T+88`($sp)
- $POP $sp,0($sp)
- li r3,1 ; signal "handled"
- blr
- .long 0
-.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
-___
-
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-print $code;
-close STDOUT;
diff --git a/crypto/bn/asm/s390x-mont.pl b/crypto/bn/asm/s390x-mont.pl
deleted file mode 100755
index d23251033b00..000000000000
--- a/crypto/bn/asm/s390x-mont.pl
+++ /dev/null
@@ -1,225 +0,0 @@
-#!/usr/bin/env perl
-
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# April 2007.
-#
-# Performance improvement over vanilla C code varies from 85% to 45%
-# depending on key length and benchmark. Unfortunately in this context
-# these are not very impressive results [for code that utilizes "wide"
-# 64x64=128-bit multiplication, which is not commonly available to C
-# programmers], at least hand-coded bn_asm.c replacement is known to
-# provide 30-40% better results for longest keys. Well, on a second
-# thought it's not very surprising, because z-CPUs are single-issue
-# and _strictly_ in-order execution, while bn_mul_mont is more or less
-# dependent on CPU ability to pipe-line instructions and have several
-# of them "in-flight" at the same time. I mean while other methods,
-# for example Karatsuba, aim to minimize amount of multiplications at
-# the cost of other operations increase, bn_mul_mont aim to neatly
-# "overlap" multiplications and the other operations [and on most
-# platforms even minimize the amount of the other operations, in
-# particular references to memory]. But it's possible to improve this
-# module performance by implementing dedicated squaring code-path and
-# possibly by unrolling loops...
-
-# January 2009.
-#
-# Reschedule to minimize/avoid Address Generation Interlock hazard,
-# make inner loops counter-based.
-
-$mn0="%r0";
-$num="%r1";
-
-# int bn_mul_mont(
-$rp="%r2"; # BN_ULONG *rp,
-$ap="%r3"; # const BN_ULONG *ap,
-$bp="%r4"; # const BN_ULONG *bp,
-$np="%r5"; # const BN_ULONG *np,
-$n0="%r6"; # const BN_ULONG *n0,
-#$num="160(%r15)" # int num);
-
-$bi="%r2"; # zaps rp
-$j="%r7";
-
-$ahi="%r8";
-$alo="%r9";
-$nhi="%r10";
-$nlo="%r11";
-$AHI="%r12";
-$NHI="%r13";
-$count="%r14";
-$sp="%r15";
-
-$code.=<<___;
-.text
-.globl bn_mul_mont
-.type bn_mul_mont,\@function
-bn_mul_mont:
- lgf $num,164($sp) # pull $num
- sla $num,3 # $num to enumerate bytes
- la $bp,0($num,$bp)
-
- stg %r2,16($sp)
-
- cghi $num,16 #
- lghi %r2,0 #
- blr %r14 # if($num<16) return 0;
- cghi $num,128 #
- bhr %r14 # if($num>128) return 0;
-
- stmg %r3,%r15,24($sp)
-
- lghi $rp,-160-8 # leave room for carry bit
- lcgr $j,$num # -$num
- lgr %r0,$sp
- la $rp,0($rp,$sp)
- la $sp,0($j,$rp) # alloca
- stg %r0,0($sp) # back chain
-
- sra $num,3 # restore $num
- la $bp,0($j,$bp) # restore $bp
- ahi $num,-1 # adjust $num for inner loop
- lg $n0,0($n0) # pull n0
-
- lg $bi,0($bp)
- lg $alo,0($ap)
- mlgr $ahi,$bi # ap[0]*bp[0]
- lgr $AHI,$ahi
-
- lgr $mn0,$alo # "tp[0]"*n0
- msgr $mn0,$n0
-
- lg $nlo,0($np) #
- mlgr $nhi,$mn0 # np[0]*m1
- algr $nlo,$alo # +="tp[0]"
- lghi $NHI,0
- alcgr $NHI,$nhi
-
- la $j,8(%r0) # j=1
- lr $count,$num
-
-.align 16
-.L1st:
- lg $alo,0($j,$ap)
- mlgr $ahi,$bi # ap[j]*bp[0]
- algr $alo,$AHI
- lghi $AHI,0
- alcgr $AHI,$ahi
-
- lg $nlo,0($j,$np)
- mlgr $nhi,$mn0 # np[j]*m1
- algr $nlo,$NHI
- lghi $NHI,0
- alcgr $nhi,$NHI # +="tp[j]"
- algr $nlo,$alo
- alcgr $NHI,$nhi
-
- stg $nlo,160-8($j,$sp) # tp[j-1]=
- la $j,8($j) # j++
- brct $count,.L1st
-
- algr $NHI,$AHI
- lghi $AHI,0
- alcgr $AHI,$AHI # upmost overflow bit
- stg $NHI,160-8($j,$sp)
- stg $AHI,160($j,$sp)
- la $bp,8($bp) # bp++
-
-.Louter:
- lg $bi,0($bp) # bp[i]
- lg $alo,0($ap)
- mlgr $ahi,$bi # ap[0]*bp[i]
- alg $alo,160($sp) # +=tp[0]
- lghi $AHI,0
- alcgr $AHI,$ahi
-
- lgr $mn0,$alo
- msgr $mn0,$n0 # tp[0]*n0
-
- lg $nlo,0($np) # np[0]
- mlgr $nhi,$mn0 # np[0]*m1
- algr $nlo,$alo # +="tp[0]"
- lghi $NHI,0
- alcgr $NHI,$nhi
-
- la $j,8(%r0) # j=1
- lr $count,$num
-
-.align 16
-.Linner:
- lg $alo,0($j,$ap)
- mlgr $ahi,$bi # ap[j]*bp[i]
- algr $alo,$AHI
- lghi $AHI,0
- alcgr $ahi,$AHI
- alg $alo,160($j,$sp)# +=tp[j]
- alcgr $AHI,$ahi
-
- lg $nlo,0($j,$np)
- mlgr $nhi,$mn0 # np[j]*m1
- algr $nlo,$NHI
- lghi $NHI,0
- alcgr $nhi,$NHI
- algr $nlo,$alo # +="tp[j]"
- alcgr $NHI,$nhi
-
- stg $nlo,160-8($j,$sp) # tp[j-1]=
- la $j,8($j) # j++
- brct $count,.Linner
-
- algr $NHI,$AHI
- lghi $AHI,0
- alcgr $AHI,$AHI
- alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit
- lghi $ahi,0
- alcgr $AHI,$ahi # new upmost overflow bit
- stg $NHI,160-8($j,$sp)
- stg $AHI,160($j,$sp)
-
- la $bp,8($bp) # bp++
- clg $bp,160+8+32($j,$sp) # compare to &bp[num]
- jne .Louter
-
- lg $rp,160+8+16($j,$sp) # reincarnate rp
- la $ap,160($sp)
- ahi $num,1 # restore $num, incidentally clears "borrow"
-
- la $j,0(%r0)
- lr $count,$num
-.Lsub: lg $alo,0($j,$ap)
- slbg $alo,0($j,$np)
- stg $alo,0($j,$rp)
- la $j,8($j)
- brct $count,.Lsub
- lghi $ahi,0
- slbgr $AHI,$ahi # handle upmost carry
-
- ngr $ap,$AHI
- lghi $np,-1
- xgr $np,$AHI
- ngr $np,$rp
- ogr $ap,$np # ap=borrow?tp:rp
-
- la $j,0(%r0)
- lgr $count,$num
-.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
- stg $j,160($j,$sp) # zap tp
- stg $alo,0($j,$rp)
- la $j,8($j)
- brct $count,.Lcopy
-
- la %r1,160+8+48($j,$sp)
- lmg %r6,%r15,0(%r1)
- lghi %r2,1 # signal "processed"
- br %r14
-.size bn_mul_mont,.-bn_mul_mont
-.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-___
-
-print $code;
-close STDOUT;
diff --git a/crypto/bn/asm/s390x.S b/crypto/bn/asm/s390x.S
deleted file mode 100755
index 8f45f5d513ce..000000000000
--- a/crypto/bn/asm/s390x.S
+++ /dev/null
@@ -1,678 +0,0 @@
-.ident "s390x.S, version 1.0"
-// ====================================================================
-// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-// project.
-//
-// Rights for redistribution and usage in source and binary forms are
-// granted according to the OpenSSL license. Warranty of any kind is
-// disclaimed.
-// ====================================================================
-
-.text
-
-#define zero %r0
-
-// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
-.globl bn_mul_add_words
-.type bn_mul_add_words,@function
-.align 4
-bn_mul_add_words:
- lghi zero,0 // zero = 0
- la %r1,0(%r2) // put rp aside
- lghi %r2,0 // i=0;
- ltgfr %r4,%r4
- bler %r14 // if (len<=0) return 0;
-
- stmg %r6,%r10,48(%r15)
- lghi %r8,0 // carry = 0
- srag %r10,%r4,2 // cnt=len/4
- jz .Loop1_madd
-
-.Loop4_madd:
- lg %r7,0(%r2,%r3) // ap[i]
- mlgr %r6,%r5 // *=w
- algr %r7,%r8 // +=carry
- alcgr %r6,zero
- alg %r7,0(%r2,%r1) // +=rp[i]
- alcgr %r6,zero
- stg %r7,0(%r2,%r1) // rp[i]=
-
- lg %r9,8(%r2,%r3)
- mlgr %r8,%r5
- algr %r9,%r6
- alcgr %r8,zero
- alg %r9,8(%r2,%r1)
- alcgr %r8,zero
- stg %r9,8(%r2,%r1)
-
- lg %r7,16(%r2,%r3)
- mlgr %r6,%r5
- algr %r7,%r8
- alcgr %r6,zero
- alg %r7,16(%r2,%r1)
- alcgr %r6,zero
- stg %r7,16(%r2,%r1)
-
- lg %r9,24(%r2,%r3)
- mlgr %r8,%r5
- algr %r9,%r6
- alcgr %r8,zero
- alg %r9,24(%r2,%r1)
- alcgr %r8,zero
- stg %r9,24(%r2,%r1)
-
- la %r2,32(%r2) // i+=4
- brct %r10,.Loop4_madd
-
- lghi %r10,3
- nr %r4,%r10 // cnt=len%4
- jz .Lend_madd
-
-.Loop1_madd:
- lg %r7,0(%r2,%r3) // ap[i]
- mlgr %r6,%r5 // *=w
- algr %r7,%r8 // +=carry
- alcgr %r6,zero
- alg %r7,0(%r2,%r1) // +=rp[i]
- alcgr %r6,zero
- stg %r7,0(%r2,%r1) // rp[i]=
-
- lgr %r8,%r6
- la %r2,8(%r2) // i++
- brct %r4,.Loop1_madd
-
-.Lend_madd:
- lgr %r2,%r8
- lmg %r6,%r10,48(%r15)
- br %r14
-.size bn_mul_add_words,.-bn_mul_add_words
-
-// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
-.globl bn_mul_words
-.type bn_mul_words,@function
-.align 4
-bn_mul_words:
- lghi zero,0 // zero = 0
- la %r1,0(%r2) // put rp aside
- lghi %r2,0 // i=0;
- ltgfr %r4,%r4
- bler %r14 // if (len<=0) return 0;
-
- stmg %r6,%r10,48(%r15)
- lghi %r8,0 // carry = 0
- srag %r10,%r4,2 // cnt=len/4
- jz .Loop1_mul
-
-.Loop4_mul:
- lg %r7,0(%r2,%r3) // ap[i]
- mlgr %r6,%r5 // *=w
- algr %r7,%r8 // +=carry
- alcgr %r6,zero
- stg %r7,0(%r2,%r1) // rp[i]=
-
- lg %r9,8(%r2,%r3)
- mlgr %r8,%r5
- algr %r9,%r6
- alcgr %r8,zero
- stg %r9,8(%r2,%r1)
-
- lg %r7,16(%r2,%r3)
- mlgr %r6,%r5
- algr %r7,%r8
- alcgr %r6,zero
- stg %r7,16(%r2,%r1)
-
- lg %r9,24(%r2,%r3)
- mlgr %r8,%r5
- algr %r9,%r6
- alcgr %r8,zero
- stg %r9,24(%r2,%r1)
-
- la %r2,32(%r2) // i+=4
- brct %r10,.Loop4_mul
-
- lghi %r10,3
- nr %r4,%r10 // cnt=len%4
- jz .Lend_mul
-
-.Loop1_mul:
- lg %r7,0(%r2,%r3) // ap[i]
- mlgr %r6,%r5 // *=w
- algr %r7,%r8 // +=carry
- alcgr %r6,zero
- stg %r7,0(%r2,%r1) // rp[i]=
-
- lgr %r8,%r6
- la %r2,8(%r2) // i++
- brct %r4,.Loop1_mul
-
-.Lend_mul:
- lgr %r2,%r8
- lmg %r6,%r10,48(%r15)
- br %r14
-.size bn_mul_words,.-bn_mul_words
-
-// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r2,int r4)
-.globl bn_sqr_words
-.type bn_sqr_words,@function
-.align 4
-bn_sqr_words:
- ltgfr %r4,%r4
- bler %r14
-
- stmg %r6,%r7,48(%r15)
- srag %r1,%r4,2 // cnt=len/4
- jz .Loop1_sqr
-
-.Loop4_sqr:
- lg %r7,0(%r3)
- mlgr %r6,%r7
- stg %r7,0(%r2)
- stg %r6,8(%r2)
-
- lg %r7,8(%r3)
- mlgr %r6,%r7
- stg %r7,16(%r2)
- stg %r6,24(%r2)
-
- lg %r7,16(%r3)
- mlgr %r6,%r7
- stg %r7,32(%r2)
- stg %r6,40(%r2)
-
- lg %r7,24(%r3)
- mlgr %r6,%r7
- stg %r7,48(%r2)
- stg %r6,56(%r2)
-
- la %r3,32(%r3)
- la %r2,64(%r2)
- brct %r1,.Loop4_sqr
-
- lghi %r1,3
- nr %r4,%r1 // cnt=len%4
- jz .Lend_sqr
-
-.Loop1_sqr:
- lg %r7,0(%r3)
- mlgr %r6,%r7
- stg %r7,0(%r2)
- stg %r6,8(%r2)
-
- la %r3,8(%r3)
- la %r2,16(%r2)
- brct %r4,.Loop1_sqr
-
-.Lend_sqr:
- lmg %r6,%r7,48(%r15)
- br %r14
-.size bn_sqr_words,.-bn_sqr_words
-
-// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
-.globl bn_div_words
-.type bn_div_words,@function
-.align 4
-bn_div_words:
- dlgr %r2,%r4
- lgr %r2,%r3
- br %r14
-.size bn_div_words,.-bn_div_words
-
-// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
-.globl bn_add_words
-.type bn_add_words,@function
-.align 4
-bn_add_words:
- la %r1,0(%r2) // put rp aside
- lghi %r2,0 // i=0
- ltgfr %r5,%r5
- bler %r14 // if (len<=0) return 0;
-
- stg %r6,48(%r15)
- lghi %r6,3
- nr %r6,%r5 // len%4
- sra %r5,2 // len/4, use sra because it sets condition code
- jz .Loop1_add // carry is incidentally cleared if branch taken
- algr %r2,%r2 // clear carry
-
-.Loop4_add:
- lg %r0,0(%r2,%r3)
- alcg %r0,0(%r2,%r4)
- stg %r0,0(%r2,%r1)
- lg %r0,8(%r2,%r3)
- alcg %r0,8(%r2,%r4)
- stg %r0,8(%r2,%r1)
- lg %r0,16(%r2,%r3)
- alcg %r0,16(%r2,%r4)
- stg %r0,16(%r2,%r1)
- lg %r0,24(%r2,%r3)
- alcg %r0,24(%r2,%r4)
- stg %r0,24(%r2,%r1)
-
- la %r2,32(%r2) // i+=4
- brct %r5,.Loop4_add
-
- la %r6,1(%r6) // see if len%4 is zero ...
- brct %r6,.Loop1_add // without touching condition code:-)
-
-.Lexit_add:
- lghi %r2,0
- alcgr %r2,%r2
- lg %r6,48(%r15)
- br %r14
-
-.Loop1_add:
- lg %r0,0(%r2,%r3)
- alcg %r0,0(%r2,%r4)
- stg %r0,0(%r2,%r1)
-
- la %r2,8(%r2) // i++
- brct %r6,.Loop1_add
-
- j .Lexit_add
-.size bn_add_words,.-bn_add_words
-
-// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
-.globl bn_sub_words
-.type bn_sub_words,@function
-.align 4
-bn_sub_words:
- la %r1,0(%r2) // put rp aside
- lghi %r2,0 // i=0
- ltgfr %r5,%r5
- bler %r14 // if (len<=0) return 0;
-
- stg %r6,48(%r15)
- lghi %r6,3
- nr %r6,%r5 // len%4
- sra %r5,2 // len/4, use sra because it sets condition code
- jnz .Loop4_sub // borrow is incidentally cleared if branch taken
- slgr %r2,%r2 // clear borrow
-
-.Loop1_sub:
- lg %r0,0(%r2,%r3)
- slbg %r0,0(%r2,%r4)
- stg %r0,0(%r2,%r1)
-
- la %r2,8(%r2) // i++
- brct %r6,.Loop1_sub
- j .Lexit_sub
-
-.Loop4_sub:
- lg %r0,0(%r2,%r3)
- slbg %r0,0(%r2,%r4)
- stg %r0,0(%r2,%r1)
- lg %r0,8(%r2,%r3)
- slbg %r0,8(%r2,%r4)
- stg %r0,8(%r2,%r1)
- lg %r0,16(%r2,%r3)
- slbg %r0,16(%r2,%r4)
- stg %r0,16(%r2,%r1)
- lg %r0,24(%r2,%r3)
- slbg %r0,24(%r2,%r4)
- stg %r0,24(%r2,%r1)
-
- la %r2,32(%r2) // i+=4
- brct %r5,.Loop4_sub
-
- la %r6,1(%r6) // see if len%4 is zero ...
- brct %r6,.Loop1_sub // without touching condition code:-)
-
-.Lexit_sub:
- lghi %r2,0
- slbgr %r2,%r2
- lcgr %r2,%r2
- lg %r6,48(%r15)
- br %r14
-.size bn_sub_words,.-bn_sub_words
-
-#define c1 %r1
-#define c2 %r5
-#define c3 %r8
-
-#define mul_add_c(ai,bi,c1,c2,c3) \
- lg %r7,ai*8(%r3); \
- mlg %r6,bi*8(%r4); \
- algr c1,%r7; \
- alcgr c2,%r6; \
- alcgr c3,zero
-
-// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
-.globl bn_mul_comba8
-.type bn_mul_comba8,@function
-.align 4
-bn_mul_comba8:
- stmg %r6,%r8,48(%r15)
-
- lghi c1,0
- lghi c2,0
- lghi c3,0
- lghi zero,0
-
- mul_add_c(0,0,c1,c2,c3);
- stg c1,0*8(%r2)
- lghi c1,0
-
- mul_add_c(0,1,c2,c3,c1);
- mul_add_c(1,0,c2,c3,c1);
- stg c2,1*8(%r2)
- lghi c2,0
-
- mul_add_c(2,0,c3,c1,c2);
- mul_add_c(1,1,c3,c1,c2);
- mul_add_c(0,2,c3,c1,c2);
- stg c3,2*8(%r2)
- lghi c3,0
-
- mul_add_c(0,3,c1,c2,c3);
- mul_add_c(1,2,c1,c2,c3);
- mul_add_c(2,1,c1,c2,c3);
- mul_add_c(3,0,c1,c2,c3);
- stg c1,3*8(%r2)
- lghi c1,0
-
- mul_add_c(4,0,c2,c3,c1);
- mul_add_c(3,1,c2,c3,c1);
- mul_add_c(2,2,c2,c3,c1);
- mul_add_c(1,3,c2,c3,c1);
- mul_add_c(0,4,c2,c3,c1);
- stg c2,4*8(%r2)
- lghi c2,0
-
- mul_add_c(0,5,c3,c1,c2);
- mul_add_c(1,4,c3,c1,c2);
- mul_add_c(2,3,c3,c1,c2);
- mul_add_c(3,2,c3,c1,c2);
- mul_add_c(4,1,c3,c1,c2);
- mul_add_c(5,0,c3,c1,c2);
- stg c3,5*8(%r2)
- lghi c3,0
-
- mul_add_c(6,0,c1,c2,c3);
- mul_add_c(5,1,c1,c2,c3);
- mul_add_c(4,2,c1,c2,c3);
- mul_add_c(3,3,c1,c2,c3);
- mul_add_c(2,4,c1,c2,c3);
- mul_add_c(1,5,c1,c2,c3);
- mul_add_c(0,6,c1,c2,c3);
- stg c1,6*8(%r2)
- lghi c1,0
-
- mul_add_c(0,7,c2,c3,c1);
- mul_add_c(1,6,c2,c3,c1);
- mul_add_c(2,5,c2,c3,c1);
- mul_add_c(3,4,c2,c3,c1);
- mul_add_c(4,3,c2,c3,c1);
- mul_add_c(5,2,c2,c3,c1);
- mul_add_c(6,1,c2,c3,c1);
- mul_add_c(7,0,c2,c3,c1);
- stg c2,7*8(%r2)
- lghi c2,0
-
- mul_add_c(7,1,c3,c1,c2);
- mul_add_c(6,2,c3,c1,c2);
- mul_add_c(5,3,c3,c1,c2);
- mul_add_c(4,4,c3,c1,c2);
- mul_add_c(3,5,c3,c1,c2);
- mul_add_c(2,6,c3,c1,c2);
- mul_add_c(1,7,c3,c1,c2);
- stg c3,8*8(%r2)
- lghi c3,0
-
- mul_add_c(2,7,c1,c2,c3);
- mul_add_c(3,6,c1,c2,c3);
- mul_add_c(4,5,c1,c2,c3);
- mul_add_c(5,4,c1,c2,c3);
- mul_add_c(6,3,c1,c2,c3);
- mul_add_c(7,2,c1,c2,c3);
- stg c1,9*8(%r2)
- lghi c1,0
-
- mul_add_c(7,3,c2,c3,c1);
- mul_add_c(6,4,c2,c3,c1);
- mul_add_c(5,5,c2,c3,c1);
- mul_add_c(4,6,c2,c3,c1);
- mul_add_c(3,7,c2,c3,c1);
- stg c2,10*8(%r2)
- lghi c2,0
-
- mul_add_c(4,7,c3,c1,c2);
- mul_add_c(5,6,c3,c1,c2);
- mul_add_c(6,5,c3,c1,c2);
- mul_add_c(7,4,c3,c1,c2);
- stg c3,11*8(%r2)
- lghi c3,0
-
- mul_add_c(7,5,c1,c2,c3);
- mul_add_c(6,6,c1,c2,c3);
- mul_add_c(5,7,c1,c2,c3);
- stg c1,12*8(%r2)
- lghi c1,0
-
-
- mul_add_c(6,7,c2,c3,c1);
- mul_add_c(7,6,c2,c3,c1);
- stg c2,13*8(%r2)
- lghi c2,0
-
- mul_add_c(7,7,c3,c1,c2);
- stg c3,14*8(%r2)
- stg c1,15*8(%r2)
-
- lmg %r6,%r8,48(%r15)
- br %r14
-.size bn_mul_comba8,.-bn_mul_comba8
-
-// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
-.globl bn_mul_comba4
-.type bn_mul_comba4,@function
-.align 4
-bn_mul_comba4:
- stmg %r6,%r8,48(%r15)
-
- lghi c1,0
- lghi c2,0
- lghi c3,0
- lghi zero,0
-
- mul_add_c(0,0,c1,c2,c3);
- stg c1,0*8(%r3)
- lghi c1,0
-
- mul_add_c(0,1,c2,c3,c1);
- mul_add_c(1,0,c2,c3,c1);
- stg c2,1*8(%r2)
- lghi c2,0
-
- mul_add_c(2,0,c3,c1,c2);
- mul_add_c(1,1,c3,c1,c2);
- mul_add_c(0,2,c3,c1,c2);
- stg c3,2*8(%r2)
- lghi c3,0
-
- mul_add_c(0,3,c1,c2,c3);
- mul_add_c(1,2,c1,c2,c3);
- mul_add_c(2,1,c1,c2,c3);
- mul_add_c(3,0,c1,c2,c3);
- stg c1,3*8(%r2)
- lghi c1,0
-
- mul_add_c(3,1,c2,c3,c1);
- mul_add_c(2,2,c2,c3,c1);
- mul_add_c(1,3,c2,c3,c1);
- stg c2,4*8(%r2)
- lghi c2,0
-
- mul_add_c(2,3,c3,c1,c2);
- mul_add_c(3,2,c3,c1,c2);
- stg c3,5*8(%r2)
- lghi c3,0
-
- mul_add_c(3,3,c1,c2,c3);
- stg c1,6*8(%r2)
- stg c2,7*8(%r2)
-
- stmg %r6,%r8,48(%r15)
- br %r14
-.size bn_mul_comba4,.-bn_mul_comba4
-
-#define sqr_add_c(ai,c1,c2,c3) \
- lg %r7,ai*8(%r3); \
- mlgr %r6,%r7; \
- algr c1,%r7; \
- alcgr c2,%r6; \
- alcgr c3,zero
-
-#define sqr_add_c2(ai,aj,c1,c2,c3) \
- lg %r7,ai*8(%r3); \
- mlg %r6,aj*8(%r3); \
- algr c1,%r7; \
- alcgr c2,%r6; \
- alcgr c3,zero; \
- algr c1,%r7; \
- alcgr c2,%r6; \
- alcgr c3,zero
-
-// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
-.globl bn_sqr_comba8
-.type bn_sqr_comba8,@function
-.align 4
-bn_sqr_comba8:
- stmg %r6,%r8,48(%r15)
-
- lghi c1,0
- lghi c2,0
- lghi c3,0
- lghi zero,0
-
- sqr_add_c(0,c1,c2,c3);
- stg c1,0*8(%r2)
- lghi c1,0
-
- sqr_add_c2(1,0,c2,c3,c1);
- stg c2,1*8(%r2)
- lghi c2,0
-
- sqr_add_c(1,c3,c1,c2);
- sqr_add_c2(2,0,c3,c1,c2);
- stg c3,2*8(%r2)
- lghi c3,0
-
- sqr_add_c2(3,0,c1,c2,c3);
- sqr_add_c2(2,1,c1,c2,c3);
- stg c1,3*8(%r2)
- lghi c1,0
-
- sqr_add_c(2,c2,c3,c1);
- sqr_add_c2(3,1,c2,c3,c1);
- sqr_add_c2(4,0,c2,c3,c1);
- stg c2,4*8(%r2)
- lghi c2,0
-
- sqr_add_c2(5,0,c3,c1,c2);
- sqr_add_c2(4,1,c3,c1,c2);
- sqr_add_c2(3,2,c3,c1,c2);
- stg c3,5*8(%r2)
- lghi c3,0
-
- sqr_add_c(3,c1,c2,c3);
- sqr_add_c2(4,2,c1,c2,c3);
- sqr_add_c2(5,1,c1,c2,c3);
- sqr_add_c2(6,0,c1,c2,c3);
- stg c1,6*8(%r2)
- lghi c1,0
-
- sqr_add_c2(7,0,c2,c3,c1);
- sqr_add_c2(6,1,c2,c3,c1);
- sqr_add_c2(5,2,c2,c3,c1);
- sqr_add_c2(4,3,c2,c3,c1);
- stg c2,7*8(%r2)
- lghi c2,0
-
- sqr_add_c(4,c3,c1,c2);
- sqr_add_c2(5,3,c3,c1,c2);
- sqr_add_c2(6,2,c3,c1,c2);
- sqr_add_c2(7,1,c3,c1,c2);
- stg c3,8*8(%r2)
- lghi c3,0
-
- sqr_add_c2(7,2,c1,c2,c3);
- sqr_add_c2(6,3,c1,c2,c3);
- sqr_add_c2(5,4,c1,c2,c3);
- stg c1,9*8(%r2)
- lghi c1,0
-
- sqr_add_c(5,c2,c3,c1);
- sqr_add_c2(6,4,c2,c3,c1);
- sqr_add_c2(7,3,c2,c3,c1);
- stg c2,10*8(%r2)
- lghi c2,0
-
- sqr_add_c2(7,4,c3,c1,c2);
- sqr_add_c2(6,5,c3,c1,c2);
- stg c3,11*8(%r2)
- lghi c3,0
-
- sqr_add_c(6,c1,c2,c3);
- sqr_add_c2(7,5,c1,c2,c3);
- stg c1,12*8(%r2)
- lghi c1,0
-
- sqr_add_c2(7,6,c2,c3,c1);
- stg c2,13*8(%r2)
- lghi c2,0
-
- sqr_add_c(7,c3,c1,c2);
- stg c3,14*8(%r2)
- stg c1,15*8(%r2)
-
- lmg %r6,%r8,48(%r15)
- br %r14
-.size bn_sqr_comba8,.-bn_sqr_comba8
-
-// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
-.globl bn_sqr_comba4
-.type bn_sqr_comba4,@function
-.align 4
-bn_sqr_comba4:
- stmg %r6,%r8,48(%r15)
-
- lghi c1,0
- lghi c2,0
- lghi c3,0
- lghi zero,0
-
- sqr_add_c(0,c1,c2,c3);
- stg c1,0*8(%r2)
- lghi c1,0
-
- sqr_add_c2(1,0,c2,c3,c1);
- stg c2,1*8(%r2)
- lghi c2,0
-
- sqr_add_c(1,c3,c1,c2);
- sqr_add_c2(2,0,c3,c1,c2);
- stg c3,2*8(%r2)
- lghi c3,0
-
- sqr_add_c2(3,0,c1,c2,c3);
- sqr_add_c2(2,1,c1,c2,c3);
- stg c1,3*8(%r2)
- lghi c1,0
-
- sqr_add_c(2,c2,c3,c1);
- sqr_add_c2(3,1,c2,c3,c1);
- stg c2,4*8(%r2)
- lghi c2,0
-
- sqr_add_c2(3,2,c3,c1,c2);
- stg c3,5*8(%r2)
- lghi c3,0
-
- sqr_add_c(3,c1,c2,c3);
- stg c1,6*8(%r2)
- stg c2,7*8(%r2)
-
- lmg %r6,%r8,48(%r15)
- br %r14
-.size bn_sqr_comba4,.-bn_sqr_comba4
diff --git a/crypto/bn/asm/sparcv9-mont.pl b/crypto/bn/asm/sparcv9-mont.pl
deleted file mode 100755
index b8fb1e8a25dc..000000000000
--- a/crypto/bn/asm/sparcv9-mont.pl
+++ /dev/null
@@ -1,606 +0,0 @@
-#!/usr/bin/env perl
-
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# December 2005
-#
-# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
-# for undertaken effort are multiple. First of all, UltraSPARC is not
-# the whole SPARCv9 universe and other VIS-free implementations deserve
-# optimized code as much. Secondly, newly introduced UltraSPARC T1,
-# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive pathes,
-# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
-# several integrated RSA/DSA accelerator circuits accessible through
-# kernel driver [only(*)], but having decent user-land software
-# implementation is important too. Finally, reasons like desire to
-# experiment with dedicated squaring procedure. Yes, this module
-# implements one, because it was easiest to draft it in SPARCv9
-# instructions...
-
-# (*) Engine accessing the driver in question is on my TODO list.
-# For reference, acceleator is estimated to give 6 to 10 times
-# improvement on single-threaded RSA sign. It should be noted
-# that 6-10x improvement coefficient does not actually mean
-# something extraordinary in terms of absolute [single-threaded]
-# performance, as SPARCv9 instruction set is by all means least
-# suitable for high performance crypto among other 64 bit
-# platforms. 6-10x factor simply places T1 in same performance
-# domain as say AMD64 and IA-64. Improvement of RSA verify don't
-# appear impressive at all, but it's the sign operation which is
-# far more critical/interesting.
-
-# You might notice that inner loops are modulo-scheduled:-) This has
-# essentially negligible impact on UltraSPARC performance, it's
-# Fujitsu SPARC64 V users who should notice and hopefully appreciate
-# the advantage... Currently this module surpasses sparcv9a-mont.pl
-# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
-# module still have hidden potential [see TODO list there], which is
-# estimated to be larger than 20%...
-
-# int bn_mul_mont(
-$rp="%i0"; # BN_ULONG *rp,
-$ap="%i1"; # const BN_ULONG *ap,
-$bp="%i2"; # const BN_ULONG *bp,
-$np="%i3"; # const BN_ULONG *np,
-$n0="%i4"; # const BN_ULONG *n0,
-$num="%i5"; # int num);
-
-$bits=32;
-for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-if ($bits==64) { $bias=2047; $frame=192; }
-else { $bias=0; $frame=128; }
-
-$car0="%o0";
-$car1="%o1";
-$car2="%o2"; # 1 bit
-$acc0="%o3";
-$acc1="%o4";
-$mask="%g1"; # 32 bits, what a waste...
-$tmp0="%g4";
-$tmp1="%g5";
-
-$i="%l0";
-$j="%l1";
-$mul0="%l2";
-$mul1="%l3";
-$tp="%l4";
-$apj="%l5";
-$npj="%l6";
-$tpj="%l7";
-
-$fname="bn_mul_mont_int";
-
-$code=<<___;
-.section ".text",#alloc,#execinstr
-
-.global $fname
-.align 32
-$fname:
- cmp %o5,4 ! 128 bits minimum
- bge,pt %icc,.Lenter
- sethi %hi(0xffffffff),$mask
- retl
- clr %o0
-.align 32
-.Lenter:
- save %sp,-$frame,%sp
- sll $num,2,$num ! num*=4
- or $mask,%lo(0xffffffff),$mask
- ld [$n0],$n0
- cmp $ap,$bp
- and $num,$mask,$num
- ld [$bp],$mul0 ! bp[0]
- nop
-
- add %sp,$bias,%o7 ! real top of stack
- ld [$ap],$car0 ! ap[0] ! redundant in squaring context
- sub %o7,$num,%o7
- ld [$ap+4],$apj ! ap[1]
- and %o7,-1024,%o7
- ld [$np],$car1 ! np[0]
- sub %o7,$bias,%sp ! alloca
- ld [$np+4],$npj ! np[1]
- be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
- mov 12,$j
-
- mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
- mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
- and $car0,$mask,$acc0
- add %sp,$bias+$frame,$tp
- ld [$ap+8],$apj !prologue!
-
- mulx $n0,$acc0,$mul1 ! "t[0]"*n0
- and $mul1,$mask,$mul1
-
- mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
- mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
- srlx $car0,32,$car0
- add $acc0,$car1,$car1
- ld [$np+8],$npj !prologue!
- srlx $car1,32,$car1
- mov $tmp0,$acc0 !prologue!
-
-.L1st:
- mulx $apj,$mul0,$tmp0
- mulx $npj,$mul1,$tmp1
- add $acc0,$car0,$car0
- ld [$ap+$j],$apj ! ap[j]
- and $car0,$mask,$acc0
- add $acc1,$car1,$car1
- ld [$np+$j],$npj ! np[j]
- srlx $car0,32,$car0
- add $acc0,$car1,$car1
- add $j,4,$j ! j++
- mov $tmp0,$acc0
- st $car1,[$tp]
- cmp $j,$num
- mov $tmp1,$acc1
- srlx $car1,32,$car1
- bl %icc,.L1st
- add $tp,4,$tp ! tp++
-!.L1st
-
- mulx $apj,$mul0,$tmp0 !epilogue!
- mulx $npj,$mul1,$tmp1
- add $acc0,$car0,$car0
- and $car0,$mask,$acc0
- add $acc1,$car1,$car1
- srlx $car0,32,$car0
- add $acc0,$car1,$car1
- st $car1,[$tp]
- srlx $car1,32,$car1
-
- add $tmp0,$car0,$car0
- and $car0,$mask,$acc0
- add $tmp1,$car1,$car1
- srlx $car0,32,$car0
- add $acc0,$car1,$car1
- st $car1,[$tp+4]
- srlx $car1,32,$car1
-
- add $car0,$car1,$car1
- st $car1,[$tp+8]
- srlx $car1,32,$car2
-
- mov 4,$i ! i++
- ld [$bp+4],$mul0 ! bp[1]
-.Louter:
- add %sp,$bias+$frame,$tp
- ld [$ap],$car0 ! ap[0]
- ld [$ap+4],$apj ! ap[1]
- ld [$np],$car1 ! np[0]
- ld [$np+4],$npj ! np[1]
- ld [$tp],$tmp1 ! tp[0]
- ld [$tp+4],$tpj ! tp[1]
- mov 12,$j
-
- mulx $car0,$mul0,$car0
- mulx $apj,$mul0,$tmp0 !prologue!
- add $tmp1,$car0,$car0
- ld [$ap+8],$apj !prologue!
- and $car0,$mask,$acc0
-
- mulx $n0,$acc0,$mul1
- and $mul1,$mask,$mul1
-
- mulx $car1,$mul1,$car1
- mulx $npj,$mul1,$acc1 !prologue!
- srlx $car0,32,$car0
- add $acc0,$car1,$car1
- ld [$np+8],$npj !prologue!
- srlx $car1,32,$car1
- mov $tmp0,$acc0 !prologue!
-
-.Linner:
- mulx $apj,$mul0,$tmp0
- mulx $npj,$mul1,$tmp1
- add $tpj,$car0,$car0
- ld [$ap+$j],$apj ! ap[j]
- add $acc0,$car0,$car0
- add $acc1,$car1,$car1
- ld [$np+$j],$npj ! np[j]
- and $car0,$mask,$acc0
- ld [$tp+8],$tpj ! tp[j]
- srlx $car0,32,$car0
- add $acc0,$car1,$car1
- add $j,4,$j ! j++
- mov $tmp0,$acc0
- st $car1,[$tp] ! tp[j-1]
- srlx $car1,32,$car1
- mov $tmp1,$acc1
- cmp $j,$num
- bl %icc,.Linner
- add $tp,4,$tp ! tp++
-!.Linner
-
- mulx $apj,$mul0,$tmp0 !epilogue!
- mulx $npj,$mul1,$tmp1
- add $tpj,$car0,$car0
- add $acc0,$car0,$car0
- ld [$tp+8],$tpj ! tp[j]
- and $car0,$mask,$acc0
- add $acc1,$car1,$car1
- srlx $car0,32,$car0
- add $acc0,$car1,$car1
- st $car1,[$tp] ! tp[j-1]
- srlx $car1,32,$car1
-
- add $tpj,$car0,$car0
- add $tmp0,$car0,$car0
- and $car0,$mask,$acc0
- add $tmp1,$car1,$car1
- add $acc0,$car1,$car1
- st $car1,[$tp+4] ! tp[j-1]
- srlx $car0,32,$car0
- add $i,4,$i ! i++
- srlx $car1,32,$car1
-
- add $car0,$car1,$car1
- cmp $i,$num
- add $car2,$car1,$car1
- st $car1,[$tp+8]
-
- srlx $car1,32,$car2
- bl,a %icc,.Louter
- ld [$bp+$i],$mul0 ! bp[i]
-!.Louter
-
- add $tp,12,$tp
-
-.Ltail:
- add $np,$num,$np
- add $rp,$num,$rp
- mov $tp,$ap
- sub %g0,$num,%o7 ! k=-num
- ba .Lsub
- subcc %g0,%g0,%g0 ! clear %icc.c
-.align 16
-.Lsub:
- ld [$tp+%o7],%o0
- ld [$np+%o7],%o1
- subccc %o0,%o1,%o1 ! tp[j]-np[j]
- add $rp,%o7,$i
- add %o7,4,%o7
- brnz %o7,.Lsub
- st %o1,[$i]
- subc $car2,0,$car2 ! handle upmost overflow bit
- and $tp,$car2,$ap
- andn $rp,$car2,$np
- or $ap,$np,$ap
- sub %g0,$num,%o7
-
-.Lcopy:
- ld [$ap+%o7],%o0 ! copy or in-place refresh
- st %g0,[$tp+%o7] ! zap tp
- st %o0,[$rp+%o7]
- add %o7,4,%o7
- brnz %o7,.Lcopy
- nop
- mov 1,%i0
- ret
- restore
-___
-
-########
-######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
-######## code without following dedicated squaring procedure.
-########
-$sbit="%i2"; # re-use $bp!
-
-$code.=<<___;
-.align 32
-.Lbn_sqr_mont:
- mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
- mulx $apj,$mul0,$tmp0 !prologue!
- and $car0,$mask,$acc0
- add %sp,$bias+$frame,$tp
- ld [$ap+8],$apj !prologue!
-
- mulx $n0,$acc0,$mul1 ! "t[0]"*n0
- srlx $car0,32,$car0
- and $mul1,$mask,$mul1
-
- mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
- mulx $npj,$mul1,$acc1 !prologue!
- and $car0,1,$sbit
- ld [$np+8],$npj !prologue!
- srlx $car0,1,$car0
- add $acc0,$car1,$car1
- srlx $car1,32,$car1
- mov $tmp0,$acc0 !prologue!
-
-.Lsqr_1st:
- mulx $apj,$mul0,$tmp0
- mulx $npj,$mul1,$tmp1
- add $acc0,$car0,$car0 ! ap[j]*a0+c0
- add $acc1,$car1,$car1
- ld [$ap+$j],$apj ! ap[j]
- and $car0,$mask,$acc0
- ld [$np+$j],$npj ! np[j]
- srlx $car0,32,$car0
- add $acc0,$acc0,$acc0
- or $sbit,$acc0,$acc0
- mov $tmp1,$acc1
- srlx $acc0,32,$sbit
- add $j,4,$j ! j++
- and $acc0,$mask,$acc0
- cmp $j,$num
- add $acc0,$car1,$car1
- st $car1,[$tp]
- mov $tmp0,$acc0
- srlx $car1,32,$car1
- bl %icc,.Lsqr_1st
- add $tp,4,$tp ! tp++
-!.Lsqr_1st
-
- mulx $apj,$mul0,$tmp0 ! epilogue
- mulx $npj,$mul1,$tmp1
- add $acc0,$car0,$car0 ! ap[j]*a0+c0
- add $acc1,$car1,$car1
- and $car0,$mask,$acc0
- srlx $car0,32,$car0
- add $acc0,$acc0,$acc0
- or $sbit,$acc0,$acc0
- srlx $acc0,32,$sbit
- and $acc0,$mask,$acc0
- add $acc0,$car1,$car1
- st $car1,[$tp]
- srlx $car1,32,$car1
-
- add $tmp0,$car0,$car0 ! ap[j]*a0+c0
- add $tmp1,$car1,$car1
- and $car0,$mask,$acc0
- srlx $car0,32,$car0
- add $acc0,$acc0,$acc0
- or $sbit,$acc0,$acc0
- srlx $acc0,32,$sbit
- and $acc0,$mask,$acc0
- add $acc0,$car1,$car1
- st $car1,[$tp+4]
- srlx $car1,32,$car1
-
- add $car0,$car0,$car0
- or $sbit,$car0,$car0
- add $car0,$car1,$car1
- st $car1,[$tp+8]
- srlx $car1,32,$car2
-
- ld [%sp+$bias+$frame],$tmp0 ! tp[0]
- ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
- ld [%sp+$bias+$frame+8],$tpj ! tp[2]
- ld [$ap+4],$mul0 ! ap[1]
- ld [$ap+8],$apj ! ap[2]
- ld [$np],$car1 ! np[0]
- ld [$np+4],$npj ! np[1]
- mulx $n0,$tmp0,$mul1
-
- mulx $mul0,$mul0,$car0
- and $mul1,$mask,$mul1
-
- mulx $car1,$mul1,$car1
- mulx $npj,$mul1,$acc1
- add $tmp0,$car1,$car1
- and $car0,$mask,$acc0
- ld [$np+8],$npj ! np[2]
- srlx $car1,32,$car1
- add $tmp1,$car1,$car1
- srlx $car0,32,$car0
- add $acc0,$car1,$car1
- and $car0,1,$sbit
- add $acc1,$car1,$car1
- srlx $car0,1,$car0
- mov 12,$j
- st $car1,[%sp+$bias+$frame] ! tp[0]=
- srlx $car1,32,$car1
- add %sp,$bias+$frame+4,$tp
-
-.Lsqr_2nd:
- mulx $apj,$mul0,$acc0
- mulx $npj,$mul1,$acc1
- add $acc0,$car0,$car0
- add $tpj,$car1,$car1
- ld [$ap+$j],$apj ! ap[j]
- and $car0,$mask,$acc0
- ld [$np+$j],$npj ! np[j]
- srlx $car0,32,$car0
- add $acc1,$car1,$car1
- ld [$tp+8],$tpj ! tp[j]
- add $acc0,$acc0,$acc0
- add $j,4,$j ! j++
- or $sbit,$acc0,$acc0
- srlx $acc0,32,$sbit
- and $acc0,$mask,$acc0
- cmp $j,$num
- add $acc0,$car1,$car1
- st $car1,[$tp] ! tp[j-1]
- srlx $car1,32,$car1
- bl %icc,.Lsqr_2nd
- add $tp,4,$tp ! tp++
-!.Lsqr_2nd
-
- mulx $apj,$mul0,$acc0
- mulx $npj,$mul1,$acc1
- add $acc0,$car0,$car0
- add $tpj,$car1,$car1
- and $car0,$mask,$acc0
- srlx $car0,32,$car0
- add $acc1,$car1,$car1
- add $acc0,$acc0,$acc0
- or $sbit,$acc0,$acc0
- srlx $acc0,32,$sbit
- and $acc0,$mask,$acc0
- add $acc0,$car1,$car1
- st $car1,[$tp] ! tp[j-1]
- srlx $car1,32,$car1
-
- add $car0,$car0,$car0
- or $sbit,$car0,$car0
- add $car0,$car1,$car1
- add $car2,$car1,$car1
- st $car1,[$tp+4]
- srlx $car1,32,$car2
-
- ld [%sp+$bias+$frame],$tmp1 ! tp[0]
- ld [%sp+$bias+$frame+4],$tpj ! tp[1]
- ld [$ap+8],$mul0 ! ap[2]
- ld [$np],$car1 ! np[0]
- ld [$np+4],$npj ! np[1]
- mulx $n0,$tmp1,$mul1
- and $mul1,$mask,$mul1
- mov 8,$i
-
- mulx $mul0,$mul0,$car0
- mulx $car1,$mul1,$car1
- and $car0,$mask,$acc0
- add $tmp1,$car1,$car1
- srlx $car0,32,$car0
- add %sp,$bias+$frame,$tp
- srlx $car1,32,$car1
- and $car0,1,$sbit
- srlx $car0,1,$car0
- mov 4,$j
-
-.Lsqr_outer:
-.Lsqr_inner1:
- mulx $npj,$mul1,$acc1
- add $tpj,$car1,$car1
- add $j,4,$j
- ld [$tp+8],$tpj
- cmp $j,$i
- add $acc1,$car1,$car1
- ld [$np+$j],$npj
- st $car1,[$tp]
- srlx $car1,32,$car1
- bl %icc,.Lsqr_inner1
- add $tp,4,$tp
-!.Lsqr_inner1
-
- add $j,4,$j
- ld [$ap+$j],$apj ! ap[j]
- mulx $npj,$mul1,$acc1
- add $tpj,$car1,$car1
- ld [$np+$j],$npj ! np[j]
- add $acc0,$car1,$car1
- ld [$tp+8],$tpj ! tp[j]
- add $acc1,$car1,$car1
- st $car1,[$tp]
- srlx $car1,32,$car1
-
- add $j,4,$j
- cmp $j,$num
- be,pn %icc,.Lsqr_no_inner2
- add $tp,4,$tp
-
-.Lsqr_inner2:
- mulx $apj,$mul0,$acc0
- mulx $npj,$mul1,$acc1
- add $tpj,$car1,$car1
- add $acc0,$car0,$car0
- ld [$ap+$j],$apj ! ap[j]
- and $car0,$mask,$acc0
- ld [$np+$j],$npj ! np[j]
- srlx $car0,32,$car0
- add $acc0,$acc0,$acc0
- ld [$tp+8],$tpj ! tp[j]
- or $sbit,$acc0,$acc0
- add $j,4,$j ! j++
- srlx $acc0,32,$sbit
- and $acc0,$mask,$acc0
- cmp $j,$num
- add $acc0,$car1,$car1
- add $acc1,$car1,$car1
- st $car1,[$tp] ! tp[j-1]
- srlx $car1,32,$car1
- bl %icc,.Lsqr_inner2
- add $tp,4,$tp ! tp++
-
-.Lsqr_no_inner2:
- mulx $apj,$mul0,$acc0
- mulx $npj,$mul1,$acc1
- add $tpj,$car1,$car1
- add $acc0,$car0,$car0
- and $car0,$mask,$acc0
- srlx $car0,32,$car0
- add $acc0,$acc0,$acc0
- or $sbit,$acc0,$acc0
- srlx $acc0,32,$sbit
- and $acc0,$mask,$acc0
- add $acc0,$car1,$car1
- add $acc1,$car1,$car1
- st $car1,[$tp] ! tp[j-1]
- srlx $car1,32,$car1
-
- add $car0,$car0,$car0
- or $sbit,$car0,$car0
- add $car0,$car1,$car1
- add $car2,$car1,$car1
- st $car1,[$tp+4]
- srlx $car1,32,$car2
-
- add $i,4,$i ! i++
- ld [%sp+$bias+$frame],$tmp1 ! tp[0]
- ld [%sp+$bias+$frame+4],$tpj ! tp[1]
- ld [$ap+$i],$mul0 ! ap[j]
- ld [$np],$car1 ! np[0]
- ld [$np+4],$npj ! np[1]
- mulx $n0,$tmp1,$mul1
- and $mul1,$mask,$mul1
- add $i,4,$tmp0
-
- mulx $mul0,$mul0,$car0
- mulx $car1,$mul1,$car1
- and $car0,$mask,$acc0
- add $tmp1,$car1,$car1
- srlx $car0,32,$car0
- add %sp,$bias+$frame,$tp
- srlx $car1,32,$car1
- and $car0,1,$sbit
- srlx $car0,1,$car0
-
- cmp $tmp0,$num ! i<num-1
- bl %icc,.Lsqr_outer
- mov 4,$j
-
-.Lsqr_last:
- mulx $npj,$mul1,$acc1
- add $tpj,$car1,$car1
- add $j,4,$j
- ld [$tp+8],$tpj
- cmp $j,$i
- add $acc1,$car1,$car1
- ld [$np+$j],$npj
- st $car1,[$tp]
- srlx $car1,32,$car1
- bl %icc,.Lsqr_last
- add $tp,4,$tp
-!.Lsqr_last
-
- mulx $npj,$mul1,$acc1
- add $tpj,$car1,$car1
- add $acc0,$car1,$car1
- add $acc1,$car1,$car1
- st $car1,[$tp]
- srlx $car1,32,$car1
-
- add $car0,$car0,$car0 ! recover $car0
- or $sbit,$car0,$car0
- add $car0,$car1,$car1
- add $car2,$car1,$car1
- st $car1,[$tp+4]
- srlx $car1,32,$car2
-
- ba .Ltail
- add $tp,8,$tp
-.type $fname,#function
-.size $fname,(.-$fname)
-.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
-.align 32
-___
-$code =~ s/\`([^\`]*)\`/eval($1)/gem;
-print $code;
-close STDOUT;
diff --git a/crypto/bn/asm/sparcv9a-mont.pl b/crypto/bn/asm/sparcv9a-mont.pl
deleted file mode 100755
index a14205f2f006..000000000000
--- a/crypto/bn/asm/sparcv9a-mont.pl
+++ /dev/null
@@ -1,882 +0,0 @@
-#!/usr/bin/env perl
-
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# October 2005
-#
-# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
-# Because unlike integer multiplier, which simply stalls whole CPU,
-# FPU is fully pipelined and can effectively emit 48 bit partial
-# product every cycle. Why not blended SPARC v9? One can argue that
-# making this module dependent on UltraSPARC VIS extension limits its
-# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
-# implementations from compatibility matrix. But the rest, whole Sun
-# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
-# VIS extension instructions used in this module. This is considered
-# good enough to not care about HAL SPARC64 users [if any] who have
-# integer-only pure SPARCv9 module to "fall down" to.
-
-# USI&II cores currently exhibit uniform 2x improvement [over pre-
-# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
-# performance improves few percents for shorter keys and worsens few
-# percents for longer keys. This is because USIII integer multiplier
-# is >3x faster than USI&II one, which is harder to match [but see
-# TODO list below]. It should also be noted that SPARC64 V features
-# out-of-order execution, which *might* mean that integer multiplier
-# is pipelined, which in turn *might* be impossible to match... On
-# additional note, SPARC64 V implements FP Multiply-Add instruction,
-# which is perfectly usable in this context... In other words, as far
-# as Fujitsu SPARC64 V goes, talk to the author:-)
-
-# The implementation implies following "non-natural" limitations on
-# input arguments:
-# - num may not be less than 4;
-# - num has to be even;
-# Failure to meet either condition has no fatal effects, simply
-# doesn't give any performance gain.
-
-# TODO:
-# - modulo-schedule inner loop for better performance (on in-order
-# execution core such as UltraSPARC this shall result in further
-# noticeable(!) improvement);
-# - dedicated squaring procedure[?];
-
-######################################################################
-# November 2006
-#
-# Modulo-scheduled inner loops allow to interleave floating point and
-# integer instructions and minimize Read-After-Write penalties. This
-# results in *further* 20-50% perfromance improvement [depending on
-# key length, more for longer keys] on USI&II cores and 30-80% - on
-# USIII&IV.
-
-$fname="bn_mul_mont_fpu";
-$bits=32;
-for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
-
-if ($bits==64) {
- $bias=2047;
- $frame=192;
-} else {
- $bias=0;
- $frame=128; # 96 rounded up to largest known cache-line
-}
-$locals=64;
-
-# In order to provide for 32-/64-bit ABI duality, I keep integers wider
-# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
-# exclusively for pointers, indexes and other small values...
-# int bn_mul_mont(
-$rp="%i0"; # BN_ULONG *rp,
-$ap="%i1"; # const BN_ULONG *ap,
-$bp="%i2"; # const BN_ULONG *bp,
-$np="%i3"; # const BN_ULONG *np,
-$n0="%i4"; # const BN_ULONG *n0,
-$num="%i5"; # int num);
-
-$tp="%l0"; # t[num]
-$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
-$ap_h="%l2"; # to these four vectors as double-precision FP values.
-$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
-$np_h="%l4"; # loop and L1-cache aliasing is minimized...
-$i="%l5";
-$j="%l6";
-$mask="%l7"; # 16-bit mask, 0xffff
-
-$n0="%g4"; # reassigned(!) to "64-bit" register
-$carry="%i4"; # %i4 reused(!) for a carry bit
-
-# FP register naming chart
-#
-# ..HILO
-# dcba
-# --------
-# LOa
-# LOb
-# LOc
-# LOd
-# HIa
-# HIb
-# HIc
-# HId
-# ..a
-# ..b
-$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
-$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
-$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
-$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
-
-$dota="%f24"; $dotb="%f26";
-
-$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
-$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
-$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
-$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
-
-$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
-
-$code=<<___;
-.section ".text",#alloc,#execinstr
-
-.global $fname
-.align 32
-$fname:
- save %sp,-$frame-$locals,%sp
-
- cmp $num,4
- bl,a,pn %icc,.Lret
- clr %i0
- andcc $num,1,%g0 ! $num has to be even...
- bnz,a,pn %icc,.Lret
- clr %i0 ! signal "unsupported input value"
-
- srl $num,1,$num
- sethi %hi(0xffff),$mask
- ld [%i4+0],$n0 ! $n0 reassigned, remember?
- or $mask,%lo(0xffff),$mask
- ld [%i4+4],%o0
- sllx %o0,32,%o0
- or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
-
- sll $num,3,$num ! num*=8
-
- add %sp,$bias,%o0 ! real top of stack
- sll $num,2,%o1
- add %o1,$num,%o1 ! %o1=num*5
- sub %o0,%o1,%o0
- and %o0,-2048,%o0 ! optimize TLB utilization
- sub %o0,$bias,%sp ! alloca(5*num*8)
-
- rd %asi,%o7 ! save %asi
- add %sp,$bias+$frame+$locals,$tp
- add $tp,$num,$ap_l
- add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
- add $ap_l,$num,$ap_h
- add $ap_h,$num,$np_l
- add $np_l,$num,$np_h
-
- wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
-
- add $rp,$num,$rp ! readjust input pointers to point
- add $ap,$num,$ap ! at the ends too...
- add $bp,$num,$bp
- add $np,$num,$np
-
- stx %o7,[%sp+$bias+$frame+48] ! save %asi
-
- sub %g0,$num,$i ! i=-num
- sub %g0,$num,$j ! j=-num
-
- add $ap,$j,%o3
- add $bp,$i,%o4
-
- ld [%o3+4],%g1 ! bp[0]
- ld [%o3+0],%o0
- ld [%o4+4],%g5 ! ap[0]
- sllx %g1,32,%g1
- ld [%o4+0],%o1
- sllx %g5,32,%g5
- or %g1,%o0,%o0
- or %g5,%o1,%o1
-
- add $np,$j,%o5
-
- mulx %o1,%o0,%o0 ! ap[0]*bp[0]
- mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
- stx %o0,[%sp+$bias+$frame+0]
-
- ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
- fzeros $alo
- ld [%o3+4],$ahi_
- fzeros $ahi
- ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
- fzeros $nlo
- ld [%o5+4],$nhi_
- fzeros $nhi
-
- ! transfer b[i] to FPU as 4x16-bit values
- ldda [%o4+2]%asi,$ba
- fxtod $alo,$alo
- ldda [%o4+0]%asi,$bb
- fxtod $ahi,$ahi
- ldda [%o4+6]%asi,$bc
- fxtod $nlo,$nlo
- ldda [%o4+4]%asi,$bd
- fxtod $nhi,$nhi
-
- ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
- ldda [%sp+$bias+$frame+6]%asi,$na
- fxtod $ba,$ba
- ldda [%sp+$bias+$frame+4]%asi,$nb
- fxtod $bb,$bb
- ldda [%sp+$bias+$frame+2]%asi,$nc
- fxtod $bc,$bc
- ldda [%sp+$bias+$frame+0]%asi,$nd
- fxtod $bd,$bd
-
- std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
- fxtod $na,$na
- std $ahi,[$ap_h+$j]
- fxtod $nb,$nb
- std $nlo,[$np_l+$j] ! save smashed np[j] in double format
- fxtod $nc,$nc
- std $nhi,[$np_h+$j]
- fxtod $nd,$nd
-
- fmuld $alo,$ba,$aloa
- fmuld $nlo,$na,$nloa
- fmuld $alo,$bb,$alob
- fmuld $nlo,$nb,$nlob
- fmuld $alo,$bc,$aloc
- faddd $aloa,$nloa,$nloa
- fmuld $nlo,$nc,$nloc
- fmuld $alo,$bd,$alod
- faddd $alob,$nlob,$nlob
- fmuld $nlo,$nd,$nlod
- fmuld $ahi,$ba,$ahia
- faddd $aloc,$nloc,$nloc
- fmuld $nhi,$na,$nhia
- fmuld $ahi,$bb,$ahib
- faddd $alod,$nlod,$nlod
- fmuld $nhi,$nb,$nhib
- fmuld $ahi,$bc,$ahic
- faddd $ahia,$nhia,$nhia
- fmuld $nhi,$nc,$nhic
- fmuld $ahi,$bd,$ahid
- faddd $ahib,$nhib,$nhib
- fmuld $nhi,$nd,$nhid
-
- faddd $ahic,$nhic,$dota ! $nhic
- faddd $ahid,$nhid,$dotb ! $nhid
-
- faddd $nloc,$nhia,$nloc
- faddd $nlod,$nhib,$nlod
-
- fdtox $nloa,$nloa
- fdtox $nlob,$nlob
- fdtox $nloc,$nloc
- fdtox $nlod,$nlod
-
- std $nloa,[%sp+$bias+$frame+0]
- add $j,8,$j
- std $nlob,[%sp+$bias+$frame+8]
- add $ap,$j,%o4
- std $nloc,[%sp+$bias+$frame+16]
- add $np,$j,%o5
- std $nlod,[%sp+$bias+$frame+24]
-
- ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
- fzeros $alo
- ld [%o4+4],$ahi_
- fzeros $ahi
- ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
- fzeros $nlo
- ld [%o5+4],$nhi_
- fzeros $nhi
-
- fxtod $alo,$alo
- fxtod $ahi,$ahi
- fxtod $nlo,$nlo
- fxtod $nhi,$nhi
-
- ldx [%sp+$bias+$frame+0],%o0
- fmuld $alo,$ba,$aloa
- ldx [%sp+$bias+$frame+8],%o1
- fmuld $nlo,$na,$nloa
- ldx [%sp+$bias+$frame+16],%o2
- fmuld $alo,$bb,$alob
- ldx [%sp+$bias+$frame+24],%o3
- fmuld $nlo,$nb,$nlob
-
- srlx %o0,16,%o7
- std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
- fmuld $alo,$bc,$aloc
- add %o7,%o1,%o1
- std $ahi,[$ap_h+$j]
- faddd $aloa,$nloa,$nloa
- fmuld $nlo,$nc,$nloc
- srlx %o1,16,%o7
- std $nlo,[$np_l+$j] ! save smashed np[j] in double format
- fmuld $alo,$bd,$alod
- add %o7,%o2,%o2
- std $nhi,[$np_h+$j]
- faddd $alob,$nlob,$nlob
- fmuld $nlo,$nd,$nlod
- srlx %o2,16,%o7
- fmuld $ahi,$ba,$ahia
- add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
- faddd $aloc,$nloc,$nloc
- fmuld $nhi,$na,$nhia
- !and %o0,$mask,%o0
- !and %o1,$mask,%o1
- !and %o2,$mask,%o2
- !sllx %o1,16,%o1
- !sllx %o2,32,%o2
- !sllx %o3,48,%o7
- !or %o1,%o0,%o0
- !or %o2,%o0,%o0
- !or %o7,%o0,%o0 ! 64-bit result
- srlx %o3,16,%g1 ! 34-bit carry
- fmuld $ahi,$bb,$ahib
-
- faddd $alod,$nlod,$nlod
- fmuld $nhi,$nb,$nhib
- fmuld $ahi,$bc,$ahic
- faddd $ahia,$nhia,$nhia
- fmuld $nhi,$nc,$nhic
- fmuld $ahi,$bd,$ahid
- faddd $ahib,$nhib,$nhib
- fmuld $nhi,$nd,$nhid
-
- faddd $dota,$nloa,$nloa
- faddd $dotb,$nlob,$nlob
- faddd $ahic,$nhic,$dota ! $nhic
- faddd $ahid,$nhid,$dotb ! $nhid
-
- faddd $nloc,$nhia,$nloc
- faddd $nlod,$nhib,$nlod
-
- fdtox $nloa,$nloa
- fdtox $nlob,$nlob
- fdtox $nloc,$nloc
- fdtox $nlod,$nlod
-
- std $nloa,[%sp+$bias+$frame+0]
- std $nlob,[%sp+$bias+$frame+8]
- addcc $j,8,$j
- std $nloc,[%sp+$bias+$frame+16]
- bz,pn %icc,.L1stskip
- std $nlod,[%sp+$bias+$frame+24]
-
-.align 32 ! incidentally already aligned !
-.L1st:
- add $ap,$j,%o4
- add $np,$j,%o5
- ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
- fzeros $alo
- ld [%o4+4],$ahi_
- fzeros $ahi
- ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
- fzeros $nlo
- ld [%o5+4],$nhi_
- fzeros $nhi
-
- fxtod $alo,$alo
- fxtod $ahi,$ahi
- fxtod $nlo,$nlo
- fxtod $nhi,$nhi
-
- ldx [%sp+$bias+$frame+0],%o0
- fmuld $alo,$ba,$aloa
- ldx [%sp+$bias+$frame+8],%o1
- fmuld $nlo,$na,$nloa
- ldx [%sp+$bias+$frame+16],%o2
- fmuld $alo,$bb,$alob
- ldx [%sp+$bias+$frame+24],%o3
- fmuld $nlo,$nb,$nlob
-
- srlx %o0,16,%o7
- std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
- fmuld $alo,$bc,$aloc
- add %o7,%o1,%o1
- std $ahi,[$ap_h+$j]
- faddd $aloa,$nloa,$nloa
- fmuld $nlo,$nc,$nloc
- srlx %o1,16,%o7
- std $nlo,[$np_l+$j] ! save smashed np[j] in double format
- fmuld $alo,$bd,$alod
- add %o7,%o2,%o2
- std $nhi,[$np_h+$j]
- faddd $alob,$nlob,$nlob
- fmuld $nlo,$nd,$nlod
- srlx %o2,16,%o7
- fmuld $ahi,$ba,$ahia
- add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
- and %o0,$mask,%o0
- faddd $aloc,$nloc,$nloc
- fmuld $nhi,$na,$nhia
- and %o1,$mask,%o1
- and %o2,$mask,%o2
- fmuld $ahi,$bb,$ahib
- sllx %o1,16,%o1
- faddd $alod,$nlod,$nlod
- fmuld $nhi,$nb,$nhib
- sllx %o2,32,%o2
- fmuld $ahi,$bc,$ahic
- sllx %o3,48,%o7
- or %o1,%o0,%o0
- faddd $ahia,$nhia,$nhia
- fmuld $nhi,$nc,$nhic
- or %o2,%o0,%o0
- fmuld $ahi,$bd,$ahid
- or %o7,%o0,%o0 ! 64-bit result
- faddd $ahib,$nhib,$nhib
- fmuld $nhi,$nd,$nhid
- addcc %g1,%o0,%o0
- faddd $dota,$nloa,$nloa
- srlx %o3,16,%g1 ! 34-bit carry
- faddd $dotb,$nlob,$nlob
- bcs,a %xcc,.+8
- add %g1,1,%g1
-
- stx %o0,[$tp] ! tp[j-1]=
-
- faddd $ahic,$nhic,$dota ! $nhic
- faddd $ahid,$nhid,$dotb ! $nhid
-
- faddd $nloc,$nhia,$nloc
- faddd $nlod,$nhib,$nlod
-
- fdtox $nloa,$nloa
- fdtox $nlob,$nlob
- fdtox $nloc,$nloc
- fdtox $nlod,$nlod
-
- std $nloa,[%sp+$bias+$frame+0]
- std $nlob,[%sp+$bias+$frame+8]
- std $nloc,[%sp+$bias+$frame+16]
- std $nlod,[%sp+$bias+$frame+24]
-
- addcc $j,8,$j
- bnz,pt %icc,.L1st
- add $tp,8,$tp
-
-.L1stskip:
- fdtox $dota,$dota
- fdtox $dotb,$dotb
-
- ldx [%sp+$bias+$frame+0],%o0
- ldx [%sp+$bias+$frame+8],%o1
- ldx [%sp+$bias+$frame+16],%o2
- ldx [%sp+$bias+$frame+24],%o3
-
- srlx %o0,16,%o7
- std $dota,[%sp+$bias+$frame+32]
- add %o7,%o1,%o1
- std $dotb,[%sp+$bias+$frame+40]
- srlx %o1,16,%o7
- add %o7,%o2,%o2
- srlx %o2,16,%o7
- add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
- and %o0,$mask,%o0
- and %o1,$mask,%o1
- and %o2,$mask,%o2
- sllx %o1,16,%o1
- sllx %o2,32,%o2
- sllx %o3,48,%o7
- or %o1,%o0,%o0
- or %o2,%o0,%o0
- or %o7,%o0,%o0 ! 64-bit result
- ldx [%sp+$bias+$frame+32],%o4
- addcc %g1,%o0,%o0
- ldx [%sp+$bias+$frame+40],%o5
- srlx %o3,16,%g1 ! 34-bit carry
- bcs,a %xcc,.+8
- add %g1,1,%g1
-
- stx %o0,[$tp] ! tp[j-1]=
- add $tp,8,$tp
-
- srlx %o4,16,%o7
- add %o7,%o5,%o5
- and %o4,$mask,%o4
- sllx %o5,16,%o7
- or %o7,%o4,%o4
- addcc %g1,%o4,%o4
- srlx %o5,48,%g1
- bcs,a %xcc,.+8
- add %g1,1,%g1
-
- mov %g1,$carry
- stx %o4,[$tp] ! tp[num-1]=
-
- ba .Louter
- add $i,8,$i
-.align 32
-.Louter:
- sub %g0,$num,$j ! j=-num
- add %sp,$bias+$frame+$locals,$tp
-
- add $ap,$j,%o3
- add $bp,$i,%o4
-
- ld [%o3+4],%g1 ! bp[i]
- ld [%o3+0],%o0
- ld [%o4+4],%g5 ! ap[0]
- sllx %g1,32,%g1
- ld [%o4+0],%o1
- sllx %g5,32,%g5
- or %g1,%o0,%o0
- or %g5,%o1,%o1
-
- ldx [$tp],%o2 ! tp[0]
- mulx %o1,%o0,%o0
- addcc %o2,%o0,%o0
- mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
- stx %o0,[%sp+$bias+$frame+0]
-
- ! transfer b[i] to FPU as 4x16-bit values
- ldda [%o4+2]%asi,$ba
- ldda [%o4+0]%asi,$bb
- ldda [%o4+6]%asi,$bc
- ldda [%o4+4]%asi,$bd
-
- ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
- ldda [%sp+$bias+$frame+6]%asi,$na
- fxtod $ba,$ba
- ldda [%sp+$bias+$frame+4]%asi,$nb
- fxtod $bb,$bb
- ldda [%sp+$bias+$frame+2]%asi,$nc
- fxtod $bc,$bc
- ldda [%sp+$bias+$frame+0]%asi,$nd
- fxtod $bd,$bd
- ldd [$ap_l+$j],$alo ! load a[j] in double format
- fxtod $na,$na
- ldd [$ap_h+$j],$ahi
- fxtod $nb,$nb
- ldd [$np_l+$j],$nlo ! load n[j] in double format
- fxtod $nc,$nc
- ldd [$np_h+$j],$nhi
- fxtod $nd,$nd
-
- fmuld $alo,$ba,$aloa
- fmuld $nlo,$na,$nloa
- fmuld $alo,$bb,$alob
- fmuld $nlo,$nb,$nlob
- fmuld $alo,$bc,$aloc
- faddd $aloa,$nloa,$nloa
- fmuld $nlo,$nc,$nloc
- fmuld $alo,$bd,$alod
- faddd $alob,$nlob,$nlob
- fmuld $nlo,$nd,$nlod
- fmuld $ahi,$ba,$ahia
- faddd $aloc,$nloc,$nloc
- fmuld $nhi,$na,$nhia
- fmuld $ahi,$bb,$ahib
- faddd $alod,$nlod,$nlod
- fmuld $nhi,$nb,$nhib
- fmuld $ahi,$bc,$ahic
- faddd $ahia,$nhia,$nhia
- fmuld $nhi,$nc,$nhic
- fmuld $ahi,$bd,$ahid
- faddd $ahib,$nhib,$nhib
- fmuld $nhi,$nd,$nhid
-
- faddd $ahic,$nhic,$dota ! $nhic
- faddd $ahid,$nhid,$dotb ! $nhid
-
- faddd $nloc,$nhia,$nloc
- faddd $nlod,$nhib,$nlod
-
- fdtox $nloa,$nloa
- fdtox $nlob,$nlob
- fdtox $nloc,$nloc
- fdtox $nlod,$nlod
-
- std $nloa,[%sp+$bias+$frame+0]
- std $nlob,[%sp+$bias+$frame+8]
- std $nloc,[%sp+$bias+$frame+16]
- add $j,8,$j
- std $nlod,[%sp+$bias+$frame+24]
-
- ldd [$ap_l+$j],$alo ! load a[j] in double format
- ldd [$ap_h+$j],$ahi
- ldd [$np_l+$j],$nlo ! load n[j] in double format
- ldd [$np_h+$j],$nhi
-
- fmuld $alo,$ba,$aloa
- fmuld $nlo,$na,$nloa
- fmuld $alo,$bb,$alob
- fmuld $nlo,$nb,$nlob
- fmuld $alo,$bc,$aloc
- ldx [%sp+$bias+$frame+0],%o0
- faddd $aloa,$nloa,$nloa
- fmuld $nlo,$nc,$nloc
- ldx [%sp+$bias+$frame+8],%o1
- fmuld $alo,$bd,$alod
- ldx [%sp+$bias+$frame+16],%o2
- faddd $alob,$nlob,$nlob
- fmuld $nlo,$nd,$nlod
- ldx [%sp+$bias+$frame+24],%o3
- fmuld $ahi,$ba,$ahia
-
- srlx %o0,16,%o7
- faddd $aloc,$nloc,$nloc
- fmuld $nhi,$na,$nhia
- add %o7,%o1,%o1
- fmuld $ahi,$bb,$ahib
- srlx %o1,16,%o7
- faddd $alod,$nlod,$nlod
- fmuld $nhi,$nb,$nhib
- add %o7,%o2,%o2
- fmuld $ahi,$bc,$ahic
- srlx %o2,16,%o7
- faddd $ahia,$nhia,$nhia
- fmuld $nhi,$nc,$nhic
- add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
- ! why?
- and %o0,$mask,%o0
- fmuld $ahi,$bd,$ahid
- and %o1,$mask,%o1
- and %o2,$mask,%o2
- faddd $ahib,$nhib,$nhib
- fmuld $nhi,$nd,$nhid
- sllx %o1,16,%o1
- faddd $dota,$nloa,$nloa
- sllx %o2,32,%o2
- faddd $dotb,$nlob,$nlob
- sllx %o3,48,%o7
- or %o1,%o0,%o0
- faddd $ahic,$nhic,$dota ! $nhic
- or %o2,%o0,%o0
- faddd $ahid,$nhid,$dotb ! $nhid
- or %o7,%o0,%o0 ! 64-bit result
- ldx [$tp],%o7
- faddd $nloc,$nhia,$nloc
- addcc %o7,%o0,%o0
- ! end-of-why?
- faddd $nlod,$nhib,$nlod
- srlx %o3,16,%g1 ! 34-bit carry
- fdtox $nloa,$nloa
- bcs,a %xcc,.+8
- add %g1,1,%g1
-
- fdtox $nlob,$nlob
- fdtox $nloc,$nloc
- fdtox $nlod,$nlod
-
- std $nloa,[%sp+$bias+$frame+0]
- std $nlob,[%sp+$bias+$frame+8]
- addcc $j,8,$j
- std $nloc,[%sp+$bias+$frame+16]
- bz,pn %icc,.Linnerskip
- std $nlod,[%sp+$bias+$frame+24]
-
- ba .Linner
- nop
-.align 32
-.Linner:
- ldd [$ap_l+$j],$alo ! load a[j] in double format
- ldd [$ap_h+$j],$ahi
- ldd [$np_l+$j],$nlo ! load n[j] in double format
- ldd [$np_h+$j],$nhi
-
- fmuld $alo,$ba,$aloa
- fmuld $nlo,$na,$nloa
- fmuld $alo,$bb,$alob
- fmuld $nlo,$nb,$nlob
- fmuld $alo,$bc,$aloc
- ldx [%sp+$bias+$frame+0],%o0
- faddd $aloa,$nloa,$nloa
- fmuld $nlo,$nc,$nloc
- ldx [%sp+$bias+$frame+8],%o1
- fmuld $alo,$bd,$alod
- ldx [%sp+$bias+$frame+16],%o2
- faddd $alob,$nlob,$nlob
- fmuld $nlo,$nd,$nlod
- ldx [%sp+$bias+$frame+24],%o3
- fmuld $ahi,$ba,$ahia
-
- srlx %o0,16,%o7
- faddd $aloc,$nloc,$nloc
- fmuld $nhi,$na,$nhia
- add %o7,%o1,%o1
- fmuld $ahi,$bb,$ahib
- srlx %o1,16,%o7
- faddd $alod,$nlod,$nlod
- fmuld $nhi,$nb,$nhib
- add %o7,%o2,%o2
- fmuld $ahi,$bc,$ahic
- srlx %o2,16,%o7
- faddd $ahia,$nhia,$nhia
- fmuld $nhi,$nc,$nhic
- add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
- and %o0,$mask,%o0
- fmuld $ahi,$bd,$ahid
- and %o1,$mask,%o1
- and %o2,$mask,%o2
- faddd $ahib,$nhib,$nhib
- fmuld $nhi,$nd,$nhid
- sllx %o1,16,%o1
- faddd $dota,$nloa,$nloa
- sllx %o2,32,%o2
- faddd $dotb,$nlob,$nlob
- sllx %o3,48,%o7
- or %o1,%o0,%o0
- faddd $ahic,$nhic,$dota ! $nhic
- or %o2,%o0,%o0
- faddd $ahid,$nhid,$dotb ! $nhid
- or %o7,%o0,%o0 ! 64-bit result
- faddd $nloc,$nhia,$nloc
- addcc %g1,%o0,%o0
- ldx [$tp+8],%o7 ! tp[j]
- faddd $nlod,$nhib,$nlod
- srlx %o3,16,%g1 ! 34-bit carry
- fdtox $nloa,$nloa
- bcs,a %xcc,.+8
- add %g1,1,%g1
- fdtox $nlob,$nlob
- addcc %o7,%o0,%o0
- fdtox $nloc,$nloc
- bcs,a %xcc,.+8
- add %g1,1,%g1
-
- stx %o0,[$tp] ! tp[j-1]
- fdtox $nlod,$nlod
-
- std $nloa,[%sp+$bias+$frame+0]
- std $nlob,[%sp+$bias+$frame+8]
- std $nloc,[%sp+$bias+$frame+16]
- addcc $j,8,$j
- std $nlod,[%sp+$bias+$frame+24]
- bnz,pt %icc,.Linner
- add $tp,8,$tp
-
-.Linnerskip:
- fdtox $dota,$dota
- fdtox $dotb,$dotb
-
- ldx [%sp+$bias+$frame+0],%o0
- ldx [%sp+$bias+$frame+8],%o1
- ldx [%sp+$bias+$frame+16],%o2
- ldx [%sp+$bias+$frame+24],%o3
-
- srlx %o0,16,%o7
- std $dota,[%sp+$bias+$frame+32]
- add %o7,%o1,%o1
- std $dotb,[%sp+$bias+$frame+40]
- srlx %o1,16,%o7
- add %o7,%o2,%o2
- srlx %o2,16,%o7
- add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
- and %o0,$mask,%o0
- and %o1,$mask,%o1
- and %o2,$mask,%o2
- sllx %o1,16,%o1
- sllx %o2,32,%o2
- sllx %o3,48,%o7
- or %o1,%o0,%o0
- or %o2,%o0,%o0
- ldx [%sp+$bias+$frame+32],%o4
- or %o7,%o0,%o0 ! 64-bit result
- ldx [%sp+$bias+$frame+40],%o5
- addcc %g1,%o0,%o0
- ldx [$tp+8],%o7 ! tp[j]
- srlx %o3,16,%g1 ! 34-bit carry
- bcs,a %xcc,.+8
- add %g1,1,%g1
-
- addcc %o7,%o0,%o0
- bcs,a %xcc,.+8
- add %g1,1,%g1
-
- stx %o0,[$tp] ! tp[j-1]
- add $tp,8,$tp
-
- srlx %o4,16,%o7
- add %o7,%o5,%o5
- and %o4,$mask,%o4
- sllx %o5,16,%o7
- or %o7,%o4,%o4
- addcc %g1,%o4,%o4
- srlx %o5,48,%g1
- bcs,a %xcc,.+8
- add %g1,1,%g1
-
- addcc $carry,%o4,%o4
- stx %o4,[$tp] ! tp[num-1]
- mov %g1,$carry
- bcs,a %xcc,.+8
- add $carry,1,$carry
-
- addcc $i,8,$i
- bnz %icc,.Louter
- nop
-
- add $tp,8,$tp ! adjust tp to point at the end
- orn %g0,%g0,%g4
- sub %g0,$num,%o7 ! n=-num
- ba .Lsub
- subcc %g0,%g0,%g0 ! clear %icc.c
-
-.align 32
-.Lsub:
- ldx [$tp+%o7],%o0
- add $np,%o7,%g1
- ld [%g1+0],%o2
- ld [%g1+4],%o3
- srlx %o0,32,%o1
- subccc %o0,%o2,%o2
- add $rp,%o7,%g1
- subccc %o1,%o3,%o3
- st %o2,[%g1+0]
- add %o7,8,%o7
- brnz,pt %o7,.Lsub
- st %o3,[%g1+4]
- subc $carry,0,%g4
- sub %g0,$num,%o7 ! n=-num
- ba .Lcopy
- nop
-
-.align 32
-.Lcopy:
- ldx [$tp+%o7],%o0
- add $rp,%o7,%g1
- ld [%g1+0],%o2
- ld [%g1+4],%o3
- stx %g0,[$tp+%o7]
- and %o0,%g4,%o0
- srlx %o0,32,%o1
- andn %o2,%g4,%o2
- andn %o3,%g4,%o3
- or %o2,%o0,%o0
- or %o3,%o1,%o1
- st %o0,[%g1+0]
- add %o7,8,%o7
- brnz,pt %o7,.Lcopy
- st %o1,[%g1+4]
- sub %g0,$num,%o7 ! n=-num
-
-.Lzap:
- stx %g0,[$ap_l+%o7]
- stx %g0,[$ap_h+%o7]
- stx %g0,[$np_l+%o7]
- stx %g0,[$np_h+%o7]
- add %o7,8,%o7
- brnz,pt %o7,.Lzap
- nop
-
- ldx [%sp+$bias+$frame+48],%o7
- wr %g0,%o7,%asi ! restore %asi
-
- mov 1,%i0
-.Lret:
- ret
- restore
-.type $fname,#function
-.size $fname,(.-$fname)
-.asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
-.align 32
-___
-
-$code =~ s/\`([^\`]*)\`/eval($1)/gem;
-
-# Below substitution makes it possible to compile without demanding
-# VIS extentions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
-# dare to do this, because VIS capability is detected at run-time now
-# and this routine is not called on CPU not capable to execute it. Do
-# note that fzeros is not the only VIS dependency! Another dependency
-# is implicit and is just _a_ numerical value loaded to %asi register,
-# which assembler can't recognize as VIS specific...
-$code =~ s/fzeros\s+%f([0-9]+)/
- sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
- /gem;
-
-print $code;
-# flush
-close STDOUT;
diff --git a/crypto/bn/asm/via-mont.pl b/crypto/bn/asm/via-mont.pl
deleted file mode 100755
index c046a514c873..000000000000
--- a/crypto/bn/asm/via-mont.pl
+++ /dev/null
@@ -1,242 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# Wrapper around 'rep montmul', VIA-specific instruction accessing
-# PadLock Montgomery Multiplier. The wrapper is designed as drop-in
-# replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9].
-#
-# Below are interleaved outputs from 'openssl speed rsa dsa' for 4
-# different software configurations on 1.5GHz VIA Esther processor.
-# Lines marked with "software integer" denote performance of hand-
-# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2"
-# refers to hand-coded SSE2 Montgomery multiplication procedure found
-# OpenSSL 0.9.9. "Hardware VIA SDK" refers to padlock_pmm routine from
-# Padlock SDK 2.0.1 available for download from VIA, which naturally
-# utilizes the magic 'repz montmul' instruction. And finally "hardware
-# this" refers to *this* implementation which also uses 'repz montmul'
-#
-# sign verify sign/s verify/s
-# rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer
-# rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2
-# rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK
-# rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this
-#
-# rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer
-# rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2
-# rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK
-# rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this
-#
-# rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer
-# rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2
-# rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK
-# rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this
-#
-# rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer
-# rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2
-# rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK
-# rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this
-#
-# dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer
-# dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2
-# dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK
-# dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this
-#
-# dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer
-# dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2
-# dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK
-# dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this
-#
-# dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer
-# dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2
-# dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK
-# dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this
-#
-# To give you some other reference point here is output for 2.4GHz P4
-# running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software
-# SSE2" in above terms.
-#
-# rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0
-# rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0
-# rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9
-# rsa 4096 bits 0.109770s 0.002379s 9.1 420.3
-# dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1
-# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0
-# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1
-#
-# Conclusions:
-# - VIA SDK leaves a *lot* of room for improvement (which this
-# implementation successfully fills:-);
-# - 'rep montmul' gives up to >3x performance improvement depending on
-# key length;
-# - in terms of absolute performance it delivers approximately as much
-# as modern out-of-order 32-bit cores [again, for longer keys].
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-push(@INC,"${dir}","${dir}../../perlasm");
-require "x86asm.pl";
-
-&asm_init($ARGV[0],"via-mont.pl");
-
-# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
-$func="bn_mul_mont_padlock";
-
-$pad=16*1; # amount of reserved bytes on top of every vector
-
-# stack layout
-$mZeroPrime=&DWP(0,"esp"); # these are specified by VIA
-$A=&DWP(4,"esp");
-$B=&DWP(8,"esp");
-$T=&DWP(12,"esp");
-$M=&DWP(16,"esp");
-$scratch=&DWP(20,"esp");
-$rp=&DWP(24,"esp"); # these are mine
-$sp=&DWP(28,"esp");
-# &DWP(32,"esp") # 32 byte scratch area
-# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num]
-# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num]
-# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num]
-# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num]
-# Note that SDK suggests to unconditionally allocate 2K per vector. This
-# has quite an impact on performance. It naturally depends on key length,
-# but to give an example 1024 bit private RSA key operations suffer >30%
-# penalty. I allocate only as much as actually required...
-
-&function_begin($func);
- &xor ("eax","eax");
- &mov ("ecx",&wparam(5)); # num
- # meet VIA's limitations for num [note that the specification
- # expresses them in bits, while we work with amount of 32-bit words]
- &test ("ecx",3);
- &jnz (&label("leave")); # num % 4 != 0
- &cmp ("ecx",8);
- &jb (&label("leave")); # num < 8
- &cmp ("ecx",1024);
- &ja (&label("leave")); # num > 1024
-
- &pushf ();
- &cld ();
-
- &mov ("edi",&wparam(0)); # rp
- &mov ("eax",&wparam(1)); # ap
- &mov ("ebx",&wparam(2)); # bp
- &mov ("edx",&wparam(3)); # np
- &mov ("esi",&wparam(4)); # n0
- &mov ("esi",&DWP(0,"esi")); # *n0
-
- &lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes
- &lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes
- &neg ("ebp");
- &add ("ebp","esp");
- &and ("ebp",-64); # align to cache-line
- &xchg ("ebp","esp"); # alloca
-
- &mov ($rp,"edi"); # save rp
- &mov ($sp,"ebp"); # save esp
-
- &mov ($mZeroPrime,"esi");
- &lea ("esi",&DWP(64,"esp")); # tp
- &mov ($T,"esi");
- &lea ("edi",&DWP(32,"esp")); # scratch area
- &mov ($scratch,"edi");
- &mov ("esi","eax");
-
- &lea ("ebp",&DWP(-$pad,"ecx"));
- &shr ("ebp",2); # restore original num value in ebp
-
- &xor ("eax","eax");
-
- &mov ("ecx","ebp");
- &lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
- &data_byte(0xf3,0xab); # rep stosl, bzero
-
- &mov ("ecx","ebp");
- &lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
- &mov ($A,"edi");
- &data_byte(0xf3,0xa5); # rep movsl, memcpy
- &mov ("ecx",$pad/4);
- &data_byte(0xf3,0xab); # rep stosl, bzero pad
- # edi points at the end of padded ap copy...
-
- &mov ("ecx","ebp");
- &mov ("esi","ebx");
- &mov ($B,"edi");
- &data_byte(0xf3,0xa5); # rep movsl, memcpy
- &mov ("ecx",$pad/4);
- &data_byte(0xf3,0xab); # rep stosl, bzero pad
- # edi points at the end of padded bp copy...
-
- &mov ("ecx","ebp");
- &mov ("esi","edx");
- &mov ($M,"edi");
- &data_byte(0xf3,0xa5); # rep movsl, memcpy
- &mov ("ecx",$pad/4);
- &data_byte(0xf3,0xab); # rep stosl, bzero pad
- # edi points at the end of padded np copy...
-
- # let magic happen...
- &mov ("ecx","ebp");
- &mov ("esi","esp");
- &shl ("ecx",5); # convert word counter to bit counter
- &align (4);
- &data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
-
- &mov ("ecx","ebp");
- &lea ("esi",&DWP(64,"esp")); # tp
- # edi still points at the end of padded np copy...
- &neg ("ebp");
- &lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind"
- &mov ("edi",$rp); # restore rp
- &xor ("edx","edx"); # i=0 and clear CF
-
-&set_label("sub",8);
- &mov ("eax",&DWP(0,"esi","edx",4));
- &sbb ("eax",&DWP(0,"ebp","edx",4));
- &mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i]
- &lea ("edx",&DWP(1,"edx")); # i++
- &loop (&label("sub")); # doesn't affect CF!
-
- &mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit
- &sbb ("eax",0);
- &and ("esi","eax");
- &not ("eax");
- &mov ("ebp","edi");
- &and ("ebp","eax");
- &or ("esi","ebp"); # tp=carry?tp:rp
-
- &mov ("ecx","edx"); # num
- &xor ("edx","edx"); # i=0
-
-&set_label("copy",8);
- &mov ("eax",&DWP(0,"esi","edx",4));
- &mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp
- &mov (&DWP(0,"edi","edx",4),"eax");
- &lea ("edx",&DWP(1,"edx")); # i++
- &loop (&label("copy"));
-
- &mov ("ebp",$sp);
- &xor ("eax","eax");
-
- &mov ("ecx",64/4);
- &mov ("edi","esp"); # zap frame including scratch area
- &data_byte(0xf3,0xab); # rep stosl, bzero
-
- # zap copies of ap, bp and np
- &lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
- &lea ("ecx",&DWP(3*$pad/4,"edx","edx",2));
- &data_byte(0xf3,0xab); # rep stosl, bzero
-
- &mov ("esp","ebp");
- &inc ("eax"); # signal "done"
- &popf ();
-&set_label("leave");
-&function_end($func);
-
-&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
-
-&asm_finish();
diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl
deleted file mode 100755
index 5cd3cd2ed50a..000000000000
--- a/crypto/bn/asm/x86-mont.pl
+++ /dev/null
@@ -1,591 +0,0 @@
-#!/usr/bin/env perl
-
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# October 2005
-#
-# This is a "teaser" code, as it can be improved in several ways...
-# First of all non-SSE2 path should be implemented (yes, for now it
-# performs Montgomery multiplication/convolution only on SSE2-capable
-# CPUs such as P4, others fall down to original code). Then inner loop
-# can be unrolled and modulo-scheduled to improve ILP and possibly
-# moved to 128-bit XMM register bank (though it would require input
-# rearrangement and/or increase bus bandwidth utilization). Dedicated
-# squaring procedure should give further performance improvement...
-# Yet, for being draft, the code improves rsa512 *sign* benchmark by
-# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
-
-# December 2006
-#
-# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
-# Integer-only code [being equipped with dedicated squaring procedure]
-# gives ~40% on rsa512 sign benchmark...
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-push(@INC,"${dir}","${dir}../../perlasm");
-require "x86asm.pl";
-
-&asm_init($ARGV[0],$0);
-
-$sse2=0;
-for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
-
-&external_label("OPENSSL_ia32cap_P") if ($sse2);
-
-&function_begin("bn_mul_mont");
-
-$i="edx";
-$j="ecx";
-$ap="esi"; $tp="esi"; # overlapping variables!!!
-$rp="edi"; $bp="edi"; # overlapping variables!!!
-$np="ebp";
-$num="ebx";
-
-$_num=&DWP(4*0,"esp"); # stack top layout
-$_rp=&DWP(4*1,"esp");
-$_ap=&DWP(4*2,"esp");
-$_bp=&DWP(4*3,"esp");
-$_np=&DWP(4*4,"esp");
-$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
-$_sp=&DWP(4*6,"esp");
-$_bpend=&DWP(4*7,"esp");
-$frame=32; # size of above frame rounded up to 16n
-
- &xor ("eax","eax");
- &mov ("edi",&wparam(5)); # int num
- &cmp ("edi",4);
- &jl (&label("just_leave"));
-
- &lea ("esi",&wparam(0)); # put aside pointer to argument block
- &lea ("edx",&wparam(1)); # load ap
- &mov ("ebp","esp"); # saved stack pointer!
- &add ("edi",2); # extra two words on top of tp
- &neg ("edi");
- &lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
- &neg ("edi");
-
- # minimize cache contention by arraning 2K window between stack
- # pointer and ap argument [np is also position sensitive vector,
- # but it's assumed to be near ap, as it's allocated at ~same
- # time].
- &mov ("eax","esp");
- &sub ("eax","edx");
- &and ("eax",2047);
- &sub ("esp","eax"); # this aligns sp and ap modulo 2048
-
- &xor ("edx","esp");
- &and ("edx",2048);
- &xor ("edx",2048);
- &sub ("esp","edx"); # this splits them apart modulo 4096
-
- &and ("esp",-64); # align to cache line
-
- ################################# load argument block...
- &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
- &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
- &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
- &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
- &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
- #&mov ("edi",&DWP(5*4,"esi"));# int num
-
- &mov ("esi",&DWP(0,"esi")); # pull n0[0]
- &mov ($_rp,"eax"); # ... save a copy of argument block
- &mov ($_ap,"ebx");
- &mov ($_bp,"ecx");
- &mov ($_np,"edx");
- &mov ($_n0,"esi");
- &lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
- #&mov ($_num,$num); # redundant as $num is not reused
- &mov ($_sp,"ebp"); # saved stack pointer!
-
-if($sse2) {
-$acc0="mm0"; # mmx register bank layout
-$acc1="mm1";
-$car0="mm2";
-$car1="mm3";
-$mul0="mm4";
-$mul1="mm5";
-$temp="mm6";
-$mask="mm7";
-
- &picmeup("eax","OPENSSL_ia32cap_P");
- &bt (&DWP(0,"eax"),26);
- &jnc (&label("non_sse2"));
-
- &mov ("eax",-1);
- &movd ($mask,"eax"); # mask 32 lower bits
-
- &mov ($ap,$_ap); # load input pointers
- &mov ($bp,$_bp);
- &mov ($np,$_np);
-
- &xor ($i,$i); # i=0
- &xor ($j,$j); # j=0
-
- &movd ($mul0,&DWP(0,$bp)); # bp[0]
- &movd ($mul1,&DWP(0,$ap)); # ap[0]
- &movd ($car1,&DWP(0,$np)); # np[0]
-
- &pmuludq($mul1,$mul0); # ap[0]*bp[0]
- &movq ($car0,$mul1);
- &movq ($acc0,$mul1); # I wish movd worked for
- &pand ($acc0,$mask); # inter-register transfers
-
- &pmuludq($mul1,$_n0q); # *=n0
-
- &pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
- &paddq ($car1,$acc0);
-
- &movd ($acc1,&DWP(4,$np)); # np[1]
- &movd ($acc0,&DWP(4,$ap)); # ap[1]
-
- &psrlq ($car0,32);
- &psrlq ($car1,32);
-
- &inc ($j); # j++
-&set_label("1st",16);
- &pmuludq($acc0,$mul0); # ap[j]*bp[0]
- &pmuludq($acc1,$mul1); # np[j]*m1
- &paddq ($car0,$acc0); # +=c0
- &paddq ($car1,$acc1); # +=c1
-
- &movq ($acc0,$car0);
- &pand ($acc0,$mask);
- &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
- &paddq ($car1,$acc0); # +=ap[j]*bp[0];
- &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
- &psrlq ($car0,32);
- &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
- &psrlq ($car1,32);
-
- &lea ($j,&DWP(1,$j));
- &cmp ($j,$num);
- &jl (&label("1st"));
-
- &pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
- &pmuludq($acc1,$mul1); # np[num-1]*m1
- &paddq ($car0,$acc0); # +=c0
- &paddq ($car1,$acc1); # +=c1
-
- &movq ($acc0,$car0);
- &pand ($acc0,$mask);
- &paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
- &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
-
- &psrlq ($car0,32);
- &psrlq ($car1,32);
-
- &paddq ($car1,$car0);
- &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
-
- &inc ($i); # i++
-&set_label("outer");
- &xor ($j,$j); # j=0
-
- &movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
- &movd ($mul1,&DWP(0,$ap)); # ap[0]
- &movd ($temp,&DWP($frame,"esp")); # tp[0]
- &movd ($car1,&DWP(0,$np)); # np[0]
- &pmuludq($mul1,$mul0); # ap[0]*bp[i]
-
- &paddq ($mul1,$temp); # +=tp[0]
- &movq ($acc0,$mul1);
- &movq ($car0,$mul1);
- &pand ($acc0,$mask);
-
- &pmuludq($mul1,$_n0q); # *=n0
-
- &pmuludq($car1,$mul1);
- &paddq ($car1,$acc0);
-
- &movd ($temp,&DWP($frame+4,"esp")); # tp[1]
- &movd ($acc1,&DWP(4,$np)); # np[1]
- &movd ($acc0,&DWP(4,$ap)); # ap[1]
-
- &psrlq ($car0,32);
- &psrlq ($car1,32);
- &paddq ($car0,$temp); # +=tp[1]
-
- &inc ($j); # j++
- &dec ($num);
-&set_label("inner");
- &pmuludq($acc0,$mul0); # ap[j]*bp[i]
- &pmuludq($acc1,$mul1); # np[j]*m1
- &paddq ($car0,$acc0); # +=c0
- &paddq ($car1,$acc1); # +=c1
-
- &movq ($acc0,$car0);
- &movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
- &pand ($acc0,$mask);
- &movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
- &paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
- &movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
- &psrlq ($car0,32);
- &movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
- &psrlq ($car1,32);
- &paddq ($car0,$temp); # +=tp[j+1]
-
- &dec ($num);
- &lea ($j,&DWP(1,$j)); # j++
- &jnz (&label("inner"));
-
- &mov ($num,$j);
- &pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
- &pmuludq($acc1,$mul1); # np[num-1]*m1
- &paddq ($car0,$acc0); # +=c0
- &paddq ($car1,$acc1); # +=c1
-
- &movq ($acc0,$car0);
- &pand ($acc0,$mask);
- &paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
- &movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
- &psrlq ($car0,32);
- &psrlq ($car1,32);
-
- &movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
- &paddq ($car1,$car0);
- &paddq ($car1,$temp);
- &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
-
- &lea ($i,&DWP(1,$i)); # i++
- &cmp ($i,$num);
- &jle (&label("outer"));
-
- &emms (); # done with mmx bank
- &jmp (&label("common_tail"));
-
-&set_label("non_sse2",16);
-}
-
-if (0) {
- &mov ("esp",$_sp);
- &xor ("eax","eax"); # signal "not fast enough [yet]"
- &jmp (&label("just_leave"));
- # While the below code provides competitive performance for
- # all key lengthes on modern Intel cores, it's still more
- # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
- # means compared to the original integer-only assembler.
- # 512-bit RSA sign is better by ~40%, but that's about all
- # one can say about all CPUs...
-} else {
-$inp="esi"; # integer path uses these registers differently
-$word="edi";
-$carry="ebp";
-
- &mov ($inp,$_ap);
- &lea ($carry,&DWP(1,$num));
- &mov ($word,$_bp);
- &xor ($j,$j); # j=0
- &mov ("edx",$inp);
- &and ($carry,1); # see if num is even
- &sub ("edx",$word); # see if ap==bp
- &lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
- &or ($carry,"edx");
- &mov ($word,&DWP(0,$word)); # bp[0]
- &jz (&label("bn_sqr_mont"));
- &mov ($_bpend,"eax");
- &mov ("eax",&DWP(0,$inp));
- &xor ("edx","edx");
-
-&set_label("mull",16);
- &mov ($carry,"edx");
- &mul ($word); # ap[j]*bp[0]
- &add ($carry,"eax");
- &lea ($j,&DWP(1,$j));
- &adc ("edx",0);
- &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
- &cmp ($j,$num);
- &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
- &jl (&label("mull"));
-
- &mov ($carry,"edx");
- &mul ($word); # ap[num-1]*bp[0]
- &mov ($word,$_n0);
- &add ("eax",$carry);
- &mov ($inp,$_np);
- &adc ("edx",0);
- &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
-
- &mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
- &xor ($j,$j);
- &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
- &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
-
- &mov ("eax",&DWP(0,$inp)); # np[0]
- &mul ($word); # np[0]*m
- &add ("eax",&DWP($frame,"esp")); # +=tp[0]
- &mov ("eax",&DWP(4,$inp)); # np[1]
- &adc ("edx",0);
- &inc ($j);
-
- &jmp (&label("2ndmadd"));
-
-&set_label("1stmadd",16);
- &mov ($carry,"edx");
- &mul ($word); # ap[j]*bp[i]
- &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
- &lea ($j,&DWP(1,$j));
- &adc ("edx",0);
- &add ($carry,"eax");
- &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
- &adc ("edx",0);
- &cmp ($j,$num);
- &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
- &jl (&label("1stmadd"));
-
- &mov ($carry,"edx");
- &mul ($word); # ap[num-1]*bp[i]
- &add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
- &mov ($word,$_n0);
- &adc ("edx",0);
- &mov ($inp,$_np);
- &add ($carry,"eax");
- &adc ("edx",0);
- &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
-
- &xor ($j,$j);
- &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
- &mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
- &adc ($j,0);
- &mov ("eax",&DWP(0,$inp)); # np[0]
- &mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
- &mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
-
- &mul ($word); # np[0]*m
- &add ("eax",&DWP($frame,"esp")); # +=tp[0]
- &mov ("eax",&DWP(4,$inp)); # np[1]
- &adc ("edx",0);
- &mov ($j,1);
-
-&set_label("2ndmadd",16);
- &mov ($carry,"edx");
- &mul ($word); # np[j]*m
- &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
- &lea ($j,&DWP(1,$j));
- &adc ("edx",0);
- &add ($carry,"eax");
- &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
- &adc ("edx",0);
- &cmp ($j,$num);
- &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
- &jl (&label("2ndmadd"));
-
- &mov ($carry,"edx");
- &mul ($word); # np[j]*m
- &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
- &adc ("edx",0);
- &add ($carry,"eax");
- &adc ("edx",0);
- &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
-
- &xor ("eax","eax");
- &mov ($j,$_bp); # &bp[i]
- &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
- &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
- &lea ($j,&DWP(4,$j));
- &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
- &cmp ($j,$_bpend);
- &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
- &je (&label("common_tail"));
-
- &mov ($word,&DWP(0,$j)); # bp[i+1]
- &mov ($inp,$_ap);
- &mov ($_bp,$j); # &bp[++i]
- &xor ($j,$j);
- &xor ("edx","edx");
- &mov ("eax",&DWP(0,$inp));
- &jmp (&label("1stmadd"));
-
-&set_label("bn_sqr_mont",16);
-$sbit=$num;
- &mov ($_num,$num);
- &mov ($_bp,$j); # i=0
-
- &mov ("eax",$word); # ap[0]
- &mul ($word); # ap[0]*ap[0]
- &mov (&DWP($frame,"esp"),"eax"); # tp[0]=
- &mov ($sbit,"edx");
- &shr ("edx",1);
- &and ($sbit,1);
- &inc ($j);
-&set_label("sqr",16);
- &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
- &mov ($carry,"edx");
- &mul ($word); # ap[j]*ap[0]
- &add ("eax",$carry);
- &lea ($j,&DWP(1,$j));
- &adc ("edx",0);
- &lea ($carry,&DWP(0,$sbit,"eax",2));
- &shr ("eax",31);
- &cmp ($j,$_num);
- &mov ($sbit,"eax");
- &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
- &jl (&label("sqr"));
-
- &mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
- &mov ($carry,"edx");
- &mul ($word); # ap[num-1]*ap[0]
- &add ("eax",$carry);
- &mov ($word,$_n0);
- &adc ("edx",0);
- &mov ($inp,$_np);
- &lea ($carry,&DWP(0,$sbit,"eax",2));
- &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
- &shr ("eax",31);
- &mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
-
- &lea ($carry,&DWP(0,"eax","edx",2));
- &mov ("eax",&DWP(0,$inp)); # np[0]
- &shr ("edx",31);
- &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
- &mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
-
- &mul ($word); # np[0]*m
- &add ("eax",&DWP($frame,"esp")); # +=tp[0]
- &mov ($num,$j);
- &adc ("edx",0);
- &mov ("eax",&DWP(4,$inp)); # np[1]
- &mov ($j,1);
-
-&set_label("3rdmadd",16);
- &mov ($carry,"edx");
- &mul ($word); # np[j]*m
- &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
- &adc ("edx",0);
- &add ($carry,"eax");
- &mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
- &adc ("edx",0);
- &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
-
- &mov ($carry,"edx");
- &mul ($word); # np[j+1]*m
- &add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
- &lea ($j,&DWP(2,$j));
- &adc ("edx",0);
- &add ($carry,"eax");
- &mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
- &adc ("edx",0);
- &cmp ($j,$num);
- &mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
- &jl (&label("3rdmadd"));
-
- &mov ($carry,"edx");
- &mul ($word); # np[j]*m
- &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
- &adc ("edx",0);
- &add ($carry,"eax");
- &adc ("edx",0);
- &mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
-
- &mov ($j,$_bp); # i
- &xor ("eax","eax");
- &mov ($inp,$_ap);
- &add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
- &adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
- &mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
- &cmp ($j,$num);
- &mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
- &je (&label("common_tail"));
-
- &mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
- &lea ($j,&DWP(1,$j));
- &mov ("eax",$word);
- &mov ($_bp,$j); # ++i
- &mul ($word); # ap[i]*ap[i]
- &add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
- &adc ("edx",0);
- &mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
- &xor ($carry,$carry);
- &cmp ($j,$num);
- &lea ($j,&DWP(1,$j));
- &je (&label("sqrlast"));
-
- &mov ($sbit,"edx"); # zaps $num
- &shr ("edx",1);
- &and ($sbit,1);
-&set_label("sqradd",16);
- &mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
- &mov ($carry,"edx");
- &mul ($word); # ap[j]*ap[i]
- &add ("eax",$carry);
- &lea ($carry,&DWP(0,"eax","eax"));
- &adc ("edx",0);
- &shr ("eax",31);
- &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
- &lea ($j,&DWP(1,$j));
- &adc ("eax",0);
- &add ($carry,$sbit);
- &adc ("eax",0);
- &cmp ($j,$_num);
- &mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
- &mov ($sbit,"eax");
- &jle (&label("sqradd"));
-
- &mov ($carry,"edx");
- &lea ("edx",&DWP(0,$sbit,"edx",2));
- &shr ($carry,31);
-&set_label("sqrlast");
- &mov ($word,$_n0);
- &mov ($inp,$_np);
- &imul ($word,&DWP($frame,"esp")); # n0*tp[0]
-
- &add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
- &mov ("eax",&DWP(0,$inp)); # np[0]
- &adc ($carry,0);
- &mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
- &mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
-
- &mul ($word); # np[0]*m
- &add ("eax",&DWP($frame,"esp")); # +=tp[0]
- &lea ($num,&DWP(-1,$j));
- &adc ("edx",0);
- &mov ($j,1);
- &mov ("eax",&DWP(4,$inp)); # np[1]
-
- &jmp (&label("3rdmadd"));
-}
-
-&set_label("common_tail",16);
- &mov ($np,$_np); # load modulus pointer
- &mov ($rp,$_rp); # load result pointer
- &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
-
- &mov ("eax",&DWP(0,$tp)); # tp[0]
- &mov ($j,$num); # j=num-1
- &xor ($i,$i); # i=0 and clear CF!
-
-&set_label("sub",16);
- &sbb ("eax",&DWP(0,$np,$i,4));
- &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
- &dec ($j); # doesn't affect CF!
- &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
- &lea ($i,&DWP(1,$i)); # i++
- &jge (&label("sub"));
-
- &sbb ("eax",0); # handle upmost overflow bit
- &and ($tp,"eax");
- &not ("eax");
- &mov ($np,$rp);
- &and ($np,"eax");
- &or ($tp,$np); # tp=carry?tp:rp
-
-&set_label("copy",16); # copy or in-place refresh
- &mov ("eax",&DWP(0,$tp,$num,4));
- &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
- &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
- &dec ($num);
- &jge (&label("copy"));
-
- &mov ("esp",$_sp); # pull saved stack pointer
- &mov ("eax",1);
-&set_label("just_leave");
-&function_end("bn_mul_mont");
-
-&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
-
-&asm_finish();
diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c
index f13f52dd853e..2b2bc1ef60a1 100644
--- a/crypto/bn/asm/x86_64-gcc.c
+++ b/crypto/bn/asm/x86_64-gcc.c
@@ -1,3 +1,4 @@
+#include "../bn_lcl.h"
#ifdef __SUNPRO_C
# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
#else
@@ -56,6 +57,9 @@
#define BN_ULONG unsigned long
+#undef mul
+#undef mul_add
+
/*
* "m"(a), "+m"(r) is the way to favor DirectPath -code;
* "g"(0) let the compiler to decide where does it
@@ -97,7 +101,7 @@
: "a"(a) \
: "cc");
-BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
BN_ULONG c1=0;
@@ -121,7 +125,7 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
return(c1);
}
-BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
BN_ULONG c1=0;
@@ -144,7 +148,7 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
return(c1);
}
-void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
+void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
if (n <= 0) return;
@@ -175,7 +179,7 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
return ret;
}
-BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
+BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
{ BN_ULONG ret=0,i=0;
if (n <= 0) return 0;
@@ -198,7 +202,7 @@ BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
}
#ifndef SIMICS
-BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
+BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
{ BN_ULONG ret=0,i=0;
if (n <= 0) return 0;
@@ -485,7 +489,7 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
r[7]=c2;
}
-void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
BN_ULONG t1,t2;
BN_ULONG c1,c2,c3;
@@ -561,7 +565,7 @@ void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
r[15]=c1;
}
-void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
BN_ULONG t1,t2;
BN_ULONG c1,c2,c3;