Diffstat (limited to 'crypto/sha/asm/sha1-x86_64.pl')
-rwxr-xr-x  crypto/sha/asm/sha1-x86_64.pl | 1210
1 file changed, 1008 insertions(+), 202 deletions(-)
diff --git a/crypto/sha/asm/sha1-x86_64.pl b/crypto/sha/asm/sha1-x86_64.pl
index f15c7ec39b20..9bb6b498190f 100755
--- a/crypto/sha/asm/sha1-x86_64.pl
+++ b/crypto/sha/asm/sha1-x86_64.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
#
# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
@@ -49,17 +49,37 @@
#
# Add AVX code path. See sha1-586.pl for further information.
+# May 2013.
+#
+# Add AVX2+BMI code path. The initial attempt (utilizing BMI instructions
+# and loading pairs of consecutive blocks into 256-bit %ymm registers)
+# did not provide an impressive performance improvement until Ilya
+# Albrekht of Intel Corp. provided a crucial hint about how many Xupdate
+# iterations to pre-compute in advance.
+
+# March 2014.
+#
+# Add support for Intel SHA Extensions.
+
######################################################################
# Current performance is summarized in following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
#
-# x86_64 SSSE3 AVX
-# P4 9.8 -
-# Opteron 6.6 -
-# Core2 6.7 6.1/+10% -
-# Atom 11.0 9.7/+13% -
-# Westmere 7.1 5.6/+27% -
-# Sandy Bridge 7.9 6.3/+25% 5.2/+51%
+# x86_64 SSSE3 AVX[2]
+# P4 9.05 -
+# Opteron 6.26 -
+# Core2 6.55 6.05/+8% -
+# Westmere 6.73 5.30/+27% -
+# Sandy Bridge 7.70 6.10/+26% 4.99/+54%
+# Ivy Bridge 6.06 4.67/+30% 4.60/+32%
+# Haswell 5.45 4.15/+31% 3.57/+53%
+# Bulldozer 9.11 5.95/+53%
+# VIA Nano 9.32 7.15/+30%
+# Atom 10.3 9.17/+12%
+# Silvermont 13.1(*) 9.37/+40%
+#
+# (*) obviously suboptimal result, nothing was done about it,
+# because SSSE3 code is compiled unconditionally;
$flavour = shift;
$output = shift;
@@ -72,15 +92,27 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
- =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
- $1>=2.19);
-$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
- `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
- $1>=2.09);
-$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
- `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
- $1>=10);
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+}
+
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([2-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
+$shaext=1; ### set to zero if compiling for 1.0.1
+$avx=1 if (!$shaext && $avx);
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
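
The toolchain probes above leave $avx at 0, 1 or 2, and that tier is what later gates code emission: $avx enables the sha1_block_data_order_avx path and $avx>1 the AVX2 path. A rough sketch of the idiom, with an invented assembler version and assuming the GNU as probe is the one that matched:

    # Illustration only (not part of the generator): the sum of the two
    # boolean comparisons yields the feature tier.
    my $as_version = 2.20;                                  # hypothetical "as -v" result
    my $tier = ($as_version>=2.19) + ($as_version>=2.22);   # 1 here; 2 for as >= 2.22
    # tier 0: SSSE3 only, tier 1: +AVX, tier 2: +AVX2/BMI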
@@ -97,7 +129,7 @@ $num="%r10";
$t0="%eax";
$t1="%ebx";
$t2="%ecx";
-@xi=("%edx","%ebp");
+@xi=("%edx","%ebp","%r14d");
$A="%esi";
$B="%edi";
$C="%r11d";
@@ -112,42 +144,40 @@ my $j=$i+1;
$code.=<<___ if ($i==0);
mov `4*$i`($inp),$xi[0]
bswap $xi[0]
- mov $xi[0],`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
- mov $c,$t0
mov `4*$j`($inp),$xi[1]
+ mov $d,$t0
+ mov $xi[0],`4*$i`(%rsp)
mov $a,$t2
- xor $d,$t0
bswap $xi[1]
+ xor $c,$t0
rol \$5,$t2
- lea 0x5a827999($xi[0],$e),$e
and $b,$t0
- mov $xi[1],`4*$j`(%rsp)
+ lea 0x5a827999($xi[0],$e),$e
add $t2,$e
xor $d,$t0
rol \$30,$b
add $t0,$e
___
$code.=<<___ if ($i>=15);
- mov `4*($j%16)`(%rsp),$xi[1]
- mov $c,$t0
+ xor `4*($j%16)`(%rsp),$xi[1]
+ mov $d,$t0
+ mov $xi[0],`4*($i%16)`(%rsp)
mov $a,$t2
xor `4*(($j+2)%16)`(%rsp),$xi[1]
- xor $d,$t0
+ xor $c,$t0
rol \$5,$t2
xor `4*(($j+8)%16)`(%rsp),$xi[1]
and $b,$t0
lea 0x5a827999($xi[0],$e),$e
- xor `4*(($j+13)%16)`(%rsp),$xi[1]
+ rol \$30,$b
xor $d,$t0
- rol \$1,$xi[1]
add $t2,$e
- rol \$30,$b
- mov $xi[1],`4*($j%16)`(%rsp)
+ rol \$1,$xi[1]
add $t0,$e
___
-unshift(@xi,pop(@xi));
+push(@xi,shift(@xi));
}
sub BODY_20_39 {
@@ -155,62 +185,58 @@ my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
- mov `4*($j%16)`(%rsp),$xi[1]
- mov $c,$t0
+ xor `4*($j%16)`(%rsp),$xi[1]
+ mov $b,$t0
+ `"mov $xi[0],".4*($i%16)."(%rsp)" if ($i<72)`
mov $a,$t2
xor `4*(($j+2)%16)`(%rsp),$xi[1]
- xor $b,$t0
+ xor $d,$t0
rol \$5,$t2
- lea $K($xi[0],$e),$e
xor `4*(($j+8)%16)`(%rsp),$xi[1]
- xor $d,$t0
+ lea $K($xi[0],$e),$e
+ xor $c,$t0
add $t2,$e
- xor `4*(($j+13)%16)`(%rsp),$xi[1]
rol \$30,$b
add $t0,$e
rol \$1,$xi[1]
___
-$code.=<<___ if ($i<76);
- mov $xi[1],`4*($j%16)`(%rsp)
-___
$code.=<<___ if ($i==79);
- mov $c,$t0
+ mov $b,$t0
mov $a,$t2
- xor $b,$t0
+ xor $d,$t0
lea $K($xi[0],$e),$e
rol \$5,$t2
- xor $d,$t0
+ xor $c,$t0
add $t2,$e
rol \$30,$b
add $t0,$e
___
-unshift(@xi,pop(@xi));
+push(@xi,shift(@xi));
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
- mov `4*($j%16)`(%rsp),$xi[1]
- mov $c,$t0
- mov $c,$t1
+ xor `4*($j%16)`(%rsp),$xi[1]
+ mov $d,$t0
+ mov $xi[0],`4*($i%16)`(%rsp)
+ mov $d,$t1
xor `4*(($j+2)%16)`(%rsp),$xi[1]
- and $d,$t0
+ and $c,$t0
mov $a,$t2
xor `4*(($j+8)%16)`(%rsp),$xi[1]
- xor $d,$t1
lea 0x8f1bbcdc($xi[0],$e),$e
+ xor $c,$t1
rol \$5,$t2
- xor `4*(($j+13)%16)`(%rsp),$xi[1]
add $t0,$e
- and $b,$t1
rol \$1,$xi[1]
- add $t1,$e
- rol \$30,$b
- mov $xi[1],`4*($j%16)`(%rsp)
+ and $b,$t1
add $t2,$e
+ rol \$30,$b
+ add $t1,$e
___
-unshift(@xi,pop(@xi));
+push(@xi,shift(@xi));
}
$code.=<<___;
@@ -223,9 +249,19 @@ $code.=<<___;
sha1_block_data_order:
mov OPENSSL_ia32cap_P+0(%rip),%r9d
mov OPENSSL_ia32cap_P+4(%rip),%r8d
+ mov OPENSSL_ia32cap_P+8(%rip),%r10d
test \$`1<<9`,%r8d # check SSSE3 bit
jz .Lialu
___
+$code.=<<___ if ($shaext);
+ test \$`1<<29`,%r10d # check SHA bit
+ jnz _shaext_shortcut
+___
+$code.=<<___ if ($avx>1);
+ and \$`1<<3|1<<5|1<<8`,%r10d # check AVX2+BMI1+BMI2
+ cmp \$`1<<3|1<<5|1<<8`,%r10d
+ je _avx2_shortcut
+___
$code.=<<___ if ($avx);
and \$`1<<28`,%r8d # mask AVX bit
and \$`1<<30`,%r9d # mask "Intel CPU" bit
@@ -238,17 +274,18 @@ $code.=<<___;
.align 16
.Lialu:
+ mov %rsp,%rax
push %rbx
push %rbp
push %r12
push %r13
- mov %rsp,%r11
+ push %r14
mov %rdi,$ctx # reassigned argument
sub \$`8+16*4`,%rsp
mov %rsi,$inp # reassigned argument
and \$-64,%rsp
mov %rdx,$num # reassigned argument
- mov %r11,`16*4`(%rsp)
+ mov %rax,`16*4`(%rsp)
.Lprologue:
mov 0($ctx),$A
@@ -282,53 +319,187 @@ $code.=<<___;
jnz .Lloop
mov `16*4`(%rsp),%rsi
- mov (%rsi),%r13
- mov 8(%rsi),%r12
- mov 16(%rsi),%rbp
- mov 24(%rsi),%rbx
- lea 32(%rsi),%rsp
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lepilogue:
ret
.size sha1_block_data_order,.-sha1_block_data_order
___
+if ($shaext) {{{
+######################################################################
+# Intel SHA Extensions implementation of SHA1 update function.
+#
+my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
+my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
+my @MSG=map("%xmm$_",(4..7));
+
+$code.=<<___;
+.type sha1_block_data_order_shaext,\@function,3
+.align 32
+sha1_block_data_order_shaext:
+_shaext_shortcut:
+___
+$code.=<<___ if ($win64);
+ lea `-8-4*16`(%rsp),%rsp
+ movaps %xmm6,-8-4*16(%rax)
+ movaps %xmm7,-8-3*16(%rax)
+ movaps %xmm8,-8-2*16(%rax)
+ movaps %xmm9,-8-1*16(%rax)
+.Lprologue_shaext:
+___
+$code.=<<___;
+ movdqu ($ctx),$ABCD
+ movd 16($ctx),$E
+ movdqa K_XX_XX+0xa0(%rip),$BSWAP # byte-n-word swap
+
+ movdqu ($inp),@MSG[0]
+ pshufd \$0b00011011,$ABCD,$ABCD # flip word order
+ movdqu 0x10($inp),@MSG[1]
+ pshufd \$0b00011011,$E,$E # flip word order
+ movdqu 0x20($inp),@MSG[2]
+ pshufb $BSWAP,@MSG[0]
+ movdqu 0x30($inp),@MSG[3]
+ pshufb $BSWAP,@MSG[1]
+ pshufb $BSWAP,@MSG[2]
+ movdqa $E,$E_SAVE # offload $E
+ pshufb $BSWAP,@MSG[3]
+ jmp .Loop_shaext
+
+.align 16
+.Loop_shaext:
+ dec $num
+ lea 0x40($inp),%rax # next input block
+ paddd @MSG[0],$E
+ cmovne %rax,$inp
+ movdqa $ABCD,$ABCD_SAVE # offload $ABCD
+___
+for($i=0;$i<20-4;$i+=2) {
+$code.=<<___;
+ sha1msg1 @MSG[1],@MSG[0]
+ movdqa $ABCD,$E_
+ sha1rnds4 \$`int($i/5)`,$E,$ABCD # 0-3...
+ sha1nexte @MSG[1],$E_
+ pxor @MSG[2],@MSG[0]
+ sha1msg1 @MSG[2],@MSG[1]
+ sha1msg2 @MSG[3],@MSG[0]
+
+ movdqa $ABCD,$E
+ sha1rnds4 \$`int(($i+1)/5)`,$E_,$ABCD
+ sha1nexte @MSG[2],$E
+ pxor @MSG[3],@MSG[1]
+ sha1msg2 @MSG[0],@MSG[1]
+___
+ push(@MSG,shift(@MSG)); push(@MSG,shift(@MSG));
+}
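
Each sha1rnds4 above performs four SHA-1 rounds, with the immediate selecting the round-constant/boolean-function group (0 for rounds 0-19, 1 for 20-39, 2 for 40-59, 3 for 60-79), while sha1msg1/sha1msg2 advance the message schedule and sha1nexte folds in the rotated E. The loop covers rounds 0-63; a throw-away Perl check of the immediates it emits (illustrative, not part of the generator):

    for (my $i=0; $i<20-4; $i+=2) {
        printf "rounds %2d-%2d: imm %d   rounds %2d-%2d: imm %d\n",
               4*$i, 4*$i+3, int($i/5), 4*($i+1), 4*($i+1)+3, int(($i+1)/5);
    }
    # imm 0 covers rounds 0-19, 1 covers 20-39, 2 covers 40-59, 3 starts at 60;
    # the straight-line code that follows finishes rounds 64-79 with imm 3.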
+$code.=<<___;
+ movdqu ($inp),@MSG[0]
+ movdqa $ABCD,$E_
+ sha1rnds4 \$3,$E,$ABCD # 64-67
+ sha1nexte @MSG[1],$E_
+ movdqu 0x10($inp),@MSG[1]
+ pshufb $BSWAP,@MSG[0]
+
+ movdqa $ABCD,$E
+ sha1rnds4 \$3,$E_,$ABCD # 68-71
+ sha1nexte @MSG[2],$E
+ movdqu 0x20($inp),@MSG[2]
+ pshufb $BSWAP,@MSG[1]
+
+ movdqa $ABCD,$E_
+ sha1rnds4 \$3,$E,$ABCD # 72-75
+ sha1nexte @MSG[3],$E_
+ movdqu 0x30($inp),@MSG[3]
+ pshufb $BSWAP,@MSG[2]
+
+ movdqa $ABCD,$E
+ sha1rnds4 \$3,$E_,$ABCD # 76-79
+ sha1nexte $E_SAVE,$E
+ pshufb $BSWAP,@MSG[3]
+
+ paddd $ABCD_SAVE,$ABCD
+ movdqa $E,$E_SAVE # offload $E
+
+ jnz .Loop_shaext
+
+ pshufd \$0b00011011,$ABCD,$ABCD
+ pshufd \$0b00011011,$E,$E
+ movdqu $ABCD,($ctx)
+ movd $E,16($ctx)
+___
+$code.=<<___ if ($win64);
+ movaps -8-4*16(%rax),%xmm6
+ movaps -8-3*16(%rax),%xmm7
+ movaps -8-2*16(%rax),%xmm8
+ movaps -8-1*16(%rax),%xmm9
+ mov %rax,%rsp
+.Lepilogue_shaext:
+___
+$code.=<<___;
+ ret
+.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
+___
+}}}
{{{
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
+my $Kx="%xmm11";
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
my @T=("%esi","%edi");
my $j=0;
+my $rx=0;
my $K_XX_XX="%r11";
my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };
+{ my $sn;
+sub align32() {
+ ++$sn;
+$code.=<<___;
+ jmp .Lalign32_$sn # see "Decoded ICache" in manual
+.align 32
+.Lalign32_$sn:
+___
+}
+}
+
$code.=<<___;
.type sha1_block_data_order_ssse3,\@function,3
.align 16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
+ mov %rsp,%rax
push %rbx
push %rbp
push %r12
- lea `-64-($win64?5*16:0)`(%rsp),%rsp
+ push %r13 # redundant, done to share Win64 SE handler
+ push %r14
+ lea `-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
- movaps %xmm6,64+0(%rsp)
- movaps %xmm7,64+16(%rsp)
- movaps %xmm8,64+32(%rsp)
- movaps %xmm9,64+48(%rsp)
- movaps %xmm10,64+64(%rsp)
+ movaps %xmm6,-40-6*16(%rax)
+ movaps %xmm7,-40-5*16(%rax)
+ movaps %xmm8,-40-4*16(%rax)
+ movaps %xmm9,-40-3*16(%rax)
+ movaps %xmm10,-40-2*16(%rax)
+ movaps %xmm11,-40-1*16(%rax)
.Lprologue_ssse3:
___
$code.=<<___;
+ mov %rax,%r14 # original %rsp
+ and \$-64,%rsp
mov %rdi,$ctx # reassigned argument
mov %rsi,$inp # reassigned argument
mov %rdx,$num # reassigned argument
shl \$6,$num
add $inp,$num
- lea K_XX_XX(%rip),$K_XX_XX
+ lea K_XX_XX+64(%rip),$K_XX_XX
mov 0($ctx),$A # load context
mov 4($ctx),$B
@@ -336,19 +507,22 @@ $code.=<<___;
mov 12($ctx),$D
mov $B,@T[0] # magic seed
mov 16($ctx),$E
+ mov $C,@T[1]
+ xor $D,@T[1]
+ and @T[1],@T[0]
movdqa 64($K_XX_XX),@X[2] # pbswap mask
- movdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ movdqa -64($K_XX_XX),@Tx[1] # K_00_19
movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
movdqu 16($inp),@X[-3&7]
movdqu 32($inp),@X[-2&7]
movdqu 48($inp),@X[-1&7]
pshufb @X[2],@X[-4&7] # byte swap
- add \$64,$inp
pshufb @X[2],@X[-3&7]
pshufb @X[2],@X[-2&7]
- pshufb @X[2],@X[-1&7]
+ add \$64,$inp
paddd @Tx[1],@X[-4&7] # add K_00_19
+ pshufb @X[2],@X[-1&7]
paddd @Tx[1],@X[-3&7]
paddd @Tx[1],@X[-2&7]
movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
@@ -373,61 +547,61 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
my ($a,$b,$c,$d,$e);
- &movdqa (@X[0],@X[-3&7]);
- eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
eval(shift(@insns));
&movdqa (@Tx[0],@X[-1&7]);
- &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ &paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
- &paddd (@Tx[1],@X[-1&7]);
+ &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
eval(shift(@insns));
+ eval(shift(@insns)); # rol
eval(shift(@insns));
&psrldq (@Tx[0],4); # "X[-3]", 3 dwords
eval(shift(@insns));
eval(shift(@insns));
+
&pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
eval(shift(@insns));
- eval(shift(@insns));
-
+ eval(shift(@insns)); # ror
&pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
- eval(shift(@insns));
&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
eval(shift(@insns));
- eval(shift(@insns));
+ eval(shift(@insns)); # rol
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
&movdqa (@Tx[2],@X[0]);
- &movdqa (@Tx[0],@X[0]);
- eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &movdqa (@Tx[0],@X[0]);
eval(shift(@insns));
&pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
&paddd (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
&psrld (@Tx[0],31);
eval(shift(@insns));
+ eval(shift(@insns)); # rol
eval(shift(@insns));
&movdqa (@Tx[1],@Tx[2]);
eval(shift(@insns));
eval(shift(@insns));
&psrld (@Tx[2],30);
- &por (@X[0],@Tx[0]); # "X[0]"<<<=1
eval(shift(@insns));
+ eval(shift(@insns)); # ror
+ &por (@X[0],@Tx[0]); # "X[0]"<<<=1
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
@@ -435,12 +609,13 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
&pslld (@Tx[1],2);
&pxor (@X[0],@Tx[2]);
eval(shift(@insns));
- eval(shift(@insns));
- &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ &movdqa (@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)"); # K_XX_XX
+ eval(shift(@insns)); # rol
eval(shift(@insns));
eval(shift(@insns));
&pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
+ &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
foreach (@insns) { eval; } # remaining instructions [if any]
@@ -451,27 +626,30 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
sub Xupdate_ssse3_32_79()
{ use integer;
my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
my ($a,$b,$c,$d,$e);
- &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
- eval(shift(@insns)); # body_20_39
+ eval(shift(@insns)) if ($Xi==8);
&pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
- &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
+ eval(shift(@insns)) if ($Xi==8);
+ eval(shift(@insns)); # body_20_39
eval(shift(@insns));
+ eval(shift(@insns)) if (@insns[1] =~ /_ror/);
+ eval(shift(@insns)) if (@insns[0] =~ /_ror/);
+ &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
eval(shift(@insns));
eval(shift(@insns)); # rol
&pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
eval(shift(@insns));
- eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
+ eval(shift(@insns));
if ($Xi%5) {
&movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
} else { # ... or load next one
- &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+ &movdqa (@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
}
- &paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns)); # ror
+ &paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns));
&pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
@@ -479,29 +657,31 @@ sub Xupdate_ssse3_32_79()
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # rol
+ eval(shift(@insns)) if (@insns[0] =~ /_ror/);
&movdqa (@Tx[0],@X[0]);
- &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
+ &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
eval(shift(@insns)); # ror
eval(shift(@insns));
+ eval(shift(@insns)); # body_20_39
&pslld (@X[0],2);
- eval(shift(@insns)); # body_20_39
eval(shift(@insns));
- &psrld (@Tx[0],30);
eval(shift(@insns));
- eval(shift(@insns)); # rol
+ &psrld (@Tx[0],30);
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns)); # ror
- eval(shift(@insns));
&por (@X[0],@Tx[0]); # "X[0]"<<<=2
- eval(shift(@insns)); # body_20_39
eval(shift(@insns));
- &movdqa (@Tx[1],@X[0]) if ($Xi<19);
+ eval(shift(@insns)); # body_20_39
+ eval(shift(@insns)) if (@insns[1] =~ /_rol/);
+ eval(shift(@insns)) if (@insns[0] =~ /_rol/);
+ &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0])
eval(shift(@insns));
eval(shift(@insns)); # rol
eval(shift(@insns));
@@ -522,10 +702,11 @@ sub Xuplast_ssse3_80()
my ($a,$b,$c,$d,$e);
eval(shift(@insns));
- &paddd (@Tx[1],@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
+ &paddd (@Tx[1],@X[-1&7]);
+ eval(shift(@insns));
eval(shift(@insns));
&movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
@@ -538,7 +719,7 @@ sub Xuplast_ssse3_80()
unshift(@Tx,pop(@Tx));
&movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
- &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
+ &movdqa (@Tx[1],"-64($K_XX_XX)"); # K_00_19
&movdqu (@X[-4&7],"0($inp)"); # load input
&movdqu (@X[-3&7],"16($inp)");
&movdqu (@X[-2&7],"32($inp)");
@@ -557,9 +738,12 @@ sub Xloop_ssse3()
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
&pshufb (@X[($Xi-3)&7],@X[2]);
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
&paddd (@X[($Xi-4)&7],@Tx[1]);
eval(shift(@insns));
eval(shift(@insns));
@@ -568,6 +752,8 @@ sub Xloop_ssse3()
&movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
eval(shift(@insns));
eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
&psubd (@X[($Xi-4)&7],@Tx[1]);
foreach (@insns) { eval; }
@@ -583,51 +769,66 @@ sub Xtail_ssse3()
foreach (@insns) { eval; }
}
-sub body_00_19 () {
+sub body_00_19 () { # ((c^d)&b)^d
+ # on start @T[0]=(c^d)&b
+ return &body_20_39() if ($rx==19); $rx++;
(
'($a,$b,$c,$d,$e)=@V;'.
- '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
- '&xor ($c,$d);',
- '&mov (@T[1],$a);', # $b in next round
- '&$_rol ($a,5);',
- '&and (@T[0],$c);', # ($b&($c^$d))
- '&xor ($c,$d);', # restore $c
- '&xor (@T[0],$d);',
- '&add ($e,$a);',
- '&$_ror ($b,$j?7:2);', # $b>>>2
- '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ '&$_ror ($b,$j?7:2)', # $b>>>2
+ '&xor (@T[0],$d)',
+ '&mov (@T[1],$a)', # $b for next round
+
+ '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
+ '&xor ($b,$c)', # $c^$d for next round
+
+ '&$_rol ($a,5)',
+ '&add ($e,@T[0])',
+ '&and (@T[1],$b)', # ($b&($c^$d)) for next round
+
+ '&xor ($b,$c)', # restore $b
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
}
-sub body_20_39 () {
+sub body_20_39 () { # b^d^c
+ # on entry @T[0]=b^d
+ return &body_40_59() if ($rx==39); $rx++;
(
'($a,$b,$c,$d,$e)=@V;'.
- '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
- '&xor (@T[0],$d);', # ($b^$d)
- '&mov (@T[1],$a);', # $b in next round
- '&$_rol ($a,5);',
- '&xor (@T[0],$c);', # ($b^$d^$c)
- '&add ($e,$a);',
- '&$_ror ($b,7);', # $b>>>2
- '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
+ '&xor (@T[0],$d) if($j==19);'.
+ '&xor (@T[0],$c) if($j> 19)', # ($b^$d^$c)
+ '&mov (@T[1],$a)', # $b for next round
+
+ '&$_rol ($a,5)',
+ '&add ($e,@T[0])',
+ '&xor (@T[1],$c) if ($j< 79)', # $b^$d for next round
+
+ '&$_ror ($b,7)', # $b>>>2
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
}
-sub body_40_59 () {
+sub body_40_59 () { # ((b^c)&(c^d))^c
+ # on entry @T[0]=(b^c), (c^=d)
+ $rx++;
(
'($a,$b,$c,$d,$e)=@V;'.
- '&mov (@T[1],$c);',
- '&xor ($c,$d);',
- '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
- '&and (@T[1],$d);',
- '&and (@T[0],$c);', # ($b&($c^$d))
- '&$_ror ($b,7);', # $b>>>2
- '&add ($e,@T[1]);',
- '&mov (@T[1],$a);', # $b in next round
- '&$_rol ($a,5);',
- '&add ($e,@T[0]);',
- '&xor ($c,$d);', # restore $c
- '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+ '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
+ '&and (@T[0],$c) if ($j>=40)', # (b^c)&(c^d)
+ '&xor ($c,$d) if ($j>=40)', # restore $c
+
+ '&$_ror ($b,7)', # $b>>>2
+ '&mov (@T[1],$a)', # $b for next round
+ '&xor (@T[0],$c)',
+
+ '&$_rol ($a,5)',
+ '&add ($e,@T[0])',
+ '&xor (@T[1],$c) if ($j==59);'.
+ '&xor (@T[1],$b) if ($j< 59)', # b^c for next round
+
+ '&xor ($b,$c) if ($j< 59)', # c^d for next round
+ '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
}
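
The comments on body_00_19, body_20_39 and body_40_59 name the rewritten round functions: Ch(b,c,d) = (b&c)|(~b&d) is computed as ((c^d)&b)^d, the parity rounds use b^c^d, and Maj(b,c,d) is computed as ((b^c)&(c^d))^c, so that the b^c and c^d terms can be carried into the next round. A stand-alone Perl check of the two non-trivial identities (illustration only, not part of the generator):

    for my $b (0,1) { for my $c (0,1) { for my $d (0,1) {
        my $ch  = ($b & $c) | ((1-$b) & $d);                      # textbook Ch
        my $maj = ($b & $c) | ($b & $d) | ($c & $d);              # textbook Maj
        die "Ch mismatch"  unless ((($c ^ $d) & $b) ^ $d) == $ch;          # body_00_19 form
        die "Maj mismatch" unless ((($b ^ $c) & ($c ^ $d)) ^ $c) == $maj;  # body_40_59 form
    }}}
    print "identities hold for all 8 bit patterns\n";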
$code.=<<___;
@@ -668,8 +869,11 @@ $code.=<<___;
mov @T[0],4($ctx)
mov @T[0],$B # magic seed
mov $C,8($ctx)
+ mov $C,@T[1]
mov $D,12($ctx)
+ xor $D,@T[1]
mov $E,16($ctx)
+ and @T[1],@T[0]
jmp .Loop_ssse3
.align 16
@@ -694,31 +898,34 @@ $code.=<<___;
mov $E,16($ctx)
___
$code.=<<___ if ($win64);
- movaps 64+0(%rsp),%xmm6
- movaps 64+16(%rsp),%xmm7
- movaps 64+32(%rsp),%xmm8
- movaps 64+48(%rsp),%xmm9
- movaps 64+64(%rsp),%xmm10
+ movaps -40-6*16(%r14),%xmm6
+ movaps -40-5*16(%r14),%xmm7
+ movaps -40-4*16(%r14),%xmm8
+ movaps -40-3*16(%r14),%xmm9
+ movaps -40-2*16(%r14),%xmm10
+ movaps -40-1*16(%r14),%xmm11
___
$code.=<<___;
- lea `64+($win64?5*16:0)`(%rsp),%rsi
- mov 0(%rsi),%r12
- mov 8(%rsi),%rbp
- mov 16(%rsi),%rbx
- lea 24(%rsi),%rsp
+ lea (%r14),%rsi
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lepilogue_ssse3:
ret
.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___
if ($avx) {
-my $Xi=4;
-my @X=map("%xmm$_",(4..7,0..3));
-my @Tx=map("%xmm$_",(8..10));
-my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
-my @T=("%esi","%edi");
-my $j=0;
-my $K_XX_XX="%r11";
+$Xi=4; # reset variables
+@X=map("%xmm$_",(4..7,0..3));
+@Tx=map("%xmm$_",(8..10));
+$j=0;
+$rx=0;
+
+my $done_avx_label=".Ldone_avx";
my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };
@@ -728,28 +935,34 @@ $code.=<<___;
.align 16
sha1_block_data_order_avx:
_avx_shortcut:
+ mov %rsp,%rax
push %rbx
push %rbp
push %r12
- lea `-64-($win64?5*16:0)`(%rsp),%rsp
+ push %r13 # redundant, done to share Win64 SE handler
+ push %r14
+ lea `-64-($win64?6*16:0)`(%rsp),%rsp
+ vzeroupper
___
$code.=<<___ if ($win64);
- movaps %xmm6,64+0(%rsp)
- movaps %xmm7,64+16(%rsp)
- movaps %xmm8,64+32(%rsp)
- movaps %xmm9,64+48(%rsp)
- movaps %xmm10,64+64(%rsp)
+ vmovaps %xmm6,-40-6*16(%rax)
+ vmovaps %xmm7,-40-5*16(%rax)
+ vmovaps %xmm8,-40-4*16(%rax)
+ vmovaps %xmm9,-40-3*16(%rax)
+ vmovaps %xmm10,-40-2*16(%rax)
+ vmovaps %xmm11,-40-1*16(%rax)
.Lprologue_avx:
___
$code.=<<___;
+ mov %rax,%r14 # original %rsp
+ and \$-64,%rsp
mov %rdi,$ctx # reassigned argument
mov %rsi,$inp # reassigned argument
mov %rdx,$num # reassigned argument
- vzeroupper
shl \$6,$num
add $inp,$num
- lea K_XX_XX(%rip),$K_XX_XX
+ lea K_XX_XX+64(%rip),$K_XX_XX
mov 0($ctx),$A # load context
mov 4($ctx),$B
@@ -757,9 +970,12 @@ $code.=<<___;
mov 12($ctx),$D
mov $B,@T[0] # magic seed
mov 16($ctx),$E
+ mov $C,@T[1]
+ xor $D,@T[1]
+ and @T[1],@T[0]
vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
- vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19
+ vmovdqa -64($K_XX_XX),$Kx # K_00_19
vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
vmovdqu 16($inp),@X[-3&7]
vmovdqu 32($inp),@X[-2&7]
@@ -769,9 +985,9 @@ $code.=<<___;
vpshufb @X[2],@X[-3&7],@X[-3&7]
vpshufb @X[2],@X[-2&7],@X[-2&7]
vpshufb @X[2],@X[-1&7],@X[-1&7]
- vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19
- vpaddd @Tx[1],@X[-3&7],@X[1]
- vpaddd @Tx[1],@X[-2&7],@X[2]
+ vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
+ vpaddd $Kx,@X[-3&7],@X[1]
+ vpaddd $Kx,@X[-2&7],@X[2]
vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
vmovdqa @X[1],16(%rsp)
vmovdqa @X[2],32(%rsp)
@@ -790,10 +1006,10 @@ sub Xupdate_avx_16_31() # recall that $Xi starts with 4
eval(shift(@insns));
eval(shift(@insns));
- &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ &vpaddd (@Tx[1],$Kx,@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
- &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
+ &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
eval(shift(@insns));
eval(shift(@insns));
&vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
@@ -843,7 +1059,7 @@ sub Xupdate_avx_16_31() # recall that $Xi starts with 4
&vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
eval(shift(@insns));
eval(shift(@insns));
- &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
+ &vmovdqa ($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
eval(shift(@insns));
eval(shift(@insns));
@@ -851,13 +1067,12 @@ sub Xupdate_avx_16_31() # recall that $Xi starts with 4
foreach (@insns) { eval; } # remaining instructions [if any]
$Xi++; push(@X,shift(@X)); # "rotate" X[]
- push(@Tx,shift(@Tx));
}
sub Xupdate_avx_32_79()
{ use integer;
my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
+ my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
my ($a,$b,$c,$d,$e);
&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
@@ -870,12 +1085,8 @@ sub Xupdate_avx_32_79()
&vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
eval(shift(@insns));
eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
- if ($Xi%5) {
- &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
- } else { # ... or load next one
- &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
- }
- &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ &vpaddd (@Tx[1],$Kx,@X[-1&7]);
+ &vmovdqa ($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0);
eval(shift(@insns)); # ror
eval(shift(@insns));
@@ -905,7 +1116,6 @@ sub Xupdate_avx_32_79()
&vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
eval(shift(@insns)); # body_20_39
eval(shift(@insns));
- &vmovdqa (@Tx[1],@X[0]) if ($Xi<19);
eval(shift(@insns));
eval(shift(@insns)); # rol
eval(shift(@insns));
@@ -916,7 +1126,6 @@ sub Xupdate_avx_32_79()
foreach (@insns) { eval; } # remaining instructions
$Xi++; push(@X,shift(@X)); # "rotate" X[]
- push(@Tx,shift(@Tx));
}
sub Xuplast_avx_80()
@@ -926,23 +1135,21 @@ sub Xuplast_avx_80()
my ($a,$b,$c,$d,$e);
eval(shift(@insns));
- &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
+ &vpaddd (@Tx[1],$Kx,@X[-1&7]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
- &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
+ &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
foreach (@insns) { eval; } # remaining instructions
&cmp ($inp,$num);
- &je (".Ldone_avx");
-
- unshift(@Tx,pop(@Tx));
+ &je ($done_avx_label);
&vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
- &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
+ &vmovdqa($Kx,"-64($K_XX_XX)"); # K_00_19
&vmovdqu(@X[-4&7],"0($inp)"); # load input
&vmovdqu(@X[-3&7],"16($inp)");
&vmovdqu(@X[-2&7],"32($inp)");
@@ -964,7 +1171,7 @@ sub Xloop_avx()
&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
eval(shift(@insns));
eval(shift(@insns));
- &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
+ &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],$Kx);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
@@ -1024,12 +1231,15 @@ $code.=<<___;
mov @T[0],4($ctx)
mov @T[0],$B # magic seed
mov $C,8($ctx)
+ mov $C,@T[1]
mov $D,12($ctx)
+ xor $D,@T[1]
mov $E,16($ctx)
+ and @T[1],@T[0]
jmp .Loop_avx
.align 16
-.Ldone_avx:
+$done_avx_label:
___
$j=$saved_j; @V=@saved_V;
@@ -1052,31 +1262,520 @@ $code.=<<___;
mov $E,16($ctx)
___
$code.=<<___ if ($win64);
- movaps 64+0(%rsp),%xmm6
- movaps 64+16(%rsp),%xmm7
- movaps 64+32(%rsp),%xmm8
- movaps 64+48(%rsp),%xmm9
- movaps 64+64(%rsp),%xmm10
+ movaps -40-6*16(%r14),%xmm6
+ movaps -40-5*16(%r14),%xmm7
+ movaps -40-4*16(%r14),%xmm8
+ movaps -40-3*16(%r14),%xmm9
+ movaps -40-2*16(%r14),%xmm10
+ movaps -40-1*16(%r14),%xmm11
___
$code.=<<___;
- lea `64+($win64?5*16:0)`(%rsp),%rsi
- mov 0(%rsi),%r12
- mov 8(%rsi),%rbp
- mov 16(%rsi),%rbx
- lea 24(%rsi),%rsp
+ lea (%r14),%rsi
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lepilogue_avx:
ret
.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
___
+
+if ($avx>1) {
+use integer;
+$Xi=4; # reset variables
+@X=map("%ymm$_",(4..7,0..3));
+@Tx=map("%ymm$_",(8..10));
+$Kx="%ymm11";
+$j=0;
+
+my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
+my ($a5,$t0)=("%r12d","%edi");
+
+my ($A,$F,$B,$C,$D,$E)=@ROTX;
+my $rx=0;
+my $frame="%r13";
+
+$code.=<<___;
+.type sha1_block_data_order_avx2,\@function,3
+.align 16
+sha1_block_data_order_avx2:
+_avx2_shortcut:
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ lea -6*16(%rsp),%rsp
+ vmovaps %xmm6,-40-6*16(%rax)
+ vmovaps %xmm7,-40-5*16(%rax)
+ vmovaps %xmm8,-40-4*16(%rax)
+ vmovaps %xmm9,-40-3*16(%rax)
+ vmovaps %xmm10,-40-2*16(%rax)
+ vmovaps %xmm11,-40-1*16(%rax)
+.Lprologue_avx2:
+___
+$code.=<<___;
+ mov %rax,%r14 # original %rsp
+ mov %rdi,$ctx # reassigned argument
+ mov %rsi,$inp # reassigned argument
+ mov %rdx,$num # reassigned argument
+
+ lea -640(%rsp),%rsp
+ shl \$6,$num
+ lea 64($inp),$frame
+ and \$-128,%rsp
+ add $inp,$num
+ lea K_XX_XX+64(%rip),$K_XX_XX
+
+ mov 0($ctx),$A # load context
+ cmp $num,$frame
+ cmovae $inp,$frame # next or same block
+ mov 4($ctx),$F
+ mov 8($ctx),$C
+ mov 12($ctx),$D
+ mov 16($ctx),$E
+ vmovdqu 64($K_XX_XX),@X[2] # pbswap mask
+
+ vmovdqu ($inp),%xmm0
+ vmovdqu 16($inp),%xmm1
+ vmovdqu 32($inp),%xmm2
+ vmovdqu 48($inp),%xmm3
+ lea 64($inp),$inp
+ vinserti128 \$1,($frame),@X[-4&7],@X[-4&7]
+ vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7]
+ vpshufb @X[2],@X[-4&7],@X[-4&7]
+ vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7]
+ vpshufb @X[2],@X[-3&7],@X[-3&7]
+ vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7]
+ vpshufb @X[2],@X[-2&7],@X[-2&7]
+ vmovdqu -64($K_XX_XX),$Kx # K_00_19
+ vpshufb @X[2],@X[-1&7],@X[-1&7]
+
+ vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
+ vpaddd $Kx,@X[-3&7],@X[1]
+ vmovdqu @X[0],0(%rsp) # X[]+K xfer to IALU
+ vpaddd $Kx,@X[-2&7],@X[2]
+ vmovdqu @X[1],32(%rsp)
+ vpaddd $Kx,@X[-1&7],@X[3]
+ vmovdqu @X[2],64(%rsp)
+ vmovdqu @X[3],96(%rsp)
+___
+for (;$Xi<8;$Xi++) { # Xupdate_avx2_16_31
+ use integer;
+
+ &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ &vpsrld (@Tx[0],@X[0],31);
+ &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
+ &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
+ &vpaddd (@X[0],@X[0],@X[0]);
+ &vpsrld (@Tx[1],@Tx[2],30);
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
+ &vpslld (@Tx[2],@Tx[2],2);
+ &vpxor (@X[0],@X[0],@Tx[1]);
+ &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
+ &vpaddd (@Tx[1],@X[0],$Kx);
+ &vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
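
The vector sequence in the loop above implements the standard SHA-1 message schedule, W[t] = (W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]) <<< 1, producing four schedule words per 128-bit lane at a time; since W[t-3] of the last word in each group is not yet available, that slot is computed as zero and fixed up afterwards (the vpslldq/vpsrld/vpslld/vpxor steps). For reference, the scalar recurrence in Perl (illustrative sketch, @W assumed to hold the block's words):

    # @W[0..15]: the sixteen 32-bit words of the block, already byte-swapped to big-endian
    sub rol1 { my $x = shift; (($x << 1) | ($x >> 31)) & 0xffffffff }
    for my $t (16 .. 79) {
        $W[$t] = rol1($W[$t-3] ^ $W[$t-8] ^ $W[$t-14] ^ $W[$t-16]);
    }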
+$code.=<<___;
+ lea 128(%rsp),$frame
+ jmp .Loop_avx2
+.align 32
+.Loop_avx2:
+ rorx \$2,$F,$B
+ andn $D,$F,$t0
+ and $C,$F
+ xor $t0,$F
+___
+sub bodyx_00_19 () { # 8 instructions, 3 cycles critical path
+ # at start $f=(b&c)^(~b&d), $b>>>=2
+ return &bodyx_20_39() if ($rx==19); $rx++;
+ (
+ '($a,$f,$b,$c,$d,$e)=@ROTX;'.
+
+ '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K
+ '&lea ($frame,"256($frame)") if ($j%32==31);',
+ '&andn ($t0,$a,$c)', # ~b&d for next round
+
+ '&add ($e,$f)', # e+=(b&c)^(~b&d)
+ '&rorx ($a5,$a,27)', # a<<<5
+ '&rorx ($f,$a,2)', # b>>>2 for next round
+ '&and ($a,$b)', # b&c for next round
+
+ '&add ($e,$a5)', # e+=a<<<5
+ '&xor ($a,$t0);'. # f=(b&c)^(~b&d) for next round
+
+ 'unshift(@ROTX,pop(@ROTX)); $j++;'
+ )
+}
+
+sub bodyx_20_39 () { # 7 instructions, 2 cycles critical path
+ # on entry $f=b^c^d, $b>>>=2
+ return &bodyx_40_59() if ($rx==39); $rx++;
+ (
+ '($a,$f,$b,$c,$d,$e)=@ROTX;'.
+
+ '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K
+ '&lea ($frame,"256($frame)") if ($j%32==31);',
+
+ '&lea ($e,"($e,$f)")', # e+=b^c^d
+ '&rorx ($a5,$a,27)', # a<<<5
+ '&rorx ($f,$a,2) if ($j<79)', # b>>>2 in next round
+ '&xor ($a,$b) if ($j<79)', # b^c for next round
+
+ '&add ($e,$a5)', # e+=a<<<5
+ '&xor ($a,$c) if ($j<79);'. # f=b^c^d for next round
+
+ 'unshift(@ROTX,pop(@ROTX)); $j++;'
+ )
+}
+
+sub bodyx_40_59 () { # 10 instructions, 3 cycles critical path
+ # on entry $f=((b^c)&(c^d)), $b>>>=2
+ $rx++;
+ (
+ '($a,$f,$b,$c,$d,$e)=@ROTX;'.
+
+ '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K
+ '&lea ($frame,"256($frame)") if ($j%32==31);',
+ '&xor ($f,$c) if ($j>39)', # (b^c)&(c^d)^c
+ '&mov ($t0,$b) if ($j<59)', # count on zero latency
+ '&xor ($t0,$c) if ($j<59)', # c^d for next round
+
+ '&lea ($e,"($e,$f)")', # e+=(b^c)&(c^d)^c
+ '&rorx ($a5,$a,27)', # a<<<5
+ '&rorx ($f,$a,2)', # b>>>2 in next round
+ '&xor ($a,$b)', # b^c for next round
+
+ '&add ($e,$a5)', # e+=a<<<5
+ '&and ($a,$t0) if ($j< 59);'. # f=(b^c)&(c^d) for next round
+ '&xor ($a,$c) if ($j==59);'. # f=b^c^d for next round
+
+ 'unshift(@ROTX,pop(@ROTX)); $j++;'
+ )
+}
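
A note on the BMI forms used above: ANDN computes ~x & y in a single instruction and RORX rotates without modifying flags, so bodyx_00_19 can form Ch(b,c,d) as (b&c)^(~b&d) - the two terms are never both set, so XOR equals the usual OR - while b>>>2 for the next round is produced independently of the addition chain. Roughly, in scalar Perl terms (names invented for illustration):

    sub andn { my ($x,$y) = @_; ~$x & $y & 0xffffffff }                        # BMI1 ANDN: ~x & y
    sub rorx { my ($x,$n) = @_; (($x >> $n) | ($x << (32-$n))) & 0xffffffff }  # BMI2 RORX: rotate, flags untouched
    my ($b,$c,$d) = (0x12345678, 0x9abcdef0, 0x0fedcba9);
    my $f      = ($b & $c) ^ andn($b,$d);   # == ($b & $c) | (~$b & $d) == Ch(b,c,d)
    my $b_next = rorx($b,2);                # b>>>2 for the following round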
+
+sub Xupdate_avx2_16_31() # recall that $Xi starts with 4
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
+ &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[0],@X[0],31);
+ &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
+ &vpaddd (@X[0],@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[1],@Tx[2],30);
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpslld (@Tx[2],@Tx[2],2);
+ &vpxor (@X[0],@X[0],@Tx[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpaddd (@Tx[1],@X[0],$Kx);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+
+ foreach (@insns) { eval; } # remaining instructions [if any]
+
+ $Xi++;
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xupdate_avx2_32_79()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 to 50 instructions
+ my ($a,$b,$c,$d,$e);
+
+ &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
+ &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
+ &vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpsrld (@Tx[0],@X[0],30);
+ &vpslld (@X[0],@X[0],2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ #&vpslld (@X[0],@X[0],2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vpaddd (@Tx[1],@X[0],$Kx);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ &vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU
+
+ foreach (@insns) { eval; } # remaining instructions
+
+ $Xi++;
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xloop_avx2()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body,&$body); # 32 instructions
+ my ($a,$b,$c,$d,$e);
+
+ foreach (@insns) { eval; }
+}
+
+ &align32();
+ &Xupdate_avx2_32_79(\&bodyx_00_19);
+ &Xupdate_avx2_32_79(\&bodyx_00_19);
+ &Xupdate_avx2_32_79(\&bodyx_00_19);
+ &Xupdate_avx2_32_79(\&bodyx_00_19);
+
+ &Xupdate_avx2_32_79(\&bodyx_20_39);
+ &Xupdate_avx2_32_79(\&bodyx_20_39);
+ &Xupdate_avx2_32_79(\&bodyx_20_39);
+ &Xupdate_avx2_32_79(\&bodyx_20_39);
+
+ &align32();
+ &Xupdate_avx2_32_79(\&bodyx_40_59);
+ &Xupdate_avx2_32_79(\&bodyx_40_59);
+ &Xupdate_avx2_32_79(\&bodyx_40_59);
+ &Xupdate_avx2_32_79(\&bodyx_40_59);
+
+ &Xloop_avx2(\&bodyx_20_39);
+ &Xloop_avx2(\&bodyx_20_39);
+ &Xloop_avx2(\&bodyx_20_39);
+ &Xloop_avx2(\&bodyx_20_39);
+
+$code.=<<___;
+ lea 128($inp),$frame
+ lea 128($inp),%rdi # borrow $t0
+ cmp $num,$frame
+ cmovae $inp,$frame # next or previous block
+
+ # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
+ add 0($ctx),@ROTX[0] # update context
+ add 4($ctx),@ROTX[1]
+ add 8($ctx),@ROTX[3]
+ mov @ROTX[0],0($ctx)
+ add 12($ctx),@ROTX[4]
+ mov @ROTX[1],4($ctx)
+ mov @ROTX[0],$A # A=d
+ add 16($ctx),@ROTX[5]
+ mov @ROTX[3],$a5
+ mov @ROTX[3],8($ctx)
+ mov @ROTX[4],$D # D=b
+ #xchg @ROTX[5],$F # F=c, C=f
+ mov @ROTX[4],12($ctx)
+ mov @ROTX[1],$F # F=e
+ mov @ROTX[5],16($ctx)
+ #mov $F,16($ctx)
+ mov @ROTX[5],$E # E=c
+ mov $a5,$C # C=f
+ #xchg $F,$E # E=c, F=e
+
+ cmp $num,$inp
+ je .Ldone_avx2
+___
+
+$Xi=4; # reset variables
+@X=map("%ymm$_",(4..7,0..3));
+
+$code.=<<___;
+ vmovdqu 64($K_XX_XX),@X[2] # pbswap mask
+ cmp $num,%rdi # borrowed $t0
+ ja .Last_avx2
+
+ vmovdqu -64(%rdi),%xmm0 # low part of @X[-4&7]
+ vmovdqu -48(%rdi),%xmm1
+ vmovdqu -32(%rdi),%xmm2
+ vmovdqu -16(%rdi),%xmm3
+ vinserti128 \$1,0($frame),@X[-4&7],@X[-4&7]
+ vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7]
+ vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7]
+ vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7]
+ jmp .Last_avx2
+
+.align 32
+.Last_avx2:
+ lea 128+16(%rsp),$frame
+ rorx \$2,$F,$B
+ andn $D,$F,$t0
+ and $C,$F
+ xor $t0,$F
+ sub \$-128,$inp
+___
+ $rx=$j=0; @ROTX=($A,$F,$B,$C,$D,$E);
+
+ &Xloop_avx2 (\&bodyx_00_19);
+ &Xloop_avx2 (\&bodyx_00_19);
+ &Xloop_avx2 (\&bodyx_00_19);
+ &Xloop_avx2 (\&bodyx_00_19);
+
+ &Xloop_avx2 (\&bodyx_20_39);
+ &vmovdqu ($Kx,"-64($K_XX_XX)"); # K_00_19
+ &vpshufb (@X[-4&7],@X[-4&7],@X[2]); # byte swap
+ &Xloop_avx2 (\&bodyx_20_39);
+ &vpshufb (@X[-3&7],@X[-3&7],@X[2]);
+ &vpaddd (@Tx[0],@X[-4&7],$Kx); # add K_00_19
+ &Xloop_avx2 (\&bodyx_20_39);
+ &vmovdqu ("0(%rsp)",@Tx[0]);
+ &vpshufb (@X[-2&7],@X[-2&7],@X[2]);
+ &vpaddd (@Tx[1],@X[-3&7],$Kx);
+ &Xloop_avx2 (\&bodyx_20_39);
+ &vmovdqu ("32(%rsp)",@Tx[1]);
+ &vpshufb (@X[-1&7],@X[-1&7],@X[2]);
+ &vpaddd (@X[2],@X[-2&7],$Kx);
+
+ &Xloop_avx2 (\&bodyx_40_59);
+ &align32 ();
+ &vmovdqu ("64(%rsp)",@X[2]);
+ &vpaddd (@X[3],@X[-1&7],$Kx);
+ &Xloop_avx2 (\&bodyx_40_59);
+ &vmovdqu ("96(%rsp)",@X[3]);
+ &Xloop_avx2 (\&bodyx_40_59);
+ &Xupdate_avx2_16_31(\&bodyx_40_59);
+
+ &Xupdate_avx2_16_31(\&bodyx_20_39);
+ &Xupdate_avx2_16_31(\&bodyx_20_39);
+ &Xupdate_avx2_16_31(\&bodyx_20_39);
+ &Xloop_avx2 (\&bodyx_20_39);
+
+$code.=<<___;
+ lea 128(%rsp),$frame
+
+ # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
+ add 0($ctx),@ROTX[0] # update context
+ add 4($ctx),@ROTX[1]
+ add 8($ctx),@ROTX[3]
+ mov @ROTX[0],0($ctx)
+ add 12($ctx),@ROTX[4]
+ mov @ROTX[1],4($ctx)
+ mov @ROTX[0],$A # A=d
+ add 16($ctx),@ROTX[5]
+ mov @ROTX[3],$a5
+ mov @ROTX[3],8($ctx)
+ mov @ROTX[4],$D # D=b
+ #xchg @ROTX[5],$F # F=c, C=f
+ mov @ROTX[4],12($ctx)
+ mov @ROTX[1],$F # F=e
+ mov @ROTX[5],16($ctx)
+ #mov $F,16($ctx)
+ mov @ROTX[5],$E # E=c
+ mov $a5,$C # C=f
+ #xchg $F,$E # E=c, F=e
+
+ cmp $num,$inp
+ jbe .Loop_avx2
+
+.Ldone_avx2:
+ vzeroupper
+___
+$code.=<<___ if ($win64);
+ movaps -40-6*16(%r14),%xmm6
+ movaps -40-5*16(%r14),%xmm7
+ movaps -40-4*16(%r14),%xmm8
+ movaps -40-3*16(%r14),%xmm9
+ movaps -40-2*16(%r14),%xmm10
+ movaps -40-1*16(%r14),%xmm11
+___
+$code.=<<___;
+ lea (%r14),%rsi
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+.Lepilogue_avx2:
+ ret
+.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
+___
+}
}
$code.=<<___;
.align 64
K_XX_XX:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
+.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
___
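
Each K row in the table above now appears twice, so a 32-byte load fills a %ymm register with eight identical lanes; the code also biases its pointer with lea K_XX_XX+64(%rip),$K_XX_XX, which is why the constant loads use 2*16*($Xi/5)-64 offsets and the pbswap mask sits at 64($K_XX_XX), while the trailing .byte row at K_XX_XX+0xa0 is the word-order shuffle for the SHAEXT path. The offset arithmetic, spelled out (illustration only; the generator itself relies on "use integer" rather than int()):

    for my $Xi (4, 5, 10, 15) {
        printf "Xi=%2d -> %4d(\$K_XX_XX)  (K_%02d_%02d)\n",
               $Xi, 2*16*int($Xi/5)-64, 20*int($Xi/5), 20*int($Xi/5)+19;
    }
    # Xi=4 -> -64 (K_00_19), Xi=5..9 -> -32 (K_20_39),
    # Xi=10..14 -> 0 (K_40_59), Xi=15..19 -> 32 (K_60_79)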
}}}
$code.=<<___;
@@ -1122,20 +1821,58 @@ se_handler:
jae .Lcommon_seh_tail
mov `16*4`(%rax),%rax # pull saved stack pointer
- lea 32(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
+ mov -40(%rax),%r14
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
jmp .Lcommon_seh_tail
.size se_handler,.-se_handler
+___
+
+$code.=<<___ if ($shaext);
+.type shaext_handler,\@abi-omnipotent
+.align 16
+shaext_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lprologue_shaext(%rip),%r10
+ cmp %r10,%rbx # context->Rip<.Lprologue
+ jb .Lcommon_seh_tail
+
+ lea .Lepilogue_shaext(%rip),%r10
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
+ jae .Lcommon_seh_tail
+
+ lea -8-4*16(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$8,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+ jmp .Lcommon_seh_tail
+.size shaext_handler,.-shaext_handler
+___
+
+$code.=<<___;
.type ssse3_handler,\@abi-omnipotent
.align 16
ssse3_handler:
@@ -1168,18 +1905,23 @@ ssse3_handler:
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
- lea 64(%rax),%rsi
+ mov 232($context),%rax # pull context->R14
+
+ lea -40-6*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
- mov \$10,%ecx
+ mov \$12,%ecx
.long 0xa548f3fc # cld; rep movsq
- lea `24+64+5*16`(%rax),%rax # adjust stack pointer
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
.Lcommon_seh_tail:
mov 8(%rax),%rdi
@@ -1226,6 +1968,13 @@ ssse3_handler:
.rva .LSEH_begin_sha1_block_data_order
.rva .LSEH_end_sha1_block_data_order
.rva .LSEH_info_sha1_block_data_order
+___
+$code.=<<___ if ($shaext);
+ .rva .LSEH_begin_sha1_block_data_order_shaext
+ .rva .LSEH_end_sha1_block_data_order_shaext
+ .rva .LSEH_info_sha1_block_data_order_shaext
+___
+$code.=<<___;
.rva .LSEH_begin_sha1_block_data_order_ssse3
.rva .LSEH_end_sha1_block_data_order_ssse3
.rva .LSEH_info_sha1_block_data_order_ssse3
@@ -1235,12 +1984,24 @@ $code.=<<___ if ($avx);
.rva .LSEH_end_sha1_block_data_order_avx
.rva .LSEH_info_sha1_block_data_order_avx
___
+$code.=<<___ if ($avx>1);
+ .rva .LSEH_begin_sha1_block_data_order_avx2
+ .rva .LSEH_end_sha1_block_data_order_avx2
+ .rva .LSEH_info_sha1_block_data_order_avx2
+___
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_sha1_block_data_order:
.byte 9,0,0,0
.rva se_handler
+___
+$code.=<<___ if ($shaext);
+.LSEH_info_sha1_block_data_order_shaext:
+ .byte 9,0,0,0
+ .rva shaext_handler
+___
+$code.=<<___;
.LSEH_info_sha1_block_data_order_ssse3:
.byte 9,0,0,0
.rva ssse3_handler
@@ -1252,10 +2013,55 @@ $code.=<<___ if ($avx);
.rva ssse3_handler
.rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
___
+$code.=<<___ if ($avx>1);
+.LSEH_info_sha1_block_data_order_avx2:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
+___
}
####################################################################
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-print $code;
+sub sha1rnds4 {
+ if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
+ my @opcode=(0x0f,0x3a,0xcc);
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ my $c=$1;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ return ".byte\t".join(',',@opcode);
+ } else {
+ return "sha1rnds4\t".@_[0];
+ }
+}
+
+sub sha1op38 {
+ my $instr = shift;
+ my %opcodelet = (
+ "sha1nexte" => 0xc8,
+ "sha1msg1" => 0xc9,
+ "sha1msg2" => 0xca );
+
+ if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my @opcode=(0x0f,0x38);
+ my $rex=0;
+ $rex|=0x04 if ($2>=8);
+ $rex|=0x01 if ($1>=8);
+ unshift @opcode,0x40|$rex if ($rex);
+ push @opcode,$opcodelet{$instr};
+ push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ } else {
+ return $instr."\t".@_[0];
+ }
+}
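
These two helpers keep the generated file assembling on toolchains that predate the SHA extensions: when an instruction matches the expected register-only form it is replaced by a hand-assembled .byte sequence (printed in decimal, since the numeric opcode bytes are simply join()ed). Worked examples of the rewriting done by the output loop below (byte values worked out by hand here, not copied from a build):

    # "sha1rnds4 $3,%xmm2,%xmm0"  ->  .byte 15,58,204,194,3
    #   0x0f,0x3a,0xcc, ModR/M = 0xc0|2|(0<<3) = 0xc2, immediate 3
    print sha1rnds4('$3,%xmm2,%xmm0'), "\n";
    # "sha1nexte %xmm1,%xmm0"     ->  .byte 15,56,200,193
    print sha1op38('sha1nexte','%xmm1,%xmm0'), "\n";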
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or
+ s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;
+
+ print $_,"\n";
+}
close STDOUT;