aboutsummaryrefslogtreecommitdiff
path: root/crypto/aes/asm/aes-586.pl
diff options
context:
space:
mode:
Diffstat (limited to 'crypto/aes/asm/aes-586.pl')
-rwxr-xr-xcrypto/aes/asm/aes-586.pl283
1 files changed, 145 insertions, 138 deletions
diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl
index 687ed811be47..451d0e0ed1e2 100755
--- a/crypto/aes/asm/aes-586.pl
+++ b/crypto/aes/asm/aes-586.pl
@@ -39,7 +39,7 @@
# but exhibits up to 10% improvement on other cores.
#
# Second version is "monolithic" replacement for aes_core.c, which in
-# addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key.
+# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
# This made it possible to implement little-endian variant of the
# algorithm without modifying the base C code. Motivating factor for
# the undertaken effort was that it appeared that in tight IA-32
@@ -103,11 +103,12 @@
# byte for 128-bit key.
#
# ECB encrypt ECB decrypt CBC large chunk
-# P4 56[60] 84[100] 23
-# AMD K8 48[44] 70[79] 18
-# PIII 41[50] 61[91] 24
-# Core 2 32[38] 45[70] 18.5
-# Pentium 120 160 77
+# P4 52[54] 83[95] 23
+# AMD K8 46[41] 66[70] 18
+# PIII 41[50] 60[77] 24
+# Core 2 31[36] 45[64] 18.5
+# Atom 76[100] 96[138] 60
+# Pentium 115 150 77
#
# Version 4.1 switches to compact S-box even in key schedule setup.
#
@@ -242,7 +243,7 @@ $vertical_spin=0; # shift "verticaly" defaults to 0, because of
sub encvert()
{ my ($te,@s) = @_;
- my $v0 = $acc, $v1 = $key;
+ my ($v0,$v1) = ($acc,$key);
&mov ($v0,$s[3]); # copy s3
&mov (&DWP(4,"esp"),$s[2]); # save s2
@@ -299,7 +300,7 @@ sub encvert()
# Another experimental routine, which features "horizontal spin," but
# eliminates one reference to stack. Strangely enough runs slower...
sub enchoriz()
-{ my $v0 = $key, $v1 = $acc;
+{ my ($v0,$v1) = ($key,$acc);
&movz ($v0,&LB($s0)); # 3, 2, 1, 0*
&rotr ($s2,8); # 8,11,10, 9
@@ -427,7 +428,7 @@ sub sse_encbody()
######################################################################
sub enccompact()
-{ my $Fn = mov;
+{ my $Fn = \&mov;
while ($#_>5) { pop(@_); $Fn=sub{}; }
my ($i,$te,@s)=@_;
my $tmp = $key;
@@ -476,24 +477,25 @@ sub enctransform()
my $tmp = $tbl;
my $r2 = $key ;
- &mov ($acc,$s[$i]);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
- &shr ($tmp,7);
+ &and ($tmp,$s[$i]);
&lea ($r2,&DWP(0,$s[$i],$s[$i]));
- &sub ($acc,$tmp);
+ &mov ($acc,$tmp);
+ &shr ($tmp,7);
&and ($r2,0xfefefefe);
- &and ($acc,0x1b1b1b1b);
+ &sub ($acc,$tmp);
&mov ($tmp,$s[$i]);
+ &and ($acc,0x1b1b1b1b);
+ &rotr ($tmp,16);
&xor ($acc,$r2); # r2
+ &mov ($r2,$s[$i]);
&xor ($s[$i],$acc); # r0 ^ r2
+ &rotr ($r2,16+8);
+ &xor ($acc,$tmp);
&rotl ($s[$i],24);
- &xor ($s[$i],$acc) # ROTATE(r2^r0,24) ^ r2
- &rotr ($tmp,16);
- &xor ($s[$i],$tmp);
- &rotr ($tmp,8);
- &xor ($s[$i],$tmp);
+ &xor ($acc,$r2);
+ &mov ($tmp,0x80808080) if ($i!=1);
+ &xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2
}
&function_begin_B("_x86_AES_encrypt_compact");
@@ -526,6 +528,7 @@ sub enctransform()
&enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
&enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
&enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
+ &mov ($tbl,0x80808080);
&enctransform(2);
&enctransform(3);
&enctransform(0);
@@ -607,82 +610,84 @@ sub sse_enccompact()
&pshufw ("mm5","mm4",0x0d); # 15,14,11,10
&movd ("eax","mm1"); # 5, 4, 1, 0
&movd ("ebx","mm5"); # 15,14,11,10
+ &mov ($__key,$key);
&movz ($acc,&LB("eax")); # 0
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
- &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
&movz ("edx",&HB("eax")); # 1
+ &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
+ &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
+ &movz ($key,&LB("ebx")); # 10
&movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
- &shl ("edx",8); # 1
&shr ("eax",16); # 5, 4
+ &shl ("edx",8); # 1
- &movz ($acc,&LB("ebx")); # 10
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 10
+ &movz ($key,&HB("ebx")); # 11
&shl ($acc,16); # 10
- &or ("ecx",$acc); # 10
&pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
- &movz ($acc,&HB("ebx")); # 11
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
+ &or ("ecx",$acc); # 10
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 11
+ &movz ($key,&HB("eax")); # 5
&shl ($acc,24); # 11
- &or ("edx",$acc); # 11
&shr ("ebx",16); # 15,14
+ &or ("edx",$acc); # 11
- &movz ($acc,&HB("eax")); # 5
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 5
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 5
+ &movz ($key,&HB("ebx")); # 15
&shl ($acc,8); # 5
&or ("ecx",$acc); # 5
- &movz ($acc,&HB("ebx")); # 15
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 15
+ &movz ($key,&LB("eax")); # 4
&shl ($acc,24); # 15
&or ("ecx",$acc); # 15
- &movd ("mm0","ecx"); # t[0] collected
- &movz ($acc,&LB("eax")); # 4
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 4
+ &movz ($key,&LB("ebx")); # 14
&movd ("eax","mm2"); # 7, 6, 3, 2
- &movz ($acc,&LB("ebx")); # 14
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
- &shl ($acc,16); # 14
+ &movd ("mm0","ecx"); # t[0] collected
+ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 14
+ &movz ($key,&HB("eax")); # 3
+ &shl ("ecx",16); # 14
+ &movd ("ebx","mm6"); # 13,12, 9, 8
&or ("ecx",$acc); # 14
- &movd ("ebx","mm6"); # 13,12, 9, 8
- &movz ($acc,&HB("eax")); # 3
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 3
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 3
+ &movz ($key,&HB("ebx")); # 9
&shl ($acc,24); # 3
&or ("ecx",$acc); # 3
- &movz ($acc,&HB("ebx")); # 9
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 9
+ &movz ($key,&LB("ebx")); # 8
&shl ($acc,8); # 9
+ &shr ("ebx",16); # 13,12
&or ("ecx",$acc); # 9
- &movd ("mm1","ecx"); # t[1] collected
- &movz ($acc,&LB("ebx")); # 8
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8
- &shr ("ebx",16); # 13,12
- &movz ($acc,&LB("eax")); # 2
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
- &shl ($acc,16); # 2
- &or ("ecx",$acc); # 2
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 8
+ &movz ($key,&LB("eax")); # 2
&shr ("eax",16); # 7, 6
+ &movd ("mm1","ecx"); # t[1] collected
+ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 2
+ &movz ($key,&HB("eax")); # 7
+ &shl ("ecx",16); # 2
+ &and ("eax",0xff); # 6
+ &or ("ecx",$acc); # 2
&punpckldq ("mm0","mm1"); # t[0,1] collected
- &movz ($acc,&HB("eax")); # 7
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 7
+ &movz ($key,&HB("ebx")); # 13
&shl ($acc,24); # 7
- &or ("ecx",$acc); # 7
- &and ("eax",0xff); # 6
+ &and ("ebx",0xff); # 12
&movz ("eax",&BP(-128,$tbl,"eax",1)); # 6
+ &or ("ecx",$acc); # 7
&shl ("eax",16); # 6
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 13
&or ("edx","eax"); # 6
- &movz ($acc,&HB("ebx")); # 13
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
&shl ($acc,8); # 13
- &or ("ecx",$acc); # 13
- &movd ("mm4","ecx"); # t[2] collected
- &and ("ebx",0xff); # 12
&movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
+ &or ("ecx",$acc); # 13
&or ("edx","ebx"); # 12
+ &mov ($key,$__key);
+ &movd ("mm4","ecx"); # t[2] collected
&movd ("mm5","edx"); # t[3] collected
&punpckldq ("mm4","mm5"); # t[2,3] collected
@@ -1222,7 +1227,7 @@ sub enclast()
######################################################################
sub deccompact()
-{ my $Fn = mov;
+{ my $Fn = \&mov;
while ($#_>5) { pop(@_); $Fn=sub{}; }
my ($i,$td,@s)=@_;
my $tmp = $key;
@@ -1270,30 +1275,30 @@ sub dectransform()
my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1);
my $tp8 = $tbl;
- &mov ($acc,$s[$i]);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
+ &mov ($tmp,0x80808080);
+ &and ($tmp,$s[$i]);
+ &mov ($acc,$tmp);
&shr ($tmp,7);
&lea ($tp2,&DWP(0,$s[$i],$s[$i]));
&sub ($acc,$tmp);
&and ($tp2,0xfefefefe);
&and ($acc,0x1b1b1b1b);
- &xor ($acc,$tp2);
- &mov ($tp2,$acc);
+ &xor ($tp2,$acc);
+ &mov ($tmp,0x80808080);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
+ &and ($tmp,$tp2);
+ &mov ($acc,$tmp);
&shr ($tmp,7);
&lea ($tp4,&DWP(0,$tp2,$tp2));
&sub ($acc,$tmp);
&and ($tp4,0xfefefefe);
&and ($acc,0x1b1b1b1b);
&xor ($tp2,$s[$i]); # tp2^tp1
- &xor ($acc,$tp4);
- &mov ($tp4,$acc);
+ &xor ($tp4,$acc);
+ &mov ($tmp,0x80808080);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
+ &and ($tmp,$tp4);
+ &mov ($acc,$tmp);
&shr ($tmp,7);
&lea ($tp8,&DWP(0,$tp4,$tp4));
&sub ($acc,$tmp);
@@ -1305,13 +1310,13 @@ sub dectransform()
&xor ($s[$i],$tp2);
&xor ($tp2,$tp8);
- &rotl ($tp2,24);
&xor ($s[$i],$tp4);
&xor ($tp4,$tp8);
- &rotl ($tp4,16);
+ &rotl ($tp2,24);
&xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
- &rotl ($tp8,8);
+ &rotl ($tp4,16);
&xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
+ &rotl ($tp8,8);
&xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
&mov ($s[0],$__s0) if($i==2); #prefetch $s0
&mov ($s[1],$__s1) if($i==3); #prefetch $s1
@@ -1389,85 +1394,87 @@ sub dectransform()
sub sse_deccompact()
{
&pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0
+ &pshufw ("mm5","mm4",0x09); # 13,12,11,10
&movd ("eax","mm1"); # 7, 6, 1, 0
+ &movd ("ebx","mm5"); # 13,12,11,10
+ &mov ($__key,$key);
- &pshufw ("mm5","mm4",0x09); # 13,12,11,10
&movz ($acc,&LB("eax")); # 0
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
- &movd ("ebx","mm5"); # 13,12,11,10
&movz ("edx",&HB("eax")); # 1
+ &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
+ &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
+ &movz ($key,&LB("ebx")); # 10
&movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
+ &shr ("eax",16); # 7, 6
&shl ("edx",8); # 1
- &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
- &movz ($acc,&LB("ebx")); # 10
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 10
+ &movz ($key,&HB("ebx")); # 11
&shl ($acc,16); # 10
+ &pshufw ("mm6","mm4",0x03); # 9, 8,15,14
&or ("ecx",$acc); # 10
- &shr ("eax",16); # 7, 6
- &movz ($acc,&HB("ebx")); # 11
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 11
+ &movz ($key,&HB("eax")); # 7
&shl ($acc,24); # 11
- &or ("edx",$acc); # 11
&shr ("ebx",16); # 13,12
+ &or ("edx",$acc); # 11
- &pshufw ("mm6","mm4",0x03); # 9, 8,15,14
- &movz ($acc,&HB("eax")); # 7
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 7
+ &movz ($key,&HB("ebx")); # 13
&shl ($acc,24); # 7
&or ("ecx",$acc); # 7
- &movz ($acc,&HB("ebx")); # 13
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 13
+ &movz ($key,&LB("eax")); # 6
&shl ($acc,8); # 13
+ &movd ("eax","mm2"); # 3, 2, 5, 4
&or ("ecx",$acc); # 13
- &movd ("mm0","ecx"); # t[0] collected
- &movz ($acc,&LB("eax")); # 6
- &movd ("eax","mm2"); # 3, 2, 5, 4
- &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6
- &shl ("ecx",16); # 6
- &movz ($acc,&LB("ebx")); # 12
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 6
+ &movz ($key,&LB("ebx")); # 12
+ &shl ($acc,16); # 6
&movd ("ebx","mm6"); # 9, 8,15,14
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 12
+ &movd ("mm0","ecx"); # t[0] collected
+ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 12
+ &movz ($key,&LB("eax")); # 4
&or ("ecx",$acc); # 12
- &movz ($acc,&LB("eax")); # 4
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 4
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 4
+ &movz ($key,&LB("ebx")); # 14
&or ("edx",$acc); # 4
- &movz ($acc,&LB("ebx")); # 14
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 14
+ &movz ($key,&HB("eax")); # 5
&shl ($acc,16); # 14
+ &shr ("eax",16); # 3, 2
&or ("edx",$acc); # 14
- &movd ("mm1","edx"); # t[1] collected
- &movz ($acc,&HB("eax")); # 5
- &movz ("edx",&BP(-128,$tbl,$acc,1)); # 5
- &shl ("edx",8); # 5
- &movz ($acc,&HB("ebx")); # 15
- &shr ("eax",16); # 3, 2
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
- &shl ($acc,24); # 15
- &or ("edx",$acc); # 15
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 5
+ &movz ($key,&HB("ebx")); # 15
&shr ("ebx",16); # 9, 8
+ &shl ($acc,8); # 5
+ &movd ("mm1","edx"); # t[1] collected
+ &movz ("edx",&BP(-128,$tbl,$key,1)); # 15
+ &movz ($key,&HB("ebx")); # 9
+ &shl ("edx",24); # 15
+ &and ("ebx",0xff); # 8
+ &or ("edx",$acc); # 15
&punpckldq ("mm0","mm1"); # t[0,1] collected
- &movz ($acc,&HB("ebx")); # 9
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 9
+ &movz ($key,&LB("eax")); # 2
&shl ($acc,8); # 9
- &or ("ecx",$acc); # 9
- &and ("ebx",0xff); # 8
+ &movz ("eax",&HB("eax")); # 3
&movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
+ &or ("ecx",$acc); # 9
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 2
&or ("edx","ebx"); # 8
- &movz ($acc,&LB("eax")); # 2
- &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
&shl ($acc,16); # 2
- &or ("edx",$acc); # 2
- &movd ("mm4","edx"); # t[2] collected
- &movz ("eax",&HB("eax")); # 3
&movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
+ &or ("edx",$acc); # 2
&shl ("eax",24); # 3
&or ("ecx","eax"); # 3
+ &mov ($key,$__key);
+ &movd ("mm4","edx"); # t[2] collected
&movd ("mm5","ecx"); # t[3] collected
&punpckldq ("mm4","mm5"); # t[2,3] collected
@@ -2181,8 +2188,8 @@ my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
&mov ("ecx",240/4);
&xor ("eax","eax");
&align (4);
- &data_word(0xABF3F689); # rep stosd
- &set_label("skip_ezero")
+ &data_word(0xABF3F689); # rep stosd
+ &set_label("skip_ezero");
&mov ("esp",$_esp);
&popf ();
&set_label("drop_out");
@@ -2301,8 +2308,8 @@ my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
&mov ("ecx",240/4);
&xor ("eax","eax");
&align (4);
- &data_word(0xABF3F689); # rep stosd
- &set_label("skip_dzero")
+ &data_word(0xABF3F689); # rep stosd
+ &set_label("skip_dzero");
&mov ("esp",$_esp);
&popf ();
&function_end_A();
@@ -2865,32 +2872,32 @@ sub deckey()
{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
my $tmp = $tbl;
- &mov ($acc,$tp1);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
- &shr ($tmp,7);
+ &mov ($tmp,0x80808080);
+ &and ($tmp,$tp1);
&lea ($tp2,&DWP(0,$tp1,$tp1));
+ &mov ($acc,$tmp);
+ &shr ($tmp,7);
&sub ($acc,$tmp);
&and ($tp2,0xfefefefe);
&and ($acc,0x1b1b1b1b);
- &xor ($acc,$tp2);
- &mov ($tp2,$acc);
+ &xor ($tp2,$acc);
+ &mov ($tmp,0x80808080);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
- &shr ($tmp,7);
+ &and ($tmp,$tp2);
&lea ($tp4,&DWP(0,$tp2,$tp2));
+ &mov ($acc,$tmp);
+ &shr ($tmp,7);
&sub ($acc,$tmp);
&and ($tp4,0xfefefefe);
&and ($acc,0x1b1b1b1b);
&xor ($tp2,$tp1); # tp2^tp1
- &xor ($acc,$tp4);
- &mov ($tp4,$acc);
+ &xor ($tp4,$acc);
+ &mov ($tmp,0x80808080);
- &and ($acc,0x80808080);
- &mov ($tmp,$acc);
- &shr ($tmp,7);
+ &and ($tmp,$tp4);
&lea ($tp8,&DWP(0,$tp4,$tp4));
+ &mov ($acc,$tmp);
+ &shr ($tmp,7);
&xor ($tp4,$tp1); # tp4^tp1
&sub ($acc,$tmp);
&and ($tp8,0xfefefefe);