diff options
Diffstat (limited to 'crypto/modes/asm/ghash-x86.pl')
-rwxr-xr-x | crypto/modes/asm/ghash-x86.pl | 35 |
1 file changed, 23 insertions, 12 deletions
diff --git a/crypto/modes/asm/ghash-x86.pl b/crypto/modes/asm/ghash-x86.pl index 0269169fa743..bcbe6e399d13 100755 --- a/crypto/modes/asm/ghash-x86.pl +++ b/crypto/modes/asm/ghash-x86.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -88,7 +95,7 @@ # where Tproc is time required for Karatsuba pre- and post-processing, # is more realistic estimate. In this case it gives ... 1.91 cycles. # Or in other words, depending on how well we can interleave reduction -# and one of the two multiplications the performance should be betwen +# and one of the two multiplications the performance should be between # 1.91 and 2.16. As already mentioned, this implementation processes # one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart # - in 2.02. x86_64 performance is better, because larger register @@ -96,14 +103,13 @@ # # Does it make sense to increase Naggr? To start with it's virtually # impossible in 32-bit mode, because of limited register bank -# capacity. Otherwise improvement has to be weighed agiainst slower +# capacity. Otherwise improvement has to be weighed against slower # setup, as well as code size and complexity increase. As even # optimistic estimate doesn't promise 30% performance improvement, # there are currently no plans to increase Naggr. # -# Special thanks to David Woodhouse <dwmw2@infradead.org> for -# providing access to a Westmere-based system on behalf of Intel -# Open Source Technology Centre. 
+# Special thanks to David Woodhouse for providing access to a +# Westmere-based system on behalf of Intel Open Source Technology Centre. # January 2010 # @@ -129,7 +135,10 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; -&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386"); $sse2=0; for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } @@ -712,7 +721,7 @@ sub mmx_loop() { &pxor ($red[1],$red[1]); &pxor ($red[2],$red[2]); - # Just like in "May" verson modulo-schedule for critical path in + # Just like in "May" version modulo-schedule for critical path in # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor' # is scheduled so late that rem_8bit[] has to be shifted *right* # by 16, which is why last argument to pinsrw is 2, which @@ -801,7 +810,7 @@ sub mmx_loop() { &bswap ($dat); &pshufw ($Zhi,$Zhi,0b00011011); # 76543210 &bswap ("ebx"); - + &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done? &jne (&label("outer")); } @@ -905,7 +914,7 @@ my ($Xhi,$Xi) = @_; &psllq ($Xi,57); # &movdqa ($T1,$Xi); # &pslldq ($Xi,8); - &psrldq ($T1,8); # + &psrldq ($T1,8); # &pxor ($Xi,$T2); &pxor ($Xhi,$T1); # @@ -1075,7 +1084,7 @@ my ($Xhi,$Xi) = @_; &psllq ($Xi,57); # &movdqa ($T1,$Xi); # &pslldq ($Xi,8); - &psrldq ($T1,8); # + &psrldq ($T1,8); # &pxor ($Xi,$T2); &pxor ($Xhi,$T1); # &pshufd ($T1,$Xhn,0b01001110); @@ -1138,7 +1147,7 @@ my ($Xhi,$Xi) = @_; &movdqu (&QWP(0,$Xip),$Xi); &function_end("gcm_ghash_clmul"); -} else { # Algorith 5. Kept for reference purposes. +} else { # Algorithm 5. Kept for reference purposes. sub reduction_alg5 { # 19/16 times faster than Intel version my ($Xhi,$Xi)=@_; @@ -1369,6 +1378,8 @@ my ($Xhi,$Xi)=@_; &asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>"); &asm_finish(); +close STDOUT; + # A question was risen about choice of vanilla MMX. 
Or rather why wasn't # SSE2 chosen instead? In addition to the fact that MMX runs on legacy # CPUs such as PIII, "4-bit" MMX version was observed to provide better |