diff options
Diffstat (limited to 'crypto/modes/asm/ghash-x86_64.pl')
-rwxr-xr-x | crypto/modes/asm/ghash-x86_64.pl | 97 |
1 files changed, 74 insertions, 23 deletions
diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl index f889f2018789..afc30c3e72a4 100755 --- a/crypto/modes/asm/ghash-x86_64.pl +++ b/crypto/modes/asm/ghash-x86_64.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL @@ -37,9 +44,8 @@ # See ghash-x86.pl for background information and details about coding # techniques. # -# Special thanks to David Woodhouse <dwmw2@infradead.org> for -# providing access to a Westmere-based system on behalf of Intel -# Open Source Technology Centre. +# Special thanks to David Woodhouse for providing access to a +# Westmere-based system on behalf of Intel Open Source Technology Centre. # December 2012 # @@ -64,8 +70,11 @@ # Ivy Bridge 1.80(+7%) # Haswell 0.55(+93%) (if system doesn't support AVX) # Broadwell 0.45(+110%)(if system doesn't support AVX) +# Skylake 0.44(+110%)(if system doesn't support AVX) # Bulldozer 1.49(+27%) # Silvermont 2.88(+13%) +# Knights L 2.12(-) (if system doesn't support AVX) +# Goldmont 1.08(+24%) # March 2013 # @@ -74,8 +83,10 @@ # CPUs such as Sandy and Ivy Bridge can execute it, the code performs # sub-optimally in comparison to above mentioned version. But thanks # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that -# it performs in 0.41 cycles per byte on Haswell processor, and in -# 0.29 on Broadwell. +# it performs in 0.41 cycles per byte on Haswell processor, in +# 0.29 on Broadwell, and in 0.36 on Skylake. +# +# Knights Landing achieves 1.09 cpb. # # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest @@ -109,7 +120,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([ $avx = ($2>=3.0) + ($2>3.0); } -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $do4xaggr=1; @@ -227,9 +238,21 @@ $code=<<___; .type gcm_gmult_4bit,\@function,2 .align 16 gcm_gmult_4bit: +.cfi_startproc push %rbx - push %rbp # %rbp and %r12 are pushed exclusively in +.cfi_push %rbx + push %rbp # %rbp and others are pushed exclusively in +.cfi_push %rbp push %r12 # order to reuse Win64 exception handler... +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$280,%rsp +.cfi_adjust_cfa_offset 280 .Lgmult_prologue: movzb 15($Xi),$Zlo @@ -240,10 +263,15 @@ $code.=<<___; mov $Zlo,8($Xi) mov $Zhi,($Xi) - mov 16(%rsp),%rbx - lea 24(%rsp),%rsp + lea 280+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lgmult_epilogue: ret +.cfi_endproc .size gcm_gmult_4bit,.-gcm_gmult_4bit ___ @@ -257,13 +285,21 @@ $code.=<<___; .type gcm_ghash_4bit,\@function,4 .align 16 gcm_ghash_4bit: +.cfi_startproc push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 +.cfi_push %r15 sub \$280,%rsp +.cfi_adjust_cfa_offset 280 .Lghash_prologue: mov $inp,%r14 # reassign couple of args mov $len,%r15 @@ -391,16 +427,25 @@ $code.=<<___; mov $Zlo,8($Xi) mov $Zhi,($Xi) - lea 280(%rsp),%rsi - mov 0(%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + lea 280+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea 0(%rsi),%rsp +.cfi_def_cfa_register %rsp .Lghash_epilogue: ret +.cfi_endproc .size gcm_ghash_4bit,.-gcm_ghash_4bit ___ @@ -460,7 +505,7 @@ $code.=<<___; psllq \$57,$Xi # movdqa $Xi,$T1 # pslldq \$8,$Xi - psrldq \$8,$T1 # + psrldq \$8,$T1 # pxor $T2,$Xi pxor $T1,$Xhi # @@ -574,7 +619,7 @@ ___ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2); $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0)); # experimental alternative. special thing about is that there - # no dependency between the two multiplications... + # no dependency between the two multiplications... mov \$`0xE1<<1`,%eax mov \$0xA040608020C0E000,%r10 # ((7..0)ยท0xE0)&0xff mov \$0x07,%r11d @@ -749,7 +794,7 @@ $code.=<<___; movdqa $T2,$T1 # pslldq \$8,$T2 pclmulqdq \$0x00,$Hkey2,$Xln - psrldq \$8,$T1 # + psrldq \$8,$T1 # pxor $T2,$Xi pxor $T1,$Xhi # movdqu 0($inp),$T1 @@ -885,7 +930,7 @@ $code.=<<___; psllq \$57,$Xi # movdqa $Xi,$T1 # pslldq \$8,$Xi - psrldq \$8,$T1 # + psrldq \$8,$T1 # pxor $T2,$Xi pshufd \$0b01001110,$Xhn,$Xmn pxor $T1,$Xhi # @@ -1639,14 +1684,20 @@ se_handler: cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue - lea 24(%rax),%rax # adjust "rsp" + lea 48+280(%rax),%rax # adjust "rsp" mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 .Lin_prologue: mov 8(%rax),%rdi |