diff options
Diffstat (limited to 'crypto/rc4/asm/rc4-x86_64.pl')
-rwxr-xr-x | crypto/rc4/asm/rc4-x86_64.pl | 47 |
1 files changed, 33 insertions, 14 deletions
diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl index 20722d3e7246..1a9cc47d7253 100755 --- a/crypto/rc4/asm/rc4-x86_64.pl +++ b/crypto/rc4/asm/rc4-x86_64.pl @@ -1,7 +1,14 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # # ==================================================================== -# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -41,7 +48,7 @@ # April 2005 # -# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing +# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing # those with add/sub results in 50% performance improvement of folded # loop... @@ -50,7 +57,7 @@ # As was shown by Zou Nanhai loop unrolling can improve Intel EM64T # performance by >30% [unlike P4 32-bit case that is]. But this is # provided that loads are reordered even more aggressively! Both code -# pathes, AMD64 and EM64T, reorder loads in essentially same manner +# paths, AMD64 and EM64T, reorder loads in essentially same manner # as my IA-64 implementation. On Opteron this resulted in modest 5% # improvement [I had to test it], while final Intel P4 performance # achieves respectful 432MBps on 2.8GHz processor now. For reference. @@ -81,7 +88,7 @@ # The only code path that was not modified is P4-specific one. Non-P4 # Intel code path optimization is heavily based on submission by Maxim # Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used -# some of the ideas even in attempt to optmize the original RC4_INT +# some of the ideas even in attempt to optimize the original RC4_INT # code path... Current performance in cycles per processed byte (less # is better) and improvement coefficients relative to previous # version of this module are: @@ -92,6 +99,9 @@ # Westmere 4.2/+60% # Sandy Bridge 4.2/+120% # Atom 9.3/+80% +# VIA Nano 6.4/+4% +# Ivy Bridge 4.1/+30% +# Bulldozer 4.5/+30%(*) # # (*) But corresponding loop has less instructions, which should have # positive effect on upcoming Bulldozer, which has one less ALU. @@ -112,7 +122,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $dat="%rdi"; # arg1 @@ -132,9 +142,13 @@ RC4: or $len,$len jne .Lentry ret .Lentry: +.cfi_startproc push %rbx +.cfi_push %rbx push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 .Lprologue: mov $len,%r11 mov $inp,%r12 @@ -417,11 +431,16 @@ $code.=<<___; movl $YY#d,-4($dat) mov (%rsp),%r13 +.cfi_restore %r13 mov 8(%rsp),%r12 +.cfi_restore %r12 mov 16(%rsp),%rbx +.cfi_restore %rbx add \$24,%rsp +.cfi_adjust_cfa_offset -24 .Lepilogue: ret +.cfi_endproc .size RC4,.-RC4 ___ } @@ -430,10 +449,10 @@ $idx="%r8"; $ido="%r9"; $code.=<<___; -.globl private_RC4_set_key -.type private_RC4_set_key,\@function,3 +.globl RC4_set_key +.type RC4_set_key,\@function,3 .align 16 -private_RC4_set_key: +RC4_set_key: lea 8($dat),$dat lea ($inp,$len),$inp neg $len @@ -500,7 +519,7 @@ private_RC4_set_key: mov %eax,-8($dat) mov %eax,-4($dat) ret -.size private_RC4_set_key,.-private_RC4_set_key +.size RC4_set_key,.-RC4_set_key .globl RC4_options .type RC4_options,\@abi-omnipotent @@ -645,16 +664,16 @@ key_se_handler: .rva .LSEH_end_RC4 .rva .LSEH_info_RC4 - .rva .LSEH_begin_private_RC4_set_key - .rva .LSEH_end_private_RC4_set_key - .rva .LSEH_info_private_RC4_set_key + .rva .LSEH_begin_RC4_set_key + .rva .LSEH_end_RC4_set_key + .rva .LSEH_info_RC4_set_key .section .xdata .align 8 .LSEH_info_RC4: .byte 9,0,0,0 .rva stream_se_handler -.LSEH_info_private_RC4_set_key: +.LSEH_info_RC4_set_key: .byte 9,0,0,0 .rva key_se_handler ___ |