Diffstat (limited to 'crypto/poly1305/asm')
-rwxr-xr-x | crypto/poly1305/asm/poly1305-armv4.pl    |   53
-rwxr-xr-x | crypto/poly1305/asm/poly1305-armv8.pl    |   69
-rwxr-xr-x | crypto/poly1305/asm/poly1305-c64xplus.pl |    5
-rw-r--r-- | crypto/poly1305/asm/poly1305-ia64.S      |  365
-rwxr-xr-x | crypto/poly1305/asm/poly1305-mips.pl     |   10
-rwxr-xr-x | crypto/poly1305/asm/poly1305-ppc.pl      | 1564
-rwxr-xr-x | crypto/poly1305/asm/poly1305-ppcfp.pl    |   10
-rwxr-xr-x | crypto/poly1305/asm/poly1305-s390x.pl    | 1123
-rwxr-xr-x | crypto/poly1305/asm/poly1305-sparcv9.pl  |   18
-rwxr-xr-x | crypto/poly1305/asm/poly1305-x86.pl      |    5
-rwxr-xr-x | crypto/poly1305/asm/poly1305-x86_64.pl   |   20
11 files changed, 2854 insertions, 388 deletions
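
Nearly every Perl module touched by this diff replaces its old "$flavour = shift; ... $output = shift;" handling with the same two-line idiom for picking $output and $flavour out of @ARGV. The following standalone sketch only restates that idiom; the sample invocations and the final print are illustrative additions, not part of the patch:

#!/usr/bin/env perl
use strict;
use warnings;

# Same convention as the patched modules:
# $output is the last argument if it looks like a file (it has an extension),
# $flavour is the first argument if it doesn't look like a file.
my $output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop   : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0]      !~ m|\.|     ? shift : undef;

# "perl poly1305-armv4.pl linux32 poly1305-armv4.S"
#     -> $flavour = "linux32", $output = "poly1305-armv4.S"
# "perl poly1305-armv4.pl poly1305-armv4.S"
#     -> $flavour = undef,     $output = "poly1305-armv4.S"
# "perl poly1305-armv4.pl void"
#     -> $flavour = "void",    $output = undef (assembly goes to stdout)
printf "flavour=%s output=%s\n", $flavour // "(none)", $output // "(stdout)";
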
diff --git a/crypto/poly1305/asm/poly1305-armv4.pl b/crypto/poly1305/asm/poly1305-armv4.pl index 70f46cd140aa..041bfd46e699 100755 --- a/crypto/poly1305/asm/poly1305-armv4.pl +++ b/crypto/poly1305/asm/poly1305-armv4.pl @@ -1,7 +1,7 @@ #! /usr/bin/env perl -# Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved. +# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. # -# Licensed under the OpenSSL license (the "License"). You may not use +# Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html @@ -28,9 +28,10 @@ # the cost of 15/12% regression on Cortex-A5/A7, it's even possible # to improve Cortex-A9 result, but then A5/A7 loose more than 20%; -$flavour = shift; -if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } -else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; @@ -38,9 +39,10 @@ if ($flavour && $flavour ne "void") { ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; - open STDOUT,"| \"$^X\" $xlate $flavour $output"; + open STDOUT,"| \"$^X\" $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; } else { - open STDOUT,">$output"; + $output and open STDOUT,">$output"; } ($ctx,$inp,$len,$padbit)=map("r$_",(0..3)); @@ -48,7 +50,6 @@ if ($flavour && $flavour ne "void") { $code.=<<___; #include "arm_arch.h" -.text #if defined(__thumb2__) .syntax unified .thumb @@ -56,6 +57,8 @@ $code.=<<___; .code 32 #endif +.text + .globl poly1305_emit .globl poly1305_blocks .globl poly1305_init @@ -100,8 +103,10 @@ poly1305_init: and r4,r4,r10 #if __ARM_MAX_ARCH__>=7 +# if !defined(_WIN32) ldr r12,[r11,r12] @ OPENSSL_armcap_P -# ifdef __APPLE__ +# endif +# if defined(__APPLE__) || defined(_WIN32) ldr r12,[r12] # endif #endif @@ -116,32 +121,22 @@ poly1305_init: #if __ARM_MAX_ARCH__>=7 tst r12,#ARMV7_NEON @ check for NEON -# ifdef __APPLE__ - adr r9,poly1305_blocks_neon - adr r11,poly1305_blocks -# ifdef __thumb2__ - it ne -# endif +# ifdef __thumb2__ + adr r9,.Lpoly1305_blocks_neon + adr r11,.Lpoly1305_blocks + adr r12,.Lpoly1305_emit + adr r10,.Lpoly1305_emit_neon + itt ne movne r11,r9 - adr r12,poly1305_emit - adr r10,poly1305_emit_neon -# ifdef __thumb2__ - it ne -# endif movne r12,r10 + orr r11,r11,#1 @ thumb-ify address + orr r12,r12,#1 # else -# ifdef __thumb2__ - itete eq -# endif addeq r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init) addne r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init) addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init) addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init) # endif -# ifdef __thumb2__ - orr r12,r12,#1 @ thumb-ify address - orr r11,r11,#1 -# endif #endif ldrb r9,[$inp,#11] orr r6,r6,r7,lsl#8 @@ -1232,7 +1227,11 @@ poly1305_emit_neon: .Lzeros: .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .LOPENSSL_armcap: +# ifdef _WIN32 +.word OPENSSL_armcap_P +# else .word OPENSSL_armcap_P-.Lpoly1305_init +# endif #endif ___ } } diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl index 
2a42b64a929c..dc39f4053fe6 100755 --- a/crypto/poly1305/asm/poly1305-armv8.pl +++ b/crypto/poly1305/asm/poly1305-armv8.pl @@ -1,7 +1,7 @@ #! /usr/bin/env perl -# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. +# Copyright 2016-2023 The OpenSSL Project Authors. All Rights Reserved. # -# Licensed under the OpenSSL license (the "License"). You may not use +# Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html @@ -29,20 +29,24 @@ # X-Gene 2.13/+68% 2.27 # Mongoose 1.77/+75% 1.12 # Kryo 2.70/+55% 1.13 +# ThunderX2 1.17/+95% 1.36 # # (*) estimate based on resources availability is less than 1.0, # i.e. measured result is worse than expected, presumably binary # translator is not almighty; -$flavour=shift; -$output=shift; +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; *STDOUT=*OUT; my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); @@ -75,18 +79,13 @@ poly1305_init: csel x0,xzr,x0,eq b.eq .Lno_key -#ifdef __ILP32__ - ldrsw $t1,.LOPENSSL_armcap_P -#else - ldr $t1,.LOPENSSL_armcap_P -#endif - adr $t0,.LOPENSSL_armcap_P + adrp x17,OPENSSL_armcap_P + ldr w17,[x17,#:lo12:OPENSSL_armcap_P] ldp $r0,$r1,[$inp] // load key mov $s1,#0xfffffffc0fffffff movk $s1,#0x0fff,lsl#48 - ldr w17,[$t0,$t1] -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev $r0,$r0 // flip bytes rev $r1,$r1 #endif @@ -97,10 +96,10 @@ poly1305_init: tst w17,#ARMV7_NEON - adr $d0,poly1305_blocks - adr $r0,poly1305_blocks_neon - adr $d1,poly1305_emit - adr $r1,poly1305_emit_neon + adr $d0,.Lpoly1305_blocks + adr $r0,.Lpoly1305_blocks_neon + adr $d1,.Lpoly1305_emit + adr $r1,.Lpoly1305_emit_neon csel $d0,$d0,$r0,eq csel $d1,$d1,$r1,eq @@ -119,6 +118,7 @@ poly1305_init: .type poly1305_blocks,%function .align 5 poly1305_blocks: +.Lpoly1305_blocks: ands $len,$len,#-16 b.eq .Lno_data @@ -132,7 +132,7 @@ poly1305_blocks: .Loop: ldp $t0,$t1,[$inp],#16 // load input sub $len,$len,#16 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev $t0,$t0 rev $t1,$t1 #endif @@ -183,6 +183,7 @@ poly1305_blocks: .type poly1305_emit,%function .align 5 poly1305_emit: +.Lpoly1305_emit: ldp $h0,$h1,[$ctx] // load hash base 2^64 ldr $h2,[$ctx,#16] ldp $t0,$t1,[$nonce] // load nonce @@ -196,13 +197,13 @@ poly1305_emit: csel $h0,$h0,$d0,eq csel $h1,$h1,$d1,eq -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror $t0,$t0,#32 // flip nonce words ror $t1,$t1,#32 #endif adds $h0,$h0,$t0 // accumulate nonce adc $h1,$h1,$t1 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev $h0,$h0 // flip output bytes rev $h1,$h1 #endif @@ -289,10 +290,11 @@ poly1305_splat: .type poly1305_blocks_neon,%function .align 5 poly1305_blocks_neon: +.Lpoly1305_blocks_neon: ldr $is_base2_26,[$ctx,#24] cmp $len,#128 b.hs .Lblocks_neon - cbz $is_base2_26,poly1305_blocks + cbz $is_base2_26,.Lpoly1305_blocks .Lblocks_neon: .inst 0xd503233f // paciasp @@ -333,7 +335,7 @@ poly1305_blocks_neon: 
adcs $h1,$h1,xzr adc $h2,$h2,xzr -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev $d0,$d0 rev $d1,$d1 #endif @@ -379,7 +381,7 @@ poly1305_blocks_neon: ldp $d0,$d1,[$inp],#16 // load input sub $len,$len,#16 add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev $d0,$d0 rev $d1,$d1 #endif @@ -435,7 +437,7 @@ poly1305_blocks_neon: csel $in2,$zeros,$in2,lo mov x4,#1 - str x4,[$ctx,#-24] // set is_base2_26 + stur x4,[$ctx,#-24] // set is_base2_26 sub $ctx,$ctx,#48 // restore original $ctx b .Ldo_neon @@ -464,7 +466,7 @@ poly1305_blocks_neon: lsl $padbit,$padbit,#24 add x15,$ctx,#48 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 @@ -500,7 +502,7 @@ poly1305_blocks_neon: ld1 {$S2,$R3,$S3,$R4},[x15],#64 ld1 {$S4},[x15] -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 @@ -561,7 +563,7 @@ poly1305_blocks_neon: umull $ACC1,$IN23_0,${R1}[2] ldp x9,x13,[$in2],#48 umull $ACC0,$IN23_0,${R0}[2] -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 @@ -626,7 +628,7 @@ poly1305_blocks_neon: umlal $ACC4,$IN01_2,${R2}[0] umlal $ACC1,$IN01_2,${S4}[0] umlal $ACC2,$IN01_2,${R0}[0] -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 @@ -872,6 +874,7 @@ poly1305_blocks_neon: .type poly1305_emit_neon,%function .align 5 poly1305_emit_neon: +.Lpoly1305_emit_neon: ldr $is_base2_26,[$ctx,#24] cbz $is_base2_26,poly1305_emit @@ -906,13 +909,13 @@ poly1305_emit_neon: csel $h0,$h0,$d0,eq csel $h1,$h1,$d1,eq -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ ror $t0,$t0,#32 // flip nonce words ror $t1,$t1,#32 #endif adds $h0,$h0,$t0 // accumulate nonce adc $h1,$h1,$t1 -#ifdef __ARMEB__ +#ifdef __AARCH64EB__ rev $h0,$h0 // flip output bytes rev $h1,$h1 #endif @@ -924,12 +927,6 @@ poly1305_emit_neon: .align 5 .Lzeros: .long 0,0,0,0,0,0,0,0 -.LOPENSSL_armcap_P: -#ifdef __ILP32__ -.long OPENSSL_armcap_P-. -#else -.quad OPENSSL_armcap_P-. -#endif .asciz "Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" .align 2 ___ diff --git a/crypto/poly1305/asm/poly1305-c64xplus.pl b/crypto/poly1305/asm/poly1305-c64xplus.pl index 93fef37e605b..2bcdced7f45c 100755 --- a/crypto/poly1305/asm/poly1305-c64xplus.pl +++ b/crypto/poly1305/asm/poly1305-c64xplus.pl @@ -1,7 +1,7 @@ #! /usr/bin/env perl # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. # -# Licensed under the OpenSSL license (the "License"). You may not use +# Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html @@ -26,8 +26,7 @@ # time dependent on input length. This module on the other hand is free # from such limitation. -$output=pop; -open STDOUT,">$output"; +$output=pop and open STDOUT,">$output"; ($CTXA,$INPB,$LEN,$PADBIT)=("A4","B4","A6","B6"); ($H0,$H1,$H2,$H3,$H4,$H4a)=("A8","B8","A10","B10","B2",$LEN); diff --git a/crypto/poly1305/asm/poly1305-ia64.S b/crypto/poly1305/asm/poly1305-ia64.S new file mode 100644 index 000000000000..54d6454f0322 --- /dev/null +++ b/crypto/poly1305/asm/poly1305-ia64.S @@ -0,0 +1,365 @@ +// ==================================================================== +// Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL +// project. +// ==================================================================== +// +// Poly1305 for Itanium. +// +// January 2019 +// +// Performance was reported to be ~2.1 cycles per byte on Itanium 2. 
+// With exception for processors in 95xx family, which have higher +// floating-point instructions' latencies and deliver ~2.6 cpb. +// Comparison to compiler-generated code is not exactly fair, because +// of different radixes. But just for reference, it was observed to be +// >3x faster. Originally it was argued that floating-point base 2^32 +// implementation would be optimal. Upon closer look estimate for below +// integer base 2^64 implementation turned to be approximately same on +// Itanium 2. But floating-point code would be larger, and have higher +// overhead, which would negatively affect small-block performance... + +#if defined(_HPUX_SOURCE) +# if !defined(_LP64) +# define ADDP addp4 +# else +# define ADDP add +# endif +# define RUM rum +# define SUM sum +#else +# define ADDP add +# define RUM nop +# define SUM nop +#endif + +.text +.explicit + +.global poly1305_init# +.proc poly1305_init# +.align 64 +poly1305_init: + .prologue + .save ar.pfs,r2 +{ .mmi; alloc r2=ar.pfs,2,0,0,0 + cmp.eq p6,p7=0,r33 } // key == NULL? +{ .mmi; ADDP r9=8,r32 + ADDP r10=16,r32 + ADDP r32=0,r32 };; + .body +{ .mmi; st8 [r32]=r0,24 // ctx->h0 = 0 + st8 [r9]=r0 // ctx->h1 = 0 +(p7) ADDP r8=0,r33 } +{ .mib; st8 [r10]=r0 // ctx->h2 = 0 +(p6) mov r8=0 +(p6) br.ret.spnt b0 };; + +{ .mmi; ADDP r9=1,r33 + ADDP r10=2,r33 + ADDP r11=3,r33 };; +{ .mmi; ld1 r16=[r8],4 // load key, little-endian + ld1 r17=[r9],4 } +{ .mmi; ld1 r18=[r10],4 + ld1 r19=[r11],4 };; +{ .mmi; ld1 r20=[r8],4 + ld1 r21=[r9],4 } +{ .mmi; ld1 r22=[r10],4 + ld1 r23=[r11],4 + and r19=15,r19 };; +{ .mmi; ld1 r24=[r8],4 + ld1 r25=[r9],4 + and r20=-4,r20 } +{ .mmi; ld1 r26=[r10],4 + ld1 r27=[r11],4 + and r23=15,r23 };; +{ .mmi; ld1 r28=[r8],4 + ld1 r29=[r9],4 + and r24=-4,r24 } +{ .mmi; ld1 r30=[r10],4 + ld1 r31=[r11],4 + and r27=15,r27 };; + +{ .mii; and r28=-4,r28 + dep r16=r17,r16,8,8 + dep r18=r19,r18,8,8 };; +{ .mii; and r31=15,r31 + dep r16=r18,r16,16,16 + dep r20=r21,r20,8,8 };; +{ .mii; dep r16=r20,r16,32,16 + dep r22=r23,r22,8,8 };; +{ .mii; dep r16=r22,r16,48,16 + dep r24=r25,r24,8,8 };; +{ .mii; dep r26=r27,r26,8,8 + dep r28=r29,r28,8,8 };; +{ .mii; dep r24=r26,r24,16,16 + dep r30=r31,r30,8,8 };; +{ .mii; st8 [r32]=r16,8 // ctx->r0 + dep r24=r28,r24,32,16;; + dep r24=r30,r24,48,16 };; +{ .mii; st8 [r32]=r24,8 // ctx->r1 + shr.u r25=r24,2;; + add r25=r25,r24 };; +{ .mib; st8 [r32]=r25 // ctx->s1 + mov r8=0 + br.ret.sptk b0 };; +.endp poly1305_init# + +h0=r17; h1=r18; h2=r19; +i0=r20; i1=r21; +HF0=f8; HF1=f9; HF2=f10; +RF0=f11; RF1=f12; SF1=f13; + +.global poly1305_blocks# +.proc poly1305_blocks# +.align 64 +poly1305_blocks: + .prologue + .save ar.pfs,r2 +{ .mii; alloc r2=ar.pfs,4,1,0,0 + .save ar.lc,r3 + mov r3=ar.lc + .save pr,r36 + mov r36=pr } + + .body +{ .mmi; ADDP r8=0,r32 + ADDP r9=8,r32 + and r29=7,r33 };; +{ .mmi; ld8 h0=[r8],16 + ld8 h1=[r9],16 + and r33=-8,r33 };; +{ .mmi; ld8 h2=[r8],16 + ldf8 RF0=[r9],16 + shr.u r34=r34,4 };; +{ .mmi; ldf8 RF1=[r8],-32 + ldf8 SF1=[r9],-32 + cmp.ltu p16,p17=1,r34 };; +{ .mmi; +(p16) add r34=-2,r34 +(p17) mov r34=0 + ADDP r10=0,r33 } +{ .mii; ADDP r11=8,r33 +(p16) mov ar.ec=2 +(p17) mov ar.ec=1 };; +{ .mib; RUM 1<<1 // go little-endian + mov ar.lc=r34 + brp.loop.imp .Loop,.Lcend-16 } + +{ .mmi; cmp.eq p8,p7=0,r29 + cmp.eq p9,p0=1,r29 + cmp.eq p10,p0=2,r29 } +{ .mmi; cmp.eq p11,p0=3,r29 + cmp.eq p12,p0=4,r29 + cmp.eq p13,p0=5,r29 } +{ .mmi; cmp.eq p14,p0=6,r29 + cmp.eq p15,p0=7,r29 + add r16=16,r10 };; + +{ .mmb; +(p8) ld8 i0=[r10],16 // aligned input +(p8) ld8 i1=[r11],16 +(p8) 
br.cond.sptk .Loop };; + + // align first block + .pred.rel "mutex",p8,p9,p10,p11,p12,p13,p14,p15 +{ .mmi; (p7) ld8 r14=[r10],24 + (p7) ld8 r15=[r11],24 } + +{ .mii; (p7) ld8 r16=[r16] + nop.i 0;; + (p15) shrp i0=r15,r14,56 } +{ .mii; (p15) shrp i1=r16,r15,56 + (p14) shrp i0=r15,r14,48 } +{ .mii; (p14) shrp i1=r16,r15,48 + (p13) shrp i0=r15,r14,40 } +{ .mii; (p13) shrp i1=r16,r15,40 + (p12) shrp i0=r15,r14,32 } +{ .mii; (p12) shrp i1=r16,r15,32 + (p11) shrp i0=r15,r14,24 } +{ .mii; (p11) shrp i1=r16,r15,24 + (p10) shrp i0=r15,r14,16 } +{ .mii; (p10) shrp i1=r16,r15,16 + (p9) shrp i0=r15,r14,8 } +{ .mii; (p9) shrp i1=r16,r15,8 + mov r14=r16 };; + +.Loop: + .pred.rel "mutex",p8,p9,p10,p11,p12,p13,p14,p15 +{ .mmi; add h0=h0,i0 + add h1=h1,i1 + add h2=h2,r35 };; +{ .mmi; setf.sig HF0=h0 + cmp.ltu p6,p0=h0,i0 + cmp.ltu p7,p0=h1,i1 };; +{ .mmi; (p6) add h1=1,h1;; + setf.sig HF1=h1 + (p6) cmp.eq.or p7,p0=0,h1 };; +{ .mmi; (p7) add h2=1,h2;; + setf.sig HF2=h2 };; + +{ .mfi; (p16) ld8 r15=[r10],16 + xmpy.lu f32=HF0,RF0 } +{ .mfi; (p16) ld8 r16=[r11],16 + xmpy.hu f33=HF0,RF0 } +{ .mfi; xmpy.lu f36=HF0,RF1 } +{ .mfi; xmpy.hu f37=HF0,RF1 };; +{ .mfi; xmpy.lu f34=HF1,SF1 + (p15) shrp i0=r15,r14,56 } +{ .mfi; xmpy.hu f35=HF1,SF1 } +{ .mfi; xmpy.lu f38=HF1,RF0 + (p15) shrp i1=r16,r15,56 } +{ .mfi; xmpy.hu f39=HF1,RF0 } +{ .mfi; xmpy.lu f40=HF2,SF1 + (p14) shrp i0=r15,r14,48 } +{ .mfi; xmpy.lu f41=HF2,RF0 };; + +{ .mmi; getf.sig r22=f32 + getf.sig r23=f33 + (p14) shrp i1=r16,r15,48 } +{ .mmi; getf.sig r24=f34 + getf.sig r25=f35 + (p13) shrp i0=r15,r14,40 } +{ .mmi; getf.sig r26=f36 + getf.sig r27=f37 + (p13) shrp i1=r16,r15,40 } +{ .mmi; getf.sig r28=f38 + getf.sig r29=f39 + (p12) shrp i0=r15,r14,32 } +{ .mmi; getf.sig r30=f40 + getf.sig r31=f41 };; + +{ .mmi; add h0=r22,r24 + add r23=r23,r25 + (p12) shrp i1=r16,r15,32 } +{ .mmi; add h1=r26,r28 + add r27=r27,r29 + (p11) shrp i0=r15,r14,24 };; +{ .mmi; cmp.ltu p6,p0=h0,r24 + cmp.ltu p7,p0=h1,r28 + add r23=r23,r30 };; +{ .mmi; (p6) add r23=1,r23 + (p7) add r27=1,r27 + (p11) shrp i1=r16,r15,24 };; +{ .mmi; add h1=h1,r23;; + cmp.ltu p6,p7=h1,r23 + (p10) shrp i0=r15,r14,16 };; +{ .mmi; (p6) add h2=r31,r27,1 + (p7) add h2=r31,r27 + (p10) shrp i1=r16,r15,16 };; + +{ .mmi; (p8) mov i0=r15 + and r22=-4,h2 + shr.u r23=h2,2 };; +{ .mmi; add r22=r22,r23 + and h2=3,h2 + (p9) shrp i0=r15,r14,8 };; + +{ .mmi; add h0=h0,r22;; + cmp.ltu p6,p0=h0,r22 + (p9) shrp i1=r16,r15,8 };; +{ .mmi; (p8) mov i1=r16 + (p6) cmp.eq.unc p7,p0=-1,h1 + (p6) add h1=1,h1 };; +{ .mmb; (p7) add h2=1,h2 + mov r14=r16 + br.ctop.sptk .Loop };; +.Lcend: + +{ .mii; SUM 1<<1 // back to big-endian + mov ar.lc=r3 };; + +{ .mmi; st8 [r8]=h0,16 + st8 [r9]=h1 + mov pr=r36,0x1ffff };; +{ .mmb; st8 [r8]=h2 + rum 1<<5 + br.ret.sptk b0 };; +.endp poly1305_blocks# + +.global poly1305_emit# +.proc poly1305_emit# +.align 64 +poly1305_emit: + .prologue + .save ar.pfs,r2 +{ .mmi; alloc r2=ar.pfs,3,0,0,0 + ADDP r8=0,r32 + ADDP r9=8,r32 };; + + .body +{ .mmi; ld8 r16=[r8],16 // load hash + ld8 r17=[r9] + ADDP r10=0,r34 };; +{ .mmi; ld8 r18=[r8] + ld4 r24=[r10],8 // load nonce + ADDP r11=4,r34 };; + +{ .mmi; ld4 r25=[r11],8 + ld4 r26=[r10] + add r20=5,r16 };; + +{ .mmi; ld4 r27=[r11] + cmp.ltu p6,p7=r20,r16 + shl r25=r25,32 };; +{ .mmi; +(p6) add r21=1,r17 +(p7) add r21=0,r17 +(p6) cmp.eq.or.andcm p6,p7=-1,r17 };; +{ .mmi; +(p6) add r22=1,r18 +(p7) add r22=0,r18 + shl r27=r27,32 };; +{ .mmi; or r24=r24,r25 + or r26=r26,r27 + cmp.leu p6,p7=4,r22 };; +{ .mmi; +(p6) add r16=r20,r24 +(p7) add r16=r16,r24 +(p6) add 
r17=r21,r26 };; +{ .mii; +(p7) add r17=r17,r26 + cmp.ltu p6,p7=r16,r24;; +(p6) add r17=1,r17 };; + +{ .mmi; ADDP r8=0,r33 + ADDP r9=4,r33 + shr.u r20=r16,32 } +{ .mmi; ADDP r10=8,r33 + ADDP r11=12,r33 + shr.u r21=r17,32 };; + +{ .mmi; st1 [r8]=r16,1 // write mac, little-endian + st1 [r9]=r20,1 + shr.u r16=r16,8 } +{ .mii; st1 [r10]=r17,1 + shr.u r20=r20,8 + shr.u r17=r17,8 } +{ .mmi; st1 [r11]=r21,1 + shr.u r21=r21,8 };; + +{ .mmi; st1 [r8]=r16,1 + st1 [r9]=r20,1 + shr.u r16=r16,8 } +{ .mii; st1 [r10]=r17,1 + shr.u r20=r20,8 + shr.u r17=r17,8 } +{ .mmi; st1 [r11]=r21,1 + shr.u r21=r21,8 };; + +{ .mmi; st1 [r8]=r16,1 + st1 [r9]=r20,1 + shr.u r16=r16,8 } +{ .mii; st1 [r10]=r17,1 + shr.u r20=r20,8 + shr.u r17=r17,8 } +{ .mmi; st1 [r11]=r21,1 + shr.u r21=r21,8 };; + +{ .mmi; st1 [r8]=r16 + st1 [r9]=r20 } +{ .mmb; st1 [r10]=r17 + st1 [r11]=r21 + br.ret.sptk b0 };; +.endp poly1305_emit# + +stringz "Poly1305 for IA64, CRYPTOGAMS by \@dot-asm" diff --git a/crypto/poly1305/asm/poly1305-mips.pl b/crypto/poly1305/asm/poly1305-mips.pl index 965825dc3eda..6c0b3292d07c 100755 --- a/crypto/poly1305/asm/poly1305-mips.pl +++ b/crypto/poly1305/asm/poly1305-mips.pl @@ -1,7 +1,7 @@ #! /usr/bin/env perl # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. # -# Licensed under the OpenSSL license (the "License"). You may not use +# Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html @@ -56,7 +56,11 @@ # ###################################################################### -$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +# supported flavours are o32,n32,64,nubi32,nubi64, default is o32 +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : "o32"; die "MIPS64 only" unless ($flavour =~ /64|n32/i); @@ -431,7 +435,7 @@ poly1305_emit: ___ } -$output=pop and open STDOUT,">$output"; +$output and open STDOUT,">$output"; print $code; close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/poly1305/asm/poly1305-ppc.pl b/crypto/poly1305/asm/poly1305-ppc.pl index e5d6933ac4d5..9f9b27cac336 100755 --- a/crypto/poly1305/asm/poly1305-ppc.pl +++ b/crypto/poly1305/asm/poly1305-ppc.pl @@ -1,17 +1,17 @@ #! /usr/bin/env perl -# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. +# Copyright 2016-2024 The OpenSSL Project Authors. All Rights Reserved. # -# Licensed under the OpenSSL license (the "License"). You may not use +# Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. +# Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL +# project. 
The module is dual licensed under OpenSSL and CRYPTOGAMS +# licenses depending on where you obtain it. For further details see +# https://github.com/dot-asm/cryptogams/. # ==================================================================== # # This module implements Poly1305 hash for PowerPC. @@ -44,8 +44,18 @@ # # On side note, Power ISA 2.07 enables vector base 2^26 implementation, # and POWER8 might have capacity to break 1.0 cycle per byte barrier... +# +# January 2019 +# +# ... Unfortunately not:-( Estimate was a projection of ARM result, +# but ARM has vector multiply-n-add instruction, while PowerISA does +# not, not one usable in the context. Improvement is ~40% over -m64 +# result above and is ~1.43 on little-endian systems. -$flavour = shift; +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; if ($flavour =~ /64/) { $SIZE_T =8; @@ -72,7 +82,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or die "can't locate ppc-xlate.pl"; -open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; +open STDOUT,"| $^X $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; $FRAME=24*$SIZE_T; @@ -99,6 +110,7 @@ $code.=<<___; std r0,0($ctx) # zero hash value std r0,8($ctx) std r0,16($ctx) + stw r0,24($ctx) # clear is_base2_26 $UCMP $inp,r0 beq- Lno_key @@ -140,6 +152,7 @@ Lno_key: .globl .poly1305_blocks .align 4 .poly1305_blocks: +Lpoly1305_blocks: srdi. $len,$len,4 beq- Labort @@ -238,60 +251,120 @@ Labort: .long 0 .byte 0,12,4,1,0x80,5,4,0 .size .poly1305_blocks,.-.poly1305_blocks +___ +{ +my ($h0,$h1,$h2,$h3,$h4,$t0) = map("r$_",(7..12)); +$code.=<<___; .globl .poly1305_emit -.align 4 +.align 5 .poly1305_emit: - ld $h0,0($ctx) # load hash - ld $h1,8($ctx) - ld $h2,16($ctx) - ld $padbit,0($nonce) # load nonce - ld $nonce,8($nonce) - - addic $d0,$h0,5 # compare to modulus - addze $d1,$h1 - addze $d2,$h2 - - srdi $mask,$d2,2 # did it carry/borrow? 
- neg $mask,$mask + lwz $h0,0($ctx) # load hash value base 2^26 + lwz $h1,4($ctx) + lwz $h2,8($ctx) + lwz $h3,12($ctx) + lwz $h4,16($ctx) + lwz r0,24($ctx) # is_base2_26 + + sldi $h1,$h1,26 # base 2^26 -> base 2^64 + sldi $t0,$h2,52 + srdi $h2,$h2,12 + sldi $h3,$h3,14 + add $h0,$h0,$h1 + addc $h0,$h0,$t0 + sldi $t0,$h4,40 + srdi $h4,$h4,24 + adde $h1,$h2,$h3 + addc $h1,$h1,$t0 + addze $h2,$h4 + + ld $h3,0($ctx) # load hash value base 2^64 + ld $h4,8($ctx) + ld $t0,16($ctx) + + neg r0,r0 + xor $h0,$h0,$h3 # choose between radixes + xor $h1,$h1,$h4 + xor $h2,$h2,$t0 + and $h0,$h0,r0 + and $h1,$h1,r0 + and $h2,$h2,r0 + xor $h0,$h0,$h3 + xor $h1,$h1,$h4 + xor $h2,$h2,$t0 + + addic $h3,$h0,5 # compare to modulus + addze $h4,$h1 + addze $t0,$h2 + + srdi $t0,$t0,2 # see if it carried/borrowed + neg $t0,$t0 + + andc $h0,$h0,$t0 + and $h3,$h3,$t0 + andc $h1,$h1,$t0 + and $h4,$h4,$t0 + or $h0,$h0,$h3 + or $h1,$h1,$h4 + + lwz $t0,4($nonce) + lwz $h2,12($nonce) + lwz $h3,0($nonce) + lwz $h4,8($nonce) + + insrdi $h3,$t0,32,0 + insrdi $h4,$h2,32,0 + + addc $h0,$h0,$h3 # accumulate nonce + adde $h1,$h1,$h4 + + addi $ctx,$mac,-1 + addi $mac,$mac,7 + + stbu $h0,1($ctx) # write [little-endian] result + srdi $h0,$h0,8 + stbu $h1,1($mac) + srdi $h1,$h1,8 + + stbu $h0,1($ctx) + srdi $h0,$h0,8 + stbu $h1,1($mac) + srdi $h1,$h1,8 + + stbu $h0,1($ctx) + srdi $h0,$h0,8 + stbu $h1,1($mac) + srdi $h1,$h1,8 + + stbu $h0,1($ctx) + srdi $h0,$h0,8 + stbu $h1,1($mac) + srdi $h1,$h1,8 + + stbu $h0,1($ctx) + srdi $h0,$h0,8 + stbu $h1,1($mac) + srdi $h1,$h1,8 + + stbu $h0,1($ctx) + srdi $h0,$h0,8 + stbu $h1,1($mac) + srdi $h1,$h1,8 + + stbu $h0,1($ctx) + srdi $h0,$h0,8 + stbu $h1,1($mac) + srdi $h1,$h1,8 + + stbu $h0,1($ctx) + stbu $h1,1($mac) - andc $h0,$h0,$mask - and $d0,$d0,$mask - andc $h1,$h1,$mask - and $d1,$d1,$mask - or $h0,$h0,$d0 - or $h1,$h1,$d1 -___ -$code.=<<___ if (!$LITTLE_ENDIAN); - rotldi $padbit,$padbit,32 # flip nonce words - rotldi $nonce,$nonce,32 -___ -$code.=<<___; - addc $h0,$h0,$padbit # accumulate nonce - adde $h1,$h1,$nonce -___ -$code.=<<___ if ($LITTLE_ENDIAN); - std $h0,0($mac) # write result - std $h1,8($mac) -___ -$code.=<<___ if (!$LITTLE_ENDIAN); - extrdi r0,$h0,32,0 - li $d0,4 - stwbrx $h0,0,$mac # write result - extrdi $h0,$h1,32,0 - li $d1,8 - stwbrx r0,$d0,$mac - li $d2,12 - stwbrx $h1,$d1,$mac - stwbrx $h0,$d2,$mac -___ -$code.=<<___; blr .long 0 .byte 0,12,0x14,0,0,0,3,0 .size .poly1305_emit,.-.poly1305_emit ___ - } else { +} } else { ############################################################################### # base 2^32 implementation @@ -309,6 +382,7 @@ $code.=<<___; stw r0,8($ctx) stw r0,12($ctx) stw r0,16($ctx) + stw r0,24($ctx) # clear is_base2_26 $UCMP $inp,r0 beq- Lno_key @@ -353,6 +427,7 @@ Lno_key: .globl .poly1305_blocks .align 4 .poly1305_blocks: +Lpoly1305_blocks: srwi. 
$len,$len,4 beq- Labort @@ -560,17 +635,389 @@ Labort: .long 0 .byte 0,12,4,1,0x80,18,4,0 .size .poly1305_blocks,.-.poly1305_blocks +___ +{ +my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(6..12)); +$code.=<<___; .globl .poly1305_emit -.align 4 +.align 5 .poly1305_emit: - $STU $sp,-$FRAME($sp) + lwz r0,24($ctx) # is_base2_26 + lwz $h0,0($ctx) # load hash value + lwz $h1,4($ctx) + lwz $h2,8($ctx) + lwz $h3,12($ctx) + lwz $h4,16($ctx) + cmplwi r0,0 + beq Lemit_base2_32 + + slwi $t0,$h1,26 # base 2^26 -> base 2^32 + srwi $h1,$h1,6 + slwi $t1,$h2,20 + srwi $h2,$h2,12 + addc $h0,$h0,$t0 + slwi $t0,$h3,14 + srwi $h3,$h3,18 + adde $h1,$h1,$t1 + slwi $t1,$h4,8 + srwi $h4,$h4,24 + adde $h2,$h2,$t0 + adde $h3,$h3,$t1 + addze $h4,$h4 + +Lemit_base2_32: + addic r0,$h0,5 # compare to modulus + addze r0,$h1 + addze r0,$h2 + addze r0,$h3 + addze r0,$h4 + + srwi r0,r0,2 # see if it carried/borrowed + neg r0,r0 + andi. r0,r0,5 + + addc $h0,$h0,r0 + lwz r0,0($nonce) + addze $h1,$h1 + lwz $t0,4($nonce) + addze $h2,$h2 + lwz $t1,8($nonce) + addze $h3,$h3 + lwz $h4,12($nonce) + + addc $h0,$h0,r0 # accumulate nonce + adde $h1,$h1,$t0 + adde $h2,$h2,$t1 + adde $h3,$h3,$h4 + + addi $ctx,$mac,-1 + addi $mac,$mac,7 + + stbu $h0,1($ctx) # write [little-endian] result + srwi $h0,$h0,8 + stbu $h2,1($mac) + srwi $h2,$h2,8 + + stbu $h0,1($ctx) + srwi $h0,$h0,8 + stbu $h2,1($mac) + srwi $h2,$h2,8 + + stbu $h0,1($ctx) + srwi $h0,$h0,8 + stbu $h2,1($mac) + srwi $h2,$h2,8 + + stbu $h0,1($ctx) + stbu $h2,1($mac) + + stbu $h1,1($ctx) + srwi $h1,$h1,8 + stbu $h3,1($mac) + srwi $h3,$h3,8 + + stbu $h1,1($ctx) + srwi $h1,$h1,8 + stbu $h3,1($mac) + srwi $h3,$h3,8 + + stbu $h1,1($ctx) + srwi $h1,$h1,8 + stbu $h3,1($mac) + srwi $h3,$h3,8 + + stbu $h1,1($ctx) + stbu $h3,1($mac) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,3,0 +.size .poly1305_emit,.-.poly1305_emit +___ +} } +{{{ +######################################################################## +# PowerISA 2.07/VSX section # +######################################################################## + +my $LOCALS= 6*$SIZE_T; +my $VSXFRAME = $LOCALS + 6*$SIZE_T; + $VSXFRAME += 128; # local variables + $VSXFRAME += 12*16; # v20-v31 offload + +my $BIG_ENDIAN = ($flavour !~ /le/) ? 4 : 0; + +######################################################################## +# Layout of opaque area is following: +# +# unsigned __int32 h[5]; # current hash value base 2^26 +# unsigned __int32 pad; +# unsigned __int32 is_base2_26, pad; +# unsigned __int64 r[2]; # key value base 2^64 +# struct { unsigned __int32 r^2, r^4, r^1, r^3; } r[9]; +# +# where r^n are base 2^26 digits of powers of multiplier key. There are +# 5 digits, but last four are interleaved with multiples of 5, totalling +# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. Order of +# powers is as they appear in register, not memory. 
+ +my ($H0, $H1, $H2, $H3, $H4) = map("v$_",(0..4)); +my ($I0, $I1, $I2, $I3, $I4) = map("v$_",(5..9)); +my ($R0, $R1, $S1, $R2, $S2) = map("v$_",(10..14)); +my ($R3, $S3, $R4, $S4) = ($R1, $S1, $R2, $S2); +my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("v$_",(15..19)); +my ($T0, $T1, $T2, $T3, $T4) = map("v$_",(20..24)); +my ($_26,$_4,$_40,$_14,$mask26,$padbits,$I2perm) = map("v$_",(25..31)); +my ($x00,$x60,$x70,$x10,$x20,$x30,$x40,$x50) = (0, map("r$_",(7,8,27..31))); +my ($ctx_,$_ctx,$const) = map("r$_",(10..12)); + + if ($flavour =~ /64/) { +############################################################################### +# setup phase of poly1305_blocks_vsx is different on 32- and 64-bit platforms, +# but the base 2^26 computational part is same... + +my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(6..11,27..31)); +my $mask = "r0"; + +$code.=<<___; +.globl .poly1305_blocks_vsx +.align 5 +.poly1305_blocks_vsx: + lwz r7,24($ctx) # is_base2_26 + cmpldi $len,128 + bge __poly1305_blocks_vsx + + neg r0,r7 # is_base2_26 as mask + lwz r7,0($ctx) # load hash base 2^26 + lwz r8,4($ctx) + lwz r9,8($ctx) + lwz r10,12($ctx) + lwz r11,16($ctx) + + sldi r8,r8,26 # base 2^26 -> base 2^64 + sldi r12,r9,52 + add r7,r7,r8 + srdi r9,r9,12 + sldi r10,r10,14 + addc r7,r7,r12 + sldi r8,r11,40 + adde r9,r9,r10 + srdi r11,r11,24 + addc r9,r9,r8 + addze r11,r11 + + ld r8,0($ctx) # load hash base 2^64 + ld r10,8($ctx) + ld r12,16($ctx) + + xor r7,r7,r8 # select between radixes + xor r9,r9,r10 + xor r11,r11,r12 + and r7,r7,r0 + and r9,r9,r0 + and r11,r11,r0 + xor r7,r7,r8 + xor r9,r9,r10 + xor r11,r11,r12 + + li r0,0 + std r7,0($ctx) # store hash base 2^64 + std r9,8($ctx) + std r11,16($ctx) + stw r0,24($ctx) # clear is_base2_26 + + b Lpoly1305_blocks + .long 0 + .byte 0,12,0x14,0,0,0,4,0 +.size .poly1305_blocks_vsx,.-.poly1305_blocks_vsx + +.align 5 +__poly1305_mul: + mulld $d0,$h0,$r0 # h0*r0 + mulhdu $d1,$h0,$r0 + + mulld $t0,$h1,$s1 # h1*5*r1 + mulhdu $t1,$h1,$s1 + addc $d0,$d0,$t0 + adde $d1,$d1,$t1 + + mulld $t0,$h0,$r1 # h0*r1 + mulhdu $d2,$h0,$r1 + addc $d1,$d1,$t0 + addze $d2,$d2 + + mulld $t0,$h1,$r0 # h1*r0 + mulhdu $t1,$h1,$r0 + addc $d1,$d1,$t0 + adde $d2,$d2,$t1 + + mulld $t0,$h2,$s1 # h2*5*r1 + mulld $t1,$h2,$r0 # h2*r0 + addc $d1,$d1,$t0 + adde $d2,$d2,$t1 + + andc $t0,$d2,$mask # final reduction step + and $h2,$d2,$mask + srdi $t1,$t0,2 + add $t0,$t0,$t1 + addc $h0,$d0,$t0 + addze $h1,$d1 + addze $h2,$h2 + + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 +.size __poly1305_mul,.-__poly1305_mul + +.align 5 +__poly1305_splat: + extrdi $d0,$h0,26,38 + extrdi $d1,$h0,26,12 + stw $d0,0x00($t1) + + extrdi $d2,$h0,12,0 + slwi $d0,$d1,2 + stw $d1,0x10($t1) + add $d0,$d0,$d1 # * 5 + stw $d0,0x20($t1) + + insrdi $d2,$h1,14,38 + slwi $d0,$d2,2 + stw $d2,0x30($t1) + add $d0,$d0,$d2 # * 5 + stw $d0,0x40($t1) + + extrdi $d1,$h1,26,24 + extrdi $d2,$h1,24,0 + slwi $d0,$d1,2 + stw $d1,0x50($t1) + add $d0,$d0,$d1 # * 5 + stw $d0,0x60($t1) + + insrdi $d2,$h2,3,37 + slwi $d0,$d2,2 + stw $d2,0x70($t1) + add $d0,$d0,$d2 # * 5 + stw $d0,0x80($t1) + + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 +.size __poly1305_splat,.-__poly1305_splat + +.align 5 +__poly1305_blocks_vsx: + $STU $sp,-$VSXFRAME($sp) mflr r0 - $PUSH r28,`$FRAME-$SIZE_T*4`($sp) - $PUSH r29,`$FRAME-$SIZE_T*3`($sp) - $PUSH r30,`$FRAME-$SIZE_T*2`($sp) - $PUSH r31,`$FRAME-$SIZE_T*1`($sp) - $PUSH r0,`$FRAME+$LRSAVE`($sp) + li r10,`15+$LOCALS+128` + li r11,`31+$LOCALS+128` + mfspr r12,256 + stvx v20,r10,$sp + addi r10,r10,32 + stvx v21,r11,$sp + addi 
r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + stvx v31,r11,$sp + stw r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# save vrsave + li r12,-1 + mtspr 256,r12 # preserve all AltiVec registers + $PUSH r27,`$VSXFRAME-$SIZE_T*5`($sp) + $PUSH r28,`$VSXFRAME-$SIZE_T*4`($sp) + $PUSH r29,`$VSXFRAME-$SIZE_T*3`($sp) + $PUSH r30,`$VSXFRAME-$SIZE_T*2`($sp) + $PUSH r31,`$VSXFRAME-$SIZE_T*1`($sp) + $PUSH r0,`$VSXFRAME+$LRSAVE`($sp) + + bl LPICmeup + + li $x10,0x10 + li $x20,0x20 + li $x30,0x30 + li $x40,0x40 + li $x50,0x50 + lvx_u $mask26,$x00,$const + lvx_u $_26,$x10,$const + lvx_u $_40,$x20,$const + lvx_u $I2perm,$x30,$const + lvx_u $padbits,$x40,$const + + cmplwi r7,0 # is_base2_26? + bne Lskip_init_vsx + + ld $r0,32($ctx) # load key base 2^64 + ld $r1,40($ctx) + srdi $s1,$r1,2 + li $mask,3 + add $s1,$s1,$r1 # s1 = r1 + r1>>2 + + mr $h0,$r0 # "calculate" r^1 + mr $h1,$r1 + li $h2,0 + addi $t1,$ctx,`48+(12^$BIG_ENDIAN)` + bl __poly1305_splat + + bl __poly1305_mul # calculate r^2 + addi $t1,$ctx,`48+(4^$BIG_ENDIAN)` + bl __poly1305_splat + + bl __poly1305_mul # calculate r^3 + addi $t1,$ctx,`48+(8^$BIG_ENDIAN)` + bl __poly1305_splat + + bl __poly1305_mul # calculate r^4 + addi $t1,$ctx,`48+(0^$BIG_ENDIAN)` + bl __poly1305_splat + + ld $h0,0($ctx) # load hash + ld $h1,8($ctx) + ld $h2,16($ctx) + + extrdi $d0,$h0,26,38 # base 2^64 -> base 2^26 + extrdi $d1,$h0,26,12 + extrdi $d2,$h0,12,0 + mtvrwz $H0,$d0 + insrdi $d2,$h1,14,38 + mtvrwz $H1,$d1 + extrdi $d1,$h1,26,24 + mtvrwz $H2,$d2 + extrdi $d2,$h1,24,0 + mtvrwz $H3,$d1 + insrdi $d2,$h2,3,37 + mtvrwz $H4,$d2 +___ + } else { +############################################################################### +# 32-bit initialization + +my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(7..11,0,12)); +my ($R3,$S3,$R4,$S4)=($I1,$I2,$I3,$I4); + +$code.=<<___; +.globl .poly1305_blocks_vsx +.align 5 +.poly1305_blocks_vsx: + lwz r7,24($ctx) # is_base2_26 + cmplwi $len,128 + bge __poly1305_blocks_vsx + cmplwi r7,0 + beq Lpoly1305_blocks lwz $h0,0($ctx) # load hash lwz $h1,4($ctx) @@ -578,68 +1025,957 @@ Labort: lwz $h3,12($ctx) lwz $h4,16($ctx) - addic $d0,$h0,5 # compare to modulus - addze $d1,$h1 - addze $d2,$h2 - addze $d3,$h3 - addze $mask,$h4 + slwi $t0,$h1,26 # base 2^26 -> base 2^32 + srwi $h1,$h1,6 + slwi $t1,$h2,20 + srwi $h2,$h2,12 + addc $h0,$h0,$t0 + slwi $t0,$h3,14 + srwi $h3,$h3,18 + adde $h1,$h1,$t1 + slwi $t1,$h4,8 + srwi $h4,$h4,24 + adde $h2,$h2,$t0 + li $t0,0 + adde $h3,$h3,$t1 + addze $h4,$h4 - srwi $mask,$mask,2 # did it carry/borrow? 
- neg $mask,$mask + stw $h0,0($ctx) # store hash base 2^32 + stw $h1,4($ctx) + stw $h2,8($ctx) + stw $h3,12($ctx) + stw $h4,16($ctx) + stw $t0,24($ctx) # clear is_base2_26 - andc $h0,$h0,$mask - and $d0,$d0,$mask - andc $h1,$h1,$mask - and $d1,$d1,$mask - or $h0,$h0,$d0 - lwz $d0,0($nonce) # load nonce - andc $h2,$h2,$mask - and $d2,$d2,$mask - or $h1,$h1,$d1 - lwz $d1,4($nonce) - andc $h3,$h3,$mask - and $d3,$d3,$mask - or $h2,$h2,$d2 - lwz $d2,8($nonce) - or $h3,$h3,$d3 - lwz $d3,12($nonce) - - addc $h0,$h0,$d0 # accumulate nonce - adde $h1,$h1,$d1 - adde $h2,$h2,$d2 - adde $h3,$h3,$d3 -___ -$code.=<<___ if ($LITTLE_ENDIAN); - stw $h0,0($mac) # write result - stw $h1,4($mac) - stw $h2,8($mac) - stw $h3,12($mac) -___ -$code.=<<___ if (!$LITTLE_ENDIAN); - li $d1,4 - stwbrx $h0,0,$mac # write result - li $d2,8 - stwbrx $h1,$d1,$mac - li $d3,12 - stwbrx $h2,$d2,$mac - stwbrx $h3,$d3,$mac + b Lpoly1305_blocks + .long 0 + .byte 0,12,0x14,0,0,0,4,0 +.size .poly1305_blocks_vsx,.-.poly1305_blocks_vsx + +.align 5 +__poly1305_mul: + vmulouw $ACC0,$H0,$R0 + vmulouw $ACC1,$H1,$R0 + vmulouw $ACC2,$H2,$R0 + vmulouw $ACC3,$H3,$R0 + vmulouw $ACC4,$H4,$R0 + + vmulouw $T0,$H4,$S1 + vaddudm $ACC0,$ACC0,$T0 + vmulouw $T0,$H0,$R1 + vaddudm $ACC1,$ACC1,$T0 + vmulouw $T0,$H1,$R1 + vaddudm $ACC2,$ACC2,$T0 + vmulouw $T0,$H2,$R1 + vaddudm $ACC3,$ACC3,$T0 + vmulouw $T0,$H3,$R1 + vaddudm $ACC4,$ACC4,$T0 + + vmulouw $T0,$H3,$S2 + vaddudm $ACC0,$ACC0,$T0 + vmulouw $T0,$H4,$S2 + vaddudm $ACC1,$ACC1,$T0 + vmulouw $T0,$H0,$R2 + vaddudm $ACC2,$ACC2,$T0 + vmulouw $T0,$H1,$R2 + vaddudm $ACC3,$ACC3,$T0 + vmulouw $T0,$H2,$R2 + vaddudm $ACC4,$ACC4,$T0 + + vmulouw $T0,$H2,$S3 + vaddudm $ACC0,$ACC0,$T0 + vmulouw $T0,$H3,$S3 + vaddudm $ACC1,$ACC1,$T0 + vmulouw $T0,$H4,$S3 + vaddudm $ACC2,$ACC2,$T0 + vmulouw $T0,$H0,$R3 + vaddudm $ACC3,$ACC3,$T0 + vmulouw $T0,$H1,$R3 + vaddudm $ACC4,$ACC4,$T0 + + vmulouw $T0,$H1,$S4 + vaddudm $ACC0,$ACC0,$T0 + vmulouw $T0,$H2,$S4 + vaddudm $ACC1,$ACC1,$T0 + vmulouw $T0,$H3,$S4 + vaddudm $ACC2,$ACC2,$T0 + vmulouw $T0,$H4,$S4 + vaddudm $ACC3,$ACC3,$T0 + vmulouw $T0,$H0,$R4 + vaddudm $ACC4,$ACC4,$T0 + + ################################################################ + # lazy reduction + + vspltisb $T0,2 + vsrd $H4,$ACC3,$_26 + vsrd $H1,$ACC0,$_26 + vand $H3,$ACC3,$mask26 + vand $H0,$ACC0,$mask26 + vaddudm $H4,$H4,$ACC4 # h3 -> h4 + vaddudm $H1,$H1,$ACC1 # h0 -> h1 + + vsrd $ACC4,$H4,$_26 + vsrd $ACC1,$H1,$_26 + vand $H4,$H4,$mask26 + vand $H1,$H1,$mask26 + vaddudm $H0,$H0,$ACC4 + vaddudm $H2,$ACC2,$ACC1 # h1 -> h2 + + vsld $ACC4,$ACC4,$T0 # <<2 + vsrd $ACC2,$H2,$_26 + vand $H2,$H2,$mask26 + vaddudm $H0,$H0,$ACC4 # h4 -> h0 + vaddudm $H3,$H3,$ACC2 # h2 -> h3 + + vsrd $ACC0,$H0,$_26 + vsrd $ACC3,$H3,$_26 + vand $H0,$H0,$mask26 + vand $H3,$H3,$mask26 + vaddudm $H1,$H1,$ACC0 # h0 -> h1 + vaddudm $H4,$H4,$ACC3 # h3 -> h4 + + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 +.size __poly1305_mul,.-__poly1305_mul + +.align 5 +__poly1305_blocks_vsx: + $STU $sp,-$VSXFRAME($sp) + mflr r0 + li r10,`15+$LOCALS+128` + li r11,`31+$LOCALS+128` + mfspr r12,256 + stvx v20,r10,$sp + addi r10,r10,32 + stvx v21,r11,$sp + addi r11,r11,32 + stvx v22,r10,$sp + addi r10,r10,32 + stvx v23,r11,$sp + addi r11,r11,32 + stvx v24,r10,$sp + addi r10,r10,32 + stvx v25,r11,$sp + addi r11,r11,32 + stvx v26,r10,$sp + addi r10,r10,32 + stvx v27,r11,$sp + addi r11,r11,32 + stvx v28,r10,$sp + addi r10,r10,32 + stvx v29,r11,$sp + addi r11,r11,32 + stvx v30,r10,$sp + stvx v31,r11,$sp + stw r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# save vrsave + li 
r12,-1 + mtspr 256,r12 # preserve all AltiVec registers + $PUSH r27,`$VSXFRAME-$SIZE_T*5`($sp) + $PUSH r28,`$VSXFRAME-$SIZE_T*4`($sp) + $PUSH r29,`$VSXFRAME-$SIZE_T*3`($sp) + $PUSH r30,`$VSXFRAME-$SIZE_T*2`($sp) + $PUSH r31,`$VSXFRAME-$SIZE_T*1`($sp) + $PUSH r0,`$VSXFRAME+$LRSAVE`($sp) + + bl LPICmeup + + li $x10,0x10 + li $x20,0x20 + li $x30,0x30 + li $x40,0x40 + li $x50,0x50 + lvx_u $mask26,$x00,$const + lvx_u $_26,$x10,$const + lvx_u $_40,$x20,$const + lvx_u $I2perm,$x30,$const + lvx_u $padbits,$x40,$const + + cmplwi r7,0 # is_base2_26? + bne Lskip_init_vsx + + lwz $h1,32($ctx) # load key base 2^32 + lwz $h2,36($ctx) + lwz $h3,40($ctx) + lwz $h4,44($ctx) + + extrwi $h0,$h1,26,6 # base 2^32 -> base 2^26 + extrwi $h1,$h1,6,0 + insrwi $h1,$h2,20,6 + extrwi $h2,$h2,12,0 + insrwi $h2,$h3,14,6 + extrwi $h3,$h3,18,0 + insrwi $h3,$h4,8,6 + extrwi $h4,$h4,24,0 + + mtvrwz $R0,$h0 + slwi $h0,$h1,2 + mtvrwz $R1,$h1 + add $h1,$h1,$h0 + mtvrwz $S1,$h1 + slwi $h1,$h2,2 + mtvrwz $R2,$h2 + add $h2,$h2,$h1 + mtvrwz $S2,$h2 + slwi $h2,$h3,2 + mtvrwz $R3,$h3 + add $h3,$h3,$h2 + mtvrwz $S3,$h3 + slwi $h3,$h4,2 + mtvrwz $R4,$h4 + add $h4,$h4,$h3 + mtvrwz $S4,$h4 + + vmr $H0,$R0 + vmr $H1,$R1 + vmr $H2,$R2 + vmr $H3,$R3 + vmr $H4,$R4 + + bl __poly1305_mul # r^1:- * r^1:- + + vpermdi $R0,$H0,$R0,0b00 + vpermdi $R1,$H1,$R1,0b00 + vpermdi $R2,$H2,$R2,0b00 + vpermdi $R3,$H3,$R3,0b00 + vpermdi $R4,$H4,$R4,0b00 + vpermdi $H0,$H0,$H0,0b00 + vpermdi $H1,$H1,$H1,0b00 + vpermdi $H2,$H2,$H2,0b00 + vpermdi $H3,$H3,$H3,0b00 + vpermdi $H4,$H4,$H4,0b00 + vsld $S1,$R1,$T0 # <<2 + vsld $S2,$R2,$T0 + vsld $S3,$R3,$T0 + vsld $S4,$R4,$T0 + vaddudm $S1,$S1,$R1 + vaddudm $S2,$S2,$R2 + vaddudm $S3,$S3,$R3 + vaddudm $S4,$S4,$R4 + + bl __poly1305_mul # r^2:r^2 * r^2:r^1 + + addi $h0,$ctx,0x60 + lwz $h1,0($ctx) # load hash + lwz $h2,4($ctx) + lwz $h3,8($ctx) + lwz $h4,12($ctx) + lwz $t0,16($ctx) + + vmrgow $R0,$R0,$H0 # r^2:r^4:r^1:r^3 + vmrgow $R1,$R1,$H1 + vmrgow $R2,$R2,$H2 + vmrgow $R3,$R3,$H3 + vmrgow $R4,$R4,$H4 + vslw $S1,$R1,$T0 # <<2 + vslw $S2,$R2,$T0 + vslw $S3,$R3,$T0 + vslw $S4,$R4,$T0 + vadduwm $S1,$S1,$R1 + vadduwm $S2,$S2,$R2 + vadduwm $S3,$S3,$R3 + vadduwm $S4,$S4,$R4 + + stvx_u $R0,$x30,$ctx + stvx_u $R1,$x40,$ctx + stvx_u $S1,$x50,$ctx + stvx_u $R2,$x00,$h0 + stvx_u $S2,$x10,$h0 + stvx_u $R3,$x20,$h0 + stvx_u $S3,$x30,$h0 + stvx_u $R4,$x40,$h0 + stvx_u $S4,$x50,$h0 + + extrwi $h0,$h1,26,6 # base 2^32 -> base 2^26 + extrwi $h1,$h1,6,0 + mtvrwz $H0,$h0 + insrwi $h1,$h2,20,6 + extrwi $h2,$h2,12,0 + mtvrwz $H1,$h1 + insrwi $h2,$h3,14,6 + extrwi $h3,$h3,18,0 + mtvrwz $H2,$h2 + insrwi $h3,$h4,8,6 + extrwi $h4,$h4,24,0 + mtvrwz $H3,$h3 + insrwi $h4,$t0,3,5 + mtvrwz $H4,$h4 ___ + } $code.=<<___; - $POP r28,`$FRAME-$SIZE_T*4`($sp) - $POP r29,`$FRAME-$SIZE_T*3`($sp) - $POP r30,`$FRAME-$SIZE_T*2`($sp) - $POP r31,`$FRAME-$SIZE_T*1`($sp) - addi $sp,$sp,$FRAME + li r0,1 + stw r0,24($ctx) # set is_base2_26 + b Loaded_vsx + +.align 4 +Lskip_init_vsx: + li $x10,4 + li $x20,8 + li $x30,12 + li $x40,16 + lvwzx_u $H0,$x00,$ctx + lvwzx_u $H1,$x10,$ctx + lvwzx_u $H2,$x20,$ctx + lvwzx_u $H3,$x30,$ctx + lvwzx_u $H4,$x40,$ctx + +Loaded_vsx: + li $x10,0x10 + li $x20,0x20 + li $x30,0x30 + li $x40,0x40 + li $x50,0x50 + li $x60,0x60 + li $x70,0x70 + addi $ctx_,$ctx,64 # &ctx->r[1] + addi $_ctx,$sp,`$LOCALS+15` # &ctx->r[1], r^2:r^4 shadow + + vxor $T0,$T0,$T0 # ensure second half is zero + vpermdi $H0,$H0,$T0,0b00 + vpermdi $H1,$H1,$T0,0b00 + vpermdi $H2,$H2,$T0,0b00 + vpermdi $H3,$H3,$T0,0b00 + vpermdi $H4,$H4,$T0,0b00 + + be?lvx_u 
$_4,$x50,$const # byte swap mask + lvx_u $T1,$x00,$inp # load first input block + lvx_u $T2,$x10,$inp + lvx_u $T3,$x20,$inp + lvx_u $T4,$x30,$inp + be?vperm $T1,$T1,$T1,$_4 + be?vperm $T2,$T2,$T2,$_4 + be?vperm $T3,$T3,$T3,$_4 + be?vperm $T4,$T4,$T4,$_4 + + vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26 + vspltisb $_4,4 + vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011 + vspltisb $_14,14 + vpermdi $I3,$T1,$T2,0b11 + + vsrd $I1,$I0,$_26 + vsrd $I2,$I2,$_4 + vsrd $I4,$I3,$_40 + vsrd $I3,$I3,$_14 + vand $I0,$I0,$mask26 + vand $I1,$I1,$mask26 + vand $I2,$I2,$mask26 + vand $I3,$I3,$mask26 + + vpermdi $T1,$T3,$T4,0b00 + vperm $T2,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011 + vpermdi $T3,$T3,$T4,0b11 + + vsrd $T0,$T1,$_26 + vsrd $T2,$T2,$_4 + vsrd $T4,$T3,$_40 + vsrd $T3,$T3,$_14 + vand $T1,$T1,$mask26 + vand $T0,$T0,$mask26 + vand $T2,$T2,$mask26 + vand $T3,$T3,$mask26 + + # inp[2]:inp[0]:inp[3]:inp[1] + vmrgow $I4,$T4,$I4 + vmrgow $I0,$T1,$I0 + vmrgow $I1,$T0,$I1 + vmrgow $I2,$T2,$I2 + vmrgow $I3,$T3,$I3 + vor $I4,$I4,$padbits + + lvx_splt $R0,$x30,$ctx # taking lvx_vsplt out of loop + lvx_splt $R1,$x00,$ctx_ # gives ~8% improvement + lvx_splt $S1,$x10,$ctx_ + lvx_splt $R2,$x20,$ctx_ + lvx_splt $S2,$x30,$ctx_ + lvx_splt $T1,$x40,$ctx_ + lvx_splt $T2,$x50,$ctx_ + lvx_splt $T3,$x60,$ctx_ + lvx_splt $T4,$x70,$ctx_ + stvx $R1,$x00,$_ctx + stvx $S1,$x10,$_ctx + stvx $R2,$x20,$_ctx + stvx $S2,$x30,$_ctx + stvx $T1,$x40,$_ctx + stvx $T2,$x50,$_ctx + stvx $T3,$x60,$_ctx + stvx $T4,$x70,$_ctx + + addi $inp,$inp,0x40 + addi $const,$const,0x50 + addi r0,$len,-64 + srdi r0,r0,6 + mtctr r0 + b Loop_vsx + +.align 4 +Loop_vsx: + ################################################################ + ## ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + ## ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + ## \___________________/ + ## + ## Note that we start with inp[2:3]*r^2. This is because it + ## doesn't depend on reduction in previous iteration. 
+ ################################################################ + ## d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + ## d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + ## d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + ## d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + ## d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + vmuleuw $ACC0,$I0,$R0 + vmuleuw $ACC1,$I0,$R1 + vmuleuw $ACC2,$I0,$R2 + vmuleuw $ACC3,$I1,$R2 + + vmuleuw $T0,$I1,$R0 + vaddudm $ACC1,$ACC1,$T0 + vmuleuw $T0,$I1,$R1 + vaddudm $ACC2,$ACC2,$T0 + vmuleuw $ACC4,$I2,$R2 + vmuleuw $T0,$I4,$S1 + vaddudm $ACC0,$ACC0,$T0 + vmuleuw $T0,$I2,$R1 + vaddudm $ACC3,$ACC3,$T0 + lvx $S3,$x50,$_ctx + vmuleuw $T0,$I3,$R1 + vaddudm $ACC4,$ACC4,$T0 + lvx $R3,$x40,$_ctx + + vaddudm $H2,$H2,$I2 + vaddudm $H0,$H0,$I0 + vaddudm $H3,$H3,$I3 + vaddudm $H1,$H1,$I1 + vaddudm $H4,$H4,$I4 + + vmuleuw $T0,$I3,$S2 + vaddudm $ACC0,$ACC0,$T0 + vmuleuw $T0,$I4,$S2 + vaddudm $ACC1,$ACC1,$T0 + vmuleuw $T0,$I2,$R0 + vaddudm $ACC2,$ACC2,$T0 + vmuleuw $T0,$I3,$R0 + vaddudm $ACC3,$ACC3,$T0 + lvx $S4,$x70,$_ctx + vmuleuw $T0,$I4,$R0 + vaddudm $ACC4,$ACC4,$T0 + lvx $R4,$x60,$_ctx + + vmuleuw $T0,$I2,$S3 + vaddudm $ACC0,$ACC0,$T0 + vmuleuw $T0,$I3,$S3 + vaddudm $ACC1,$ACC1,$T0 + vmuleuw $T0,$I4,$S3 + vaddudm $ACC2,$ACC2,$T0 + vmuleuw $T0,$I0,$R3 + vaddudm $ACC3,$ACC3,$T0 + vmuleuw $T0,$I1,$R3 + vaddudm $ACC4,$ACC4,$T0 + + be?lvx_u $_4,$x00,$const # byte swap mask + lvx_u $T1,$x00,$inp # load next input block + lvx_u $T2,$x10,$inp + lvx_u $T3,$x20,$inp + lvx_u $T4,$x30,$inp + be?vperm $T1,$T1,$T1,$_4 + be?vperm $T2,$T2,$T2,$_4 + be?vperm $T3,$T3,$T3,$_4 + be?vperm $T4,$T4,$T4,$_4 + + vmuleuw $T0,$I1,$S4 + vaddudm $ACC0,$ACC0,$T0 + vmuleuw $T0,$I2,$S4 + vaddudm $ACC1,$ACC1,$T0 + vmuleuw $T0,$I3,$S4 + vaddudm $ACC2,$ACC2,$T0 + vmuleuw $T0,$I4,$S4 + vaddudm $ACC3,$ACC3,$T0 + vmuleuw $T0,$I0,$R4 + vaddudm $ACC4,$ACC4,$T0 + + vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26 + vspltisb $_4,4 + vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011 + vpermdi $I3,$T1,$T2,0b11 + + # (hash + inp[0:1]) * r^4 + vmulouw $T0,$H0,$R0 + vaddudm $ACC0,$ACC0,$T0 + vmulouw $T0,$H1,$R0 + vaddudm $ACC1,$ACC1,$T0 + vmulouw $T0,$H2,$R0 + vaddudm $ACC2,$ACC2,$T0 + vmulouw $T0,$H3,$R0 + vaddudm $ACC3,$ACC3,$T0 + vmulouw $T0,$H4,$R0 + vaddudm $ACC4,$ACC4,$T0 + + vpermdi $T1,$T3,$T4,0b00 + vperm $T2,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011 + vpermdi $T3,$T3,$T4,0b11 + + vmulouw $T0,$H2,$S3 + vaddudm $ACC0,$ACC0,$T0 + vmulouw $T0,$H3,$S3 + vaddudm $ACC1,$ACC1,$T0 + vmulouw $T0,$H4,$S3 + vaddudm $ACC2,$ACC2,$T0 + vmulouw $T0,$H0,$R3 + vaddudm $ACC3,$ACC3,$T0 + lvx $S1,$x10,$_ctx + vmulouw $T0,$H1,$R3 + vaddudm $ACC4,$ACC4,$T0 + lvx $R1,$x00,$_ctx + + vsrd $I1,$I0,$_26 + vsrd $I2,$I2,$_4 + vsrd $I4,$I3,$_40 + vsrd $I3,$I3,$_14 + + vmulouw $T0,$H1,$S4 + vaddudm $ACC0,$ACC0,$T0 + vmulouw $T0,$H2,$S4 + vaddudm $ACC1,$ACC1,$T0 + vmulouw $T0,$H3,$S4 + vaddudm $ACC2,$ACC2,$T0 + vmulouw $T0,$H4,$S4 + vaddudm $ACC3,$ACC3,$T0 + lvx $S2,$x30,$_ctx + vmulouw $T0,$H0,$R4 + vaddudm $ACC4,$ACC4,$T0 + lvx $R2,$x20,$_ctx + + vand $I0,$I0,$mask26 + vand $I1,$I1,$mask26 + vand $I2,$I2,$mask26 + vand $I3,$I3,$mask26 + + vmulouw $T0,$H4,$S1 + vaddudm $ACC0,$ACC0,$T0 + vmulouw $T0,$H0,$R1 + vaddudm $ACC1,$ACC1,$T0 + vmulouw $T0,$H1,$R1 + vaddudm $ACC2,$ACC2,$T0 + vmulouw $T0,$H2,$R1 + vaddudm $ACC3,$ACC3,$T0 + vmulouw $T0,$H3,$R1 + vaddudm $ACC4,$ACC4,$T0 + + vsrd $T2,$T2,$_4 + vsrd $_4,$T1,$_26 + vsrd $T4,$T3,$_40 + vsrd $T3,$T3,$_14 + + vmulouw $T0,$H3,$S2 + vaddudm $ACC0,$ACC0,$T0 + vmulouw $T0,$H4,$S2 + 
vaddudm $ACC1,$ACC1,$T0 + vmulouw $T0,$H0,$R2 + vaddudm $ACC2,$ACC2,$T0 + vmulouw $T0,$H1,$R2 + vaddudm $ACC3,$ACC3,$T0 + vmulouw $T0,$H2,$R2 + vaddudm $ACC4,$ACC4,$T0 + + vand $T1,$T1,$mask26 + vand $_4,$_4,$mask26 + vand $T2,$T2,$mask26 + vand $T3,$T3,$mask26 + + ################################################################ + # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + # and P. Schwabe + + vspltisb $T0,2 + vsrd $H4,$ACC3,$_26 + vsrd $H1,$ACC0,$_26 + vand $H3,$ACC3,$mask26 + vand $H0,$ACC0,$mask26 + vaddudm $H4,$H4,$ACC4 # h3 -> h4 + vaddudm $H1,$H1,$ACC1 # h0 -> h1 + + vmrgow $I4,$T4,$I4 + vmrgow $I0,$T1,$I0 + vmrgow $I1,$_4,$I1 + vmrgow $I2,$T2,$I2 + vmrgow $I3,$T3,$I3 + vor $I4,$I4,$padbits + + vsrd $ACC4,$H4,$_26 + vsrd $ACC1,$H1,$_26 + vand $H4,$H4,$mask26 + vand $H1,$H1,$mask26 + vaddudm $H0,$H0,$ACC4 + vaddudm $H2,$ACC2,$ACC1 # h1 -> h2 + + vsld $ACC4,$ACC4,$T0 # <<2 + vsrd $ACC2,$H2,$_26 + vand $H2,$H2,$mask26 + vaddudm $H0,$H0,$ACC4 # h4 -> h0 + vaddudm $H3,$H3,$ACC2 # h2 -> h3 + + vsrd $ACC0,$H0,$_26 + vsrd $ACC3,$H3,$_26 + vand $H0,$H0,$mask26 + vand $H3,$H3,$mask26 + vaddudm $H1,$H1,$ACC0 # h0 -> h1 + vaddudm $H4,$H4,$ACC3 # h3 -> h4 + + addi $inp,$inp,0x40 + bdnz Loop_vsx + + neg $len,$len + andi. $len,$len,0x30 + sub $inp,$inp,$len + + lvx_u $R0,$x30,$ctx # load all powers + lvx_u $R1,$x00,$ctx_ + lvx_u $S1,$x10,$ctx_ + lvx_u $R2,$x20,$ctx_ + lvx_u $S2,$x30,$ctx_ + +Last_vsx: + vmuleuw $ACC0,$I0,$R0 + vmuleuw $ACC1,$I1,$R0 + vmuleuw $ACC2,$I2,$R0 + vmuleuw $ACC3,$I3,$R0 + vmuleuw $ACC4,$I4,$R0 + + vmuleuw $T0,$I4,$S1 + vaddudm $ACC0,$ACC0,$T0 + vmuleuw $T0,$I0,$R1 + vaddudm $ACC1,$ACC1,$T0 + vmuleuw $T0,$I1,$R1 + vaddudm $ACC2,$ACC2,$T0 + vmuleuw $T0,$I2,$R1 + vaddudm $ACC3,$ACC3,$T0 + lvx_u $S3,$x50,$ctx_ + vmuleuw $T0,$I3,$R1 + vaddudm $ACC4,$ACC4,$T0 + lvx_u $R3,$x40,$ctx_ + + vaddudm $H2,$H2,$I2 + vaddudm $H0,$H0,$I0 + vaddudm $H3,$H3,$I3 + vaddudm $H1,$H1,$I1 + vaddudm $H4,$H4,$I4 + + vmuleuw $T0,$I3,$S2 + vaddudm $ACC0,$ACC0,$T0 + vmuleuw $T0,$I4,$S2 + vaddudm $ACC1,$ACC1,$T0 + vmuleuw $T0,$I0,$R2 + vaddudm $ACC2,$ACC2,$T0 + vmuleuw $T0,$I1,$R2 + vaddudm $ACC3,$ACC3,$T0 + lvx_u $S4,$x70,$ctx_ + vmuleuw $T0,$I2,$R2 + vaddudm $ACC4,$ACC4,$T0 + lvx_u $R4,$x60,$ctx_ + + vmuleuw $T0,$I2,$S3 + vaddudm $ACC0,$ACC0,$T0 + vmuleuw $T0,$I3,$S3 + vaddudm $ACC1,$ACC1,$T0 + vmuleuw $T0,$I4,$S3 + vaddudm $ACC2,$ACC2,$T0 + vmuleuw $T0,$I0,$R3 + vaddudm $ACC3,$ACC3,$T0 + vmuleuw $T0,$I1,$R3 + vaddudm $ACC4,$ACC4,$T0 + + vmuleuw $T0,$I1,$S4 + vaddudm $ACC0,$ACC0,$T0 + vmuleuw $T0,$I2,$S4 + vaddudm $ACC1,$ACC1,$T0 + vmuleuw $T0,$I3,$S4 + vaddudm $ACC2,$ACC2,$T0 + vmuleuw $T0,$I4,$S4 + vaddudm $ACC3,$ACC3,$T0 + vmuleuw $T0,$I0,$R4 + vaddudm $ACC4,$ACC4,$T0 + + # (hash + inp[0:1]) * r^4 + vmulouw $T0,$H0,$R0 + vaddudm $ACC0,$ACC0,$T0 + vmulouw $T0,$H1,$R0 + vaddudm $ACC1,$ACC1,$T0 + vmulouw $T0,$H2,$R0 + vaddudm $ACC2,$ACC2,$T0 + vmulouw $T0,$H3,$R0 + vaddudm $ACC3,$ACC3,$T0 + vmulouw $T0,$H4,$R0 + vaddudm $ACC4,$ACC4,$T0 + + vmulouw $T0,$H2,$S3 + vaddudm $ACC0,$ACC0,$T0 + vmulouw $T0,$H3,$S3 + vaddudm $ACC1,$ACC1,$T0 + vmulouw $T0,$H4,$S3 + vaddudm $ACC2,$ACC2,$T0 + vmulouw $T0,$H0,$R3 + vaddudm $ACC3,$ACC3,$T0 + lvx_u $S1,$x10,$ctx_ + vmulouw $T0,$H1,$R3 + vaddudm $ACC4,$ACC4,$T0 + lvx_u $R1,$x00,$ctx_ + + vmulouw $T0,$H1,$S4 + vaddudm $ACC0,$ACC0,$T0 + vmulouw $T0,$H2,$S4 + vaddudm $ACC1,$ACC1,$T0 + vmulouw $T0,$H3,$S4 + vaddudm $ACC2,$ACC2,$T0 + vmulouw $T0,$H4,$S4 + vaddudm $ACC3,$ACC3,$T0 + lvx_u $S2,$x30,$ctx_ + vmulouw $T0,$H0,$R4 + vaddudm $ACC4,$ACC4,$T0 
+ lvx_u $R2,$x20,$ctx_ + + vmulouw $T0,$H4,$S1 + vaddudm $ACC0,$ACC0,$T0 + vmulouw $T0,$H0,$R1 + vaddudm $ACC1,$ACC1,$T0 + vmulouw $T0,$H1,$R1 + vaddudm $ACC2,$ACC2,$T0 + vmulouw $T0,$H2,$R1 + vaddudm $ACC3,$ACC3,$T0 + vmulouw $T0,$H3,$R1 + vaddudm $ACC4,$ACC4,$T0 + + vmulouw $T0,$H3,$S2 + vaddudm $ACC0,$ACC0,$T0 + vmulouw $T0,$H4,$S2 + vaddudm $ACC1,$ACC1,$T0 + vmulouw $T0,$H0,$R2 + vaddudm $ACC2,$ACC2,$T0 + vmulouw $T0,$H1,$R2 + vaddudm $ACC3,$ACC3,$T0 + vmulouw $T0,$H2,$R2 + vaddudm $ACC4,$ACC4,$T0 + + ################################################################ + # horizontal addition + + vpermdi $H0,$ACC0,$ACC0,0b10 + vpermdi $H1,$ACC1,$ACC1,0b10 + vpermdi $H2,$ACC2,$ACC2,0b10 + vpermdi $H3,$ACC3,$ACC3,0b10 + vpermdi $H4,$ACC4,$ACC4,0b10 + vaddudm $ACC0,$ACC0,$H0 + vaddudm $ACC1,$ACC1,$H1 + vaddudm $ACC2,$ACC2,$H2 + vaddudm $ACC3,$ACC3,$H3 + vaddudm $ACC4,$ACC4,$H4 + + ################################################################ + # lazy reduction + + vspltisb $T0,2 + vsrd $H4,$ACC3,$_26 + vsrd $H1,$ACC0,$_26 + vand $H3,$ACC3,$mask26 + vand $H0,$ACC0,$mask26 + vaddudm $H4,$H4,$ACC4 # h3 -> h4 + vaddudm $H1,$H1,$ACC1 # h0 -> h1 + + vsrd $ACC4,$H4,$_26 + vsrd $ACC1,$H1,$_26 + vand $H4,$H4,$mask26 + vand $H1,$H1,$mask26 + vaddudm $H0,$H0,$ACC4 + vaddudm $H2,$ACC2,$ACC1 # h1 -> h2 + + vsld $ACC4,$ACC4,$T0 # <<2 + vsrd $ACC2,$H2,$_26 + vand $H2,$H2,$mask26 + vaddudm $H0,$H0,$ACC4 # h4 -> h0 + vaddudm $H3,$H3,$ACC2 # h2 -> h3 + + vsrd $ACC0,$H0,$_26 + vsrd $ACC3,$H3,$_26 + vand $H0,$H0,$mask26 + vand $H3,$H3,$mask26 + vaddudm $H1,$H1,$ACC0 # h0 -> h1 + vaddudm $H4,$H4,$ACC3 # h3 -> h4 + + beq Ldone_vsx + + add r6,$const,$len + + be?lvx_u $_4,$x00,$const # byte swap mask + lvx_u $T1,$x00,$inp # load last partial input block + lvx_u $T2,$x10,$inp + lvx_u $T3,$x20,$inp + lvx_u $T4,$x30,$inp + be?vperm $T1,$T1,$T1,$_4 + be?vperm $T2,$T2,$T2,$_4 + be?vperm $T3,$T3,$T3,$_4 + be?vperm $T4,$T4,$T4,$_4 + + vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26 + vspltisb $_4,4 + vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011 + vpermdi $I3,$T1,$T2,0b11 + + vsrd $I1,$I0,$_26 + vsrd $I2,$I2,$_4 + vsrd $I4,$I3,$_40 + vsrd $I3,$I3,$_14 + vand $I0,$I0,$mask26 + vand $I1,$I1,$mask26 + vand $I2,$I2,$mask26 + vand $I3,$I3,$mask26 + + vpermdi $T0,$T3,$T4,0b00 + vperm $T1,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011 + vpermdi $T2,$T3,$T4,0b11 + + lvx_u $ACC0,$x00,r6 + lvx_u $ACC1,$x30,r6 + + vsrd $T3,$T0,$_26 + vsrd $T1,$T1,$_4 + vsrd $T4,$T2,$_40 + vsrd $T2,$T2,$_14 + vand $T0,$T0,$mask26 + vand $T3,$T3,$mask26 + vand $T1,$T1,$mask26 + vand $T2,$T2,$mask26 + + # inp[2]:inp[0]:inp[3]:inp[1] + vmrgow $I4,$T4,$I4 + vmrgow $I0,$T0,$I0 + vmrgow $I1,$T3,$I1 + vmrgow $I2,$T1,$I2 + vmrgow $I3,$T2,$I3 + vor $I4,$I4,$padbits + + vperm $H0,$H0,$H0,$ACC0 # move hash to right lane + vand $I0,$I0, $ACC1 # mask redundant input lane[s] + vperm $H1,$H1,$H1,$ACC0 + vand $I1,$I1, $ACC1 + vperm $H2,$H2,$H2,$ACC0 + vand $I2,$I2, $ACC1 + vperm $H3,$H3,$H3,$ACC0 + vand $I3,$I3, $ACC1 + vperm $H4,$H4,$H4,$ACC0 + vand $I4,$I4, $ACC1 + + vaddudm $I0,$I0,$H0 # accumulate hash + vxor $H0,$H0,$H0 # wipe hash value + vaddudm $I1,$I1,$H1 + vxor $H1,$H1,$H1 + vaddudm $I2,$I2,$H2 + vxor $H2,$H2,$H2 + vaddudm $I3,$I3,$H3 + vxor $H3,$H3,$H3 + vaddudm $I4,$I4,$H4 + vxor $H4,$H4,$H4 + + xor. 
$len,$len,$len + b Last_vsx + +.align 4 +Ldone_vsx: + $POP r0,`$VSXFRAME+$LRSAVE`($sp) + li $x10,4 + li $x20,8 + li $x30,12 + li $x40,16 + stvwx_u $H0,$x00,$ctx # store hash + stvwx_u $H1,$x10,$ctx + stvwx_u $H2,$x20,$ctx + stvwx_u $H3,$x30,$ctx + stvwx_u $H4,$x40,$ctx + + lwz r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# pull vrsave + mtlr r0 + li r10,`15+$LOCALS+128` + li r11,`31+$LOCALS+128` + mtspr 256,r12 # restore vrsave + lvx v20,r10,$sp + addi r10,r10,32 + lvx v21,r11,$sp + addi r11,r11,32 + lvx v22,r10,$sp + addi r10,r10,32 + lvx v23,r11,$sp + addi r11,r11,32 + lvx v24,r10,$sp + addi r10,r10,32 + lvx v25,r11,$sp + addi r11,r11,32 + lvx v26,r10,$sp + addi r10,r10,32 + lvx v27,r11,$sp + addi r11,r11,32 + lvx v28,r10,$sp + addi r10,r10,32 + lvx v29,r11,$sp + addi r11,r11,32 + lvx v30,r10,$sp + lvx v31,r11,$sp + $POP r27,`$VSXFRAME-$SIZE_T*5`($sp) + $POP r28,`$VSXFRAME-$SIZE_T*4`($sp) + $POP r29,`$VSXFRAME-$SIZE_T*3`($sp) + $POP r30,`$VSXFRAME-$SIZE_T*2`($sp) + $POP r31,`$VSXFRAME-$SIZE_T*1`($sp) + addi $sp,$sp,$VSXFRAME blr .long 0 - .byte 0,12,4,1,0x80,4,3,0 -.size .poly1305_emit,.-.poly1305_emit + .byte 0,12,0x04,1,0x80,5,4,0 + .long 0 +.size __poly1305_blocks_vsx,.-__poly1305_blocks_vsx + +.align 6 +LPICmeup: + mflr r0 + bcl 20,31,\$+4 + mflr $const # vvvvvv "distance" between . and 1st data entry + addi $const,$const,`64-8` + mtlr r0 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + .space `64-9*4` + +.quad 0x0000000003ffffff,0x0000000003ffffff # mask26 +.quad 0x000000000000001a,0x000000000000001a # _26 +.quad 0x0000000000000028,0x0000000000000028 # _40 +.quad 0x000000000e0f0001,0x000000001e1f1011 # I2perm +.quad 0x0100000001000000,0x0100000001000000 # padbits +.quad 0x0706050403020100,0x0f0e0d0c0b0a0908 # byte swap for big-endian + +.quad 0x0000000000000000,0x0000000004050607 # magic tail masks +.quad 0x0405060700000000,0x0000000000000000 +.quad 0x0000000000000000,0x0405060700000000 + +.quad 0xffffffff00000000,0xffffffffffffffff +.quad 0xffffffff00000000,0xffffffff00000000 +.quad 0x0000000000000000,0xffffffff00000000 ___ - } +}}} $code.=<<___; -.asciz "Poly1305 for PPC, CRYPTOGAMS by <appro\@openssl.org>" +.asciz "Poly1305 for PPC, CRYPTOGAMS by \@dot-asm" ___ -$code =~ s/\`([^\`]*)\`/eval $1/gem; -print $code; +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + + # instructions prefixed with '?' are endian-specific and need + # to be adjusted accordingly... + if ($flavour !~ /le$/) { # big-endian + s/be\?// or + s/le\?/#le#/ + } else { # little-endian + s/le\?// or + s/be\?/#be#/ + } + + print $_,"\n"; +} close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/poly1305/asm/poly1305-ppcfp.pl b/crypto/poly1305/asm/poly1305-ppcfp.pl index a9ab20714697..218708a46257 100755 --- a/crypto/poly1305/asm/poly1305-ppcfp.pl +++ b/crypto/poly1305/asm/poly1305-ppcfp.pl @@ -1,7 +1,7 @@ #! /usr/bin/env perl # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. # -# Licensed under the OpenSSL license (the "License"). You may not use +# Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html @@ -27,7 +27,10 @@ # POWER7 3.50/+30% # POWER8 3.75/+10% -$flavour = shift; +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? 
pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; if ($flavour =~ /64/) { $SIZE_T =8; @@ -54,7 +57,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or die "can't locate ppc-xlate.pl"; -open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; +open STDOUT,"| $^X $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; $LOCALS=6*$SIZE_T; $FRAME=$LOCALS+6*8+18*8; diff --git a/crypto/poly1305/asm/poly1305-s390x.pl b/crypto/poly1305/asm/poly1305-s390x.pl index bcc8fd3b886a..4a93064ff663 100755 --- a/crypto/poly1305/asm/poly1305-s390x.pl +++ b/crypto/poly1305/asm/poly1305-s390x.pl @@ -1,7 +1,7 @@ #! /usr/bin/env perl -# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. +# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. # -# Licensed under the OpenSSL license (the "License"). You may not use +# Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html @@ -24,204 +24,961 @@ # # On side note, z13 enables vector base 2^26 implementation... -$flavour = shift; +# +# January 2019 +# +# Add vx code path (base 2^26). +# +# Copyright IBM Corp. 2019 +# Author: Patrick Steuer <patrick.steuer@de.ibm.com> +# +# January 2019 +# +# Add vector base 2^26 implementation. It's problematic to accurately +# measure performance, because reference system is hardly idle. But +# it's sub-cycle, i.e. less than 1 cycle per processed byte, and it's +# >=20% faster than IBM's submission on long inputs, and much faster on +# short ones, because calculation of key powers is postponed till we +# know that input is long enough to justify the additional overhead. + +use strict; +use FindBin qw($Bin); +use lib "$Bin/../.."; +use perlasm::s390x qw(:DEFAULT :GE :EI :MI1 :VX AUTOLOAD LABEL INCLUDE); + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; + +my ($z,$SIZE_T); if ($flavour =~ /3[12]/) { + $z=0; # S/390 ABI $SIZE_T=4; - $g=""; } else { + $z=1; # zSeries ABI $SIZE_T=8; - $g="g"; } -while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; - -$sp="%r15"; +my $stdframe=16*$SIZE_T+4*8; +my $sp="%r15"; my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5)); -$code.=<<___; -.text - -.globl poly1305_init -.type poly1305_init,\@function -.align 16 -poly1305_init: - lghi %r0,0 - lghi %r1,-1 - stg %r0,0($ctx) # zero hash value - stg %r0,8($ctx) - stg %r0,16($ctx) - - cl${g}r $inp,%r0 - je .Lno_key - - lrvg %r4,0($inp) # load little-endian key - lrvg %r5,8($inp) - - nihl %r1,0xffc0 # 0xffffffc0ffffffff - srlg %r0,%r1,4 # 0x0ffffffc0fffffff - srlg %r1,%r1,4 - nill %r1,0xfffc # 0x0ffffffc0ffffffc - - ngr %r4,%r0 - ngr %r5,%r1 - - stg %r4,32($ctx) - stg %r5,40($ctx) - -.Lno_key: - lghi %r2,0 - br %r14 -.size poly1305_init,.-poly1305_init -___ +PERLASM_BEGIN($output); + +INCLUDE ("s390x_arch.h"); +TEXT (); + +################ +# static void poly1305_init(void *ctx, const unsigned char key[16]) +{ +GLOBL ("poly1305_init"); +TYPE ("poly1305_init","\@function"); +ALIGN (16); +LABEL ("poly1305_init"); + lghi ("%r0",0); + lghi ("%r1",-1); + stg ("%r0","0($ctx)"); # zero hash value + stg ("%r0","8($ctx)"); + stg ("%r0","16($ctx)"); + st ("%r0","24($ctx)"); # clear is_base2_26 + lgr ("%r5",$ctx); # reassign $ctx + lghi ("%r2",0); + +&{$z? \&clgr:\&clr} ($inp,"%r0"); + je (".Lno_key"); + + lrvg ("%r2","0($inp)"); # load little-endian key + lrvg ("%r3","8($inp)"); + + nihl ("%r1",0xffc0); # 0xffffffc0ffffffff + srlg ("%r0","%r1",4); # 0x0ffffffc0fffffff + srlg ("%r1","%r1",4); + nill ("%r1",0xfffc); # 0x0ffffffc0ffffffc + + ngr ("%r2","%r0"); + ngr ("%r3","%r1"); + + stmg ("%r2","%r3","32(%r5)"); + + larl ("%r1","OPENSSL_s390xcap_P"); + lg ("%r0","16(%r1)"); + srlg ("%r0","%r0",62); + nill ("%r0",1); # extract vx bit + lcgr ("%r0","%r0"); + larl ("%r1",".Lpoly1305_blocks"); + larl ("%r2",".Lpoly1305_blocks_vx"); + larl ("%r3",".Lpoly1305_emit"); +&{$z? \&xgr:\&xr} ("%r2","%r1"); # select between scalar and vector +&{$z? \&ngr:\&nr} ("%r2","%r0"); +&{$z? \&xgr:\&xr} ("%r2","%r1"); +&{$z? 
\&stmg:\&stm} ("%r2","%r3","0(%r4)"); + lghi ("%r2",1); +LABEL (".Lno_key"); + br ("%r14"); +SIZE ("poly1305_init",".-poly1305_init"); +} + +################ +# static void poly1305_blocks(void *ctx, const unsigned char *inp, +# size_t len, u32 padbit) { my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14)); my ($r0,$r1,$s1) = map("%r$_",(0..2)); -$code.=<<___; -.globl poly1305_blocks -.type poly1305_blocks,\@function -.align 16 -poly1305_blocks: - srl${g} $len,4 # fixed-up in 64-bit build - lghi %r0,0 - cl${g}r $len,%r0 - je .Lno_data - - stm${g} %r6,%r14,`6*$SIZE_T`($sp) - - llgfr $padbit,$padbit # clear upper half, much needed with - # non-64-bit ABI - lg $r0,32($ctx) # load key - lg $r1,40($ctx) - - lg $h0,0($ctx) # load hash value - lg $h1,8($ctx) - lg $h2,16($ctx) - - st$g $ctx,`2*$SIZE_T`($sp) # off-load $ctx - srlg $s1,$r1,2 - algr $s1,$r1 # s1 = r1 + r1>>2 - j .Loop - -.align 16 -.Loop: - lrvg $d0lo,0($inp) # load little-endian input - lrvg $d1lo,8($inp) - la $inp,16($inp) - - algr $d0lo,$h0 # accumulate input - alcgr $d1lo,$h1 - - lgr $h0,$d0lo - mlgr $d0hi,$r0 # h0*r0 -> $d0hi:$d0lo - lgr $h1,$d1lo - mlgr $d1hi,$s1 # h1*5*r1 -> $d1hi:$d1lo - - mlgr $t0,$r1 # h0*r1 -> $t0:$h0 - mlgr $t1,$r0 # h1*r0 -> $t1:$h1 - alcgr $h2,$padbit - - algr $d0lo,$d1lo - lgr $d1lo,$h2 - alcgr $d0hi,$d1hi - lghi $d1hi,0 - - algr $h1,$h0 - alcgr $t1,$t0 - - msgr $d1lo,$s1 # h2*s1 - msgr $h2,$r0 # h2*r0 - - algr $h1,$d1lo - alcgr $t1,$d1hi # $d1hi is zero - - algr $h1,$d0hi - alcgr $h2,$t1 - - lghi $h0,-4 # final reduction step - ngr $h0,$h2 - srlg $t0,$h2,2 - algr $h0,$t0 - lghi $t1,3 - ngr $h2,$t1 - - algr $h0,$d0lo - alcgr $h1,$d1hi # $d1hi is still zero - alcgr $h2,$d1hi # $d1hi is still zero - - brct$g $len,.Loop - - l$g $ctx,`2*$SIZE_T`($sp) # restore $ctx - - stg $h0,0($ctx) # store hash value - stg $h1,8($ctx) - stg $h2,16($ctx) - - lm${g} %r6,%r14,`6*$SIZE_T`($sp) -.Lno_data: - br %r14 -.size poly1305_blocks,.-poly1305_blocks -___ +GLOBL ("poly1305_blocks"); +TYPE ("poly1305_blocks","\@function"); +ALIGN (16); +LABEL ("poly1305_blocks"); +LABEL (".Lpoly1305_blocks"); +&{$z? \<gr:\<r} ("%r0",$len); + jz (".Lno_data"); + +&{$z? \&stmg:\&stm} ("%r6","%r14","6*$SIZE_T($sp)"); + + lg ($h0,"0($ctx)"); # load hash value + lg ($h1,"8($ctx)"); + lg ($h2,"16($ctx)"); + +LABEL (".Lpoly1305_blocks_entry"); +if ($z) { + srlg ($len,$len,4); +} else { + srl ($len,4); +} + llgfr ($padbit,$padbit); # clear upper half, much needed with + # non-64-bit ABI + lg ($r0,"32($ctx)"); # load key + lg ($r1,"40($ctx)"); + +&{$z? 
\&stg:\&st} ($ctx,"2*$SIZE_T($sp)"); # off-load $ctx + srlg ($s1,$r1,2); + algr ($s1,$r1); # s1 = r1 + r1>>2 + j (".Loop"); + +ALIGN (16); +LABEL (".Loop"); + lrvg ($d0lo,"0($inp)"); # load little-endian input + lrvg ($d1lo,"8($inp)"); + la ($inp,"16($inp)"); + + algr ($d0lo,$h0); # accumulate input + alcgr ($d1lo,$h1); + alcgr ($h2,$padbit); + + lgr ($h0,$d0lo); + mlgr ($d0hi,$r0); # h0*r0 -> $d0hi:$d0lo + lgr ($h1,$d1lo); + mlgr ($d1hi,$s1); # h1*5*r1 -> $d1hi:$d1lo + + mlgr ($t0,$r1); # h0*r1 -> $t0:$h0 + mlgr ($t1,$r0); # h1*r0 -> $t1:$h1 + + algr ($d0lo,$d1lo); + lgr ($d1lo,$h2); + alcgr ($d0hi,$d1hi); + lghi ($d1hi,0); + + algr ($h1,$h0); + alcgr ($t1,$t0); + + msgr ($d1lo,$s1); # h2*s1 + msgr ($h2,$r0); # h2*r0 + + algr ($h1,$d1lo); + alcgr ($t1,$d1hi); # $d1hi is zero + + algr ($h1,$d0hi); + alcgr ($h2,$t1); + + lghi ($h0,-4); # final reduction step + ngr ($h0,$h2); + srlg ($t0,$h2,2); + algr ($h0,$t0); + lghi ($t1,3); + ngr ($h2,$t1); + + algr ($h0,$d0lo); + alcgr ($h1,$d1hi); # $d1hi is still zero + alcgr ($h2,$d1hi); # $d1hi is still zero + +&{$z? \&brctg:\&brct} ($len,".Loop"); + +&{$z? \&lg:\&l} ($ctx,"2*$SIZE_T($sp)");# restore $ctx + + stg ($h0,"0($ctx)"); # store hash value + stg ($h1,"8($ctx)"); + stg ($h2,"16($ctx)"); + +&{$z? \&lmg:\&lm} ("%r6","%r14","6*$SIZE_T($sp)"); +LABEL (".Lno_data"); + br ("%r14"); +SIZE ("poly1305_blocks",".-poly1305_blocks"); +} + +################ +# static void poly1305_blocks_vx(void *ctx, const unsigned char *inp, +# size_t len, u32 padbit) +{ +my ($H0, $H1, $H2, $H3, $H4) = map("%v$_",(0..4)); +my ($I0, $I1, $I2, $I3, $I4) = map("%v$_",(5..9)); +my ($R0, $R1, $S1, $R2, $S2) = map("%v$_",(10..14)); +my ($R3, $S3, $R4, $S4) = map("%v$_",(15..18)); +my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("%v$_",(19..23)); +my ($T1, $T2, $T3, $T4) = map("%v$_",(24..27)); +my ($mask26,$bswaplo,$bswaphi,$bswapmi) = map("%v$_",(28..31)); + +my ($d2,$d0,$h0,$d1,$h1,$h2)=map("%r$_",(9..14)); + +TYPE ("poly1305_blocks_vx","\@function"); +ALIGN (16); +LABEL ("poly1305_blocks_vx"); +LABEL (".Lpoly1305_blocks_vx"); +&{$z? \&clgfi:\&clfi} ($len,128); + jhe ("__poly1305_blocks_vx"); + +&{$z? 
\&stmg:\&stm} ("%r6","%r14","6*$SIZE_T($sp)"); + + lg ($d0,"0($ctx)"); + lg ($d1,"8($ctx)"); + lg ($d2,"16($ctx)"); + + llgfr ("%r0",$d0); # base 2^26 -> base 2^64 + srlg ($h0,$d0,32); + llgfr ("%r1",$d1); + srlg ($h1,$d1,32); + srlg ($h2,$d2,32); + + sllg ("%r0","%r0",26); + algr ($h0,"%r0"); + sllg ("%r0",$h1,52); + srlg ($h1,$h1,12); + sllg ("%r1","%r1",14); + algr ($h0,"%r0"); + alcgr ($h1,"%r1"); + sllg ("%r0",$h2,40); + srlg ($h2,$h2,24); + lghi ("%r1",0); + algr ($h1,"%r0"); + alcgr ($h2,"%r1"); + + llgf ("%r0","24($ctx)"); # is_base2_26 + lcgr ("%r0","%r0"); + + xgr ($h0,$d0); # choose between radixes + xgr ($h1,$d1); + xgr ($h2,$d2); + ngr ($h0,"%r0"); + ngr ($h1,"%r0"); + ngr ($h2,"%r0"); + xgr ($h0,$d0); + xgr ($h1,$d1); + xgr ($h2,$d2); + + lhi ("%r0",0); + st ("%r0","24($ctx)"); # clear is_base2_26 + + j (".Lpoly1305_blocks_entry"); +SIZE ("poly1305_blocks_vx",".-poly1305_blocks_vx"); + +TYPE ("__poly1305_mul","\@function"); +ALIGN (16); +LABEL ("__poly1305_mul"); + vmlof ($ACC0,$H0,$R0); + vmlof ($ACC1,$H0,$R1); + vmlof ($ACC2,$H0,$R2); + vmlof ($ACC3,$H0,$R3); + vmlof ($ACC4,$H0,$R4); + + vmalof ($ACC0,$H1,$S4,$ACC0); + vmalof ($ACC1,$H1,$R0,$ACC1); + vmalof ($ACC2,$H1,$R1,$ACC2); + vmalof ($ACC3,$H1,$R2,$ACC3); + vmalof ($ACC4,$H1,$R3,$ACC4); + + vmalof ($ACC0,$H2,$S3,$ACC0); + vmalof ($ACC1,$H2,$S4,$ACC1); + vmalof ($ACC2,$H2,$R0,$ACC2); + vmalof ($ACC3,$H2,$R1,$ACC3); + vmalof ($ACC4,$H2,$R2,$ACC4); + + vmalof ($ACC0,$H3,$S2,$ACC0); + vmalof ($ACC1,$H3,$S3,$ACC1); + vmalof ($ACC2,$H3,$S4,$ACC2); + vmalof ($ACC3,$H3,$R0,$ACC3); + vmalof ($ACC4,$H3,$R1,$ACC4); + + vmalof ($ACC0,$H4,$S1,$ACC0); + vmalof ($ACC1,$H4,$S2,$ACC1); + vmalof ($ACC2,$H4,$S3,$ACC2); + vmalof ($ACC3,$H4,$S4,$ACC3); + vmalof ($ACC4,$H4,$R0,$ACC4); + + ################################################################ + # lazy reduction + + vesrlg ($H4,$ACC3,26); + vesrlg ($H1,$ACC0,26); + vn ($H3,$ACC3,$mask26); + vn ($H0,$ACC0,$mask26); + vag ($H4,$H4,$ACC4); # h3 -> h4 + vag ($H1,$H1,$ACC1); # h0 -> h1 + + vesrlg ($ACC4,$H4,26); + vesrlg ($ACC1,$H1,26); + vn ($H4,$H4,$mask26); + vn ($H1,$H1,$mask26); + vag ($H0,$H0,$ACC4); + vag ($H2,$ACC2,$ACC1); # h1 -> h2 + + veslg ($ACC4,$ACC4,2); # <<2 + vesrlg ($ACC2,$H2,26); + vn ($H2,$H2,$mask26); + vag ($H0,$H0,$ACC4); # h4 -> h0 + vag ($H3,$H3,$ACC2); # h2 -> h3 + + vesrlg ($ACC0,$H0,26); + vesrlg ($ACC3,$H3,26); + vn ($H0,$H0,$mask26); + vn ($H3,$H3,$mask26); + vag ($H1,$H1,$ACC0); # h0 -> h1 + vag ($H4,$H4,$ACC3); # h3 -> h4 + br ("%r14"); +SIZE ("__poly1305_mul",".-__poly1305_mul"); + +TYPE ("__poly1305_blocks_vx","\@function"); +ALIGN (16); +LABEL ("__poly1305_blocks_vx"); +&{$z? \&lgr:\&lr} ("%r0",$sp); +&{$z? \&stmg:\&stm} ("%r10","%r15","10*$SIZE_T($sp)"); +if (!$z) { + std ("%f4","16*$SIZE_T+2*8($sp)"); + std ("%f6","16*$SIZE_T+3*8($sp)"); + ahi ($sp,-$stdframe); + st ("%r0","0($sp)"); # back-chain + + llgfr ($len,$len); # so that srlg works on $len +} else { + aghi ($sp,"-($stdframe+8*8)"); + stg ("%r0","0($sp)"); # back-chain + + std ("%f8","$stdframe+0*8($sp)"); + std ("%f9","$stdframe+1*8($sp)"); + std ("%f10","$stdframe+2*8($sp)"); + std ("%f11","$stdframe+3*8($sp)"); + std ("%f12","$stdframe+4*8($sp)"); + std ("%f13","$stdframe+5*8($sp)"); + std ("%f14","$stdframe+6*8($sp)"); + std ("%f15","$stdframe+7*8($sp)"); +} + larl ("%r1",".Lconst"); + vgmg ($mask26,38,63); + vlm ($bswaplo,$bswapmi,"16(%r1)"); + + < ("%r0","24($ctx)"); # is_base2_26? 
+ jnz (".Lskip_init"); + + lg ($h0,"32($ctx)"); # load key base 2^64 + lg ($h1,"40($ctx)"); + + risbg ($d0,$h0,38,0x80+63,38); # base 2^64 -> 2^26 + srlg ($d1,$h0,52); + risbg ($h0,$h0,38,0x80+63,0); + vlvgg ($R0,$h0,0); + risbg ($d1,$h1,38,51,12); + vlvgg ($R1,$d0,0); + risbg ($d0,$h1,38,63,50); + vlvgg ($R2,$d1,0); + srlg ($d1,$h1,40); + vlvgg ($R3,$d0,0); + vlvgg ($R4,$d1,0); + + veslg ($S1,$R1,2); + veslg ($S2,$R2,2); + veslg ($S3,$R3,2); + veslg ($S4,$R4,2); + vlr ($H0,$R0); + vlr ($H1,$R1); + vlr ($H2,$R2); + vlr ($H3,$R3); + vlr ($H4,$R4); + vag ($S1,$S1,$R1); # * 5 + vag ($S2,$S2,$R2); + vag ($S3,$S3,$R3); + vag ($S4,$S4,$R4); + + brasl ("%r14","__poly1305_mul"); # r^1:- * r^1:- + + vpdi ($R0,$H0,$R0,0); # r^2:r^1 + vpdi ($R1,$H1,$R1,0); + vpdi ($R2,$H2,$R2,0); + vpdi ($R3,$H3,$R3,0); + vpdi ($R4,$H4,$R4,0); + vpdi ($H0,$H0,$H0,0); # r^2:r^2 + vpdi ($H1,$H1,$H1,0); + vpdi ($H2,$H2,$H2,0); + vpdi ($H3,$H3,$H3,0); + vpdi ($H4,$H4,$H4,0); + veslg ($S1,$R1,2); + veslg ($S2,$R2,2); + veslg ($S3,$R3,2); + veslg ($S4,$R4,2); + vag ($S1,$S1,$R1); # * 5 + vag ($S2,$S2,$R2); + vag ($S3,$S3,$R3); + vag ($S4,$S4,$R4); + + brasl ("%r14,__poly1305_mul"); # r^2:r^2 * r^2:r^1 + + vl ($I0,"0(%r1)"); # borrow $I0 + vperm ($R0,$R0,$H0,$I0); # r^2:r^4:r^1:r^3 + vperm ($R1,$R1,$H1,$I0); + vperm ($R2,$R2,$H2,$I0); + vperm ($R3,$R3,$H3,$I0); + vperm ($R4,$R4,$H4,$I0); + veslf ($S1,$R1,2); + veslf ($S2,$R2,2); + veslf ($S3,$R3,2); + veslf ($S4,$R4,2); + vaf ($S1,$S1,$R1); # * 5 + vaf ($S2,$S2,$R2); + vaf ($S3,$S3,$R3); + vaf ($S4,$S4,$R4); + + lg ($h0,"0($ctx)"); # load hash base 2^64 + lg ($h1,"8($ctx)"); + lg ($h2,"16($ctx)"); + + vzero ($H0); + vzero ($H1); + vzero ($H2); + vzero ($H3); + vzero ($H4); + + risbg ($d0,$h0,38,0x80+63,38); # base 2^64 -> 2^26 + srlg ($d1,$h0,52); + risbg ($h0,$h0,38,0x80+63,0); + vlvgg ($H0,$h0,0); + risbg ($d1,$h1,38,51,12); + vlvgg ($H1,$d0,0); + risbg ($d0,$h1,38,63,50); + vlvgg ($H2,$d1,0); + srlg ($d1,$h1,40); + vlvgg ($H3,$d0,0); + risbg ($d1,$h2,37,39,24); + vlvgg ($H4,$d1,0); + + lhi ("%r0",1); + st ("%r0","24($ctx)"); # set is_base2_26 + + vstm ($R0,$S4,"48($ctx)"); # save key schedule base 2^26 + + vpdi ($R0,$R0,$R0,0); # broadcast r^2:r^4 + vpdi ($R1,$R1,$R1,0); + vpdi ($S1,$S1,$S1,0); + vpdi ($R2,$R2,$R2,0); + vpdi ($S2,$S2,$S2,0); + vpdi ($R3,$R3,$R3,0); + vpdi ($S3,$S3,$S3,0); + vpdi ($R4,$R4,$R4,0); + vpdi ($S4,$S4,$S4,0); + + j (".Loaded_hash"); + +ALIGN (16); +LABEL (".Lskip_init"); + vllezf ($H0,"0($ctx)"); # load hash base 2^26 + vllezf ($H1,"4($ctx)"); + vllezf ($H2,"8($ctx)"); + vllezf ($H3,"12($ctx)"); + vllezf ($H4,"16($ctx)"); + + vlrepg ($R0,"0x30($ctx)"); # broadcast r^2:r^4 + vlrepg ($R1,"0x40($ctx)"); + vlrepg ($S1,"0x50($ctx)"); + vlrepg ($R2,"0x60($ctx)"); + vlrepg ($S2,"0x70($ctx)"); + vlrepg ($R3,"0x80($ctx)"); + vlrepg ($S3,"0x90($ctx)"); + vlrepg ($R4,"0xa0($ctx)"); + vlrepg ($S4,"0xb0($ctx)"); + +LABEL (".Loaded_hash"); + vzero ($I1); + vzero ($I3); + + vlm ($T1,$T4,"0x00($inp)"); # load first input block + la ($inp,"0x40($inp)"); + vgmg ($mask26,6,31); + vgmf ($I4,5,5); # padbit<<2 + + vperm ($I0,$T3,$T4,$bswaplo); + vperm ($I2,$T3,$T4,$bswapmi); + vperm ($T3,$T3,$T4,$bswaphi); + + verimg ($I1,$I0,$mask26,6); # >>26 + veslg ($I0,$I0,32); + veslg ($I2,$I2,28); # >>4 + verimg ($I3,$T3,$mask26,18); # >>14 + verimg ($I4,$T3,$mask26,58); # >>38 + vn ($I0,$I0,$mask26); + vn ($I2,$I2,$mask26); + vesrlf ($I4,$I4,2); # >>2 + + vgmg ($mask26,38,63); + vperm ($T3,$T1,$T2,$bswaplo); + vperm ($T4,$T1,$T2,$bswaphi); + vperm ($T2,$T1,$T2,$bswapmi); + + 
verimg ($I0,$T3,$mask26,0); + verimg ($I1,$T3,$mask26,38); # >>26 + verimg ($I2,$T2,$mask26,60); # >>4 + verimg ($I3,$T4,$mask26,50); # >>14 + vesrlg ($T4,$T4,40); + vo ($I4,$I4,$T4); + + srlg ("%r0",$len,6); +&{$z? \&aghi:\&ahi} ("%r0",-1); + +ALIGN (16); +LABEL (".Loop_vx"); + vmlef ($ACC0,$I0,$R0); + vmlef ($ACC1,$I0,$R1); + vmlef ($ACC2,$I0,$R2); + vmlef ($ACC3,$I0,$R3); + vmlef ($ACC4,$I0,$R4); + + vmalef ($ACC0,$I1,$S4,$ACC0); + vmalef ($ACC1,$I1,$R0,$ACC1); + vmalef ($ACC2,$I1,$R1,$ACC2); + vmalef ($ACC3,$I1,$R2,$ACC3); + vmalef ($ACC4,$I1,$R3,$ACC4); + + vaf ($H2,$H2,$I2); + vaf ($H0,$H0,$I0); + vaf ($H3,$H3,$I3); + vaf ($H1,$H1,$I1); + vaf ($H4,$H4,$I4); + + vmalef ($ACC0,$I2,$S3,$ACC0); + vmalef ($ACC1,$I2,$S4,$ACC1); + vmalef ($ACC2,$I2,$R0,$ACC2); + vmalef ($ACC3,$I2,$R1,$ACC3); + vmalef ($ACC4,$I2,$R2,$ACC4); + + vlm ($T1,$T4,"0x00($inp)"); # load next input block + la ($inp,"0x40($inp)"); + vgmg ($mask26,6,31); + + vmalef ($ACC0,$I3,$S2,$ACC0); + vmalef ($ACC1,$I3,$S3,$ACC1); + vmalef ($ACC2,$I3,$S4,$ACC2); + vmalef ($ACC3,$I3,$R0,$ACC3); + vmalef ($ACC4,$I3,$R1,$ACC4); + + vperm ($I0,$T3,$T4,$bswaplo); + vperm ($I2,$T3,$T4,$bswapmi); + vperm ($T3,$T3,$T4,$bswaphi); + + vmalef ($ACC0,$I4,$S1,$ACC0); + vmalef ($ACC1,$I4,$S2,$ACC1); + vmalef ($ACC2,$I4,$S3,$ACC2); + vmalef ($ACC3,$I4,$S4,$ACC3); + vmalef ($ACC4,$I4,$R0,$ACC4); + + verimg ($I1,$I0,$mask26,6); # >>26 + veslg ($I0,$I0,32); + veslg ($I2,$I2,28); # >>4 + verimg ($I3,$T3,$mask26,18); # >>14 + + vmalof ($ACC0,$H0,$R0,$ACC0); + vmalof ($ACC1,$H0,$R1,$ACC1); + vmalof ($ACC2,$H0,$R2,$ACC2); + vmalof ($ACC3,$H0,$R3,$ACC3); + vmalof ($ACC4,$H0,$R4,$ACC4); + + vgmf ($I4,5,5); # padbit<<2 + verimg ($I4,$T3,$mask26,58); # >>38 + vn ($I0,$I0,$mask26); + vn ($I2,$I2,$mask26); + vesrlf ($I4,$I4,2); # >>2 + + vmalof ($ACC0,$H1,$S4,$ACC0); + vmalof ($ACC1,$H1,$R0,$ACC1); + vmalof ($ACC2,$H1,$R1,$ACC2); + vmalof ($ACC3,$H1,$R2,$ACC3); + vmalof ($ACC4,$H1,$R3,$ACC4); + + vgmg ($mask26,38,63); + vperm ($T3,$T1,$T2,$bswaplo); + vperm ($T4,$T1,$T2,$bswaphi); + vperm ($T2,$T1,$T2,$bswapmi); + + vmalof ($ACC0,$H2,$S3,$ACC0); + vmalof ($ACC1,$H2,$S4,$ACC1); + vmalof ($ACC2,$H2,$R0,$ACC2); + vmalof ($ACC3,$H2,$R1,$ACC3); + vmalof ($ACC4,$H2,$R2,$ACC4); + + verimg ($I0,$T3,$mask26,0); + verimg ($I1,$T3,$mask26,38); # >>26 + verimg ($I2,$T2,$mask26,60); # >>4 + + vmalof ($ACC0,$H3,$S2,$ACC0); + vmalof ($ACC1,$H3,$S3,$ACC1); + vmalof ($ACC2,$H3,$S4,$ACC2); + vmalof ($ACC3,$H3,$R0,$ACC3); + vmalof ($ACC4,$H3,$R1,$ACC4); + + verimg ($I3,$T4,$mask26,50); # >>14 + vesrlg ($T4,$T4,40); + vo ($I4,$I4,$T4); + + vmalof ($ACC0,$H4,$S1,$ACC0); + vmalof ($ACC1,$H4,$S2,$ACC1); + vmalof ($ACC2,$H4,$S3,$ACC2); + vmalof ($ACC3,$H4,$S4,$ACC3); + vmalof ($ACC4,$H4,$R0,$ACC4); + + ################################################################ + # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + # and P. 
Schwabe + + vesrlg ($H4,$ACC3,26); + vesrlg ($H1,$ACC0,26); + vn ($H3,$ACC3,$mask26); + vn ($H0,$ACC0,$mask26); + vag ($H4,$H4,$ACC4); # h3 -> h4 + vag ($H1,$H1,$ACC1); # h0 -> h1 + + vesrlg ($ACC4,$H4,26); + vesrlg ($ACC1,$H1,26); + vn ($H4,$H4,$mask26); + vn ($H1,$H1,$mask26); + vag ($H0,$H0,$ACC4); + vag ($H2,$ACC2,$ACC1); # h1 -> h2 + + veslg ($ACC4,$ACC4,2); # <<2 + vesrlg ($ACC2,$H2,26); + vn ($H2,$H2,$mask26); + vag ($H0,$H0,$ACC4); # h4 -> h0 + vag ($H3,$H3,$ACC2); # h2 -> h3 + + vesrlg ($ACC0,$H0,26); + vesrlg ($ACC3,$H3,26); + vn ($H0,$H0,$mask26); + vn ($H3,$H3,$mask26); + vag ($H1,$H1,$ACC0); # h0 -> h1 + vag ($H4,$H4,$ACC3); # h3 -> h4 + +&{$z? \&brctg:\&brct} ("%r0",".Loop_vx"); + + vlm ($R0,$S4,"48($ctx)"); # load all powers + + lghi ("%r0",0x30); +&{$z? \&lcgr:\&lcr} ($len,$len); +&{$z? \&ngr:\&nr} ($len,"%r0"); +&{$z? \&slgr:\&slr} ($inp,$len); + +LABEL (".Last"); + vmlef ($ACC0,$I0,$R0); + vmlef ($ACC1,$I0,$R1); + vmlef ($ACC2,$I0,$R2); + vmlef ($ACC3,$I0,$R3); + vmlef ($ACC4,$I0,$R4); + + vmalef ($ACC0,$I1,$S4,$ACC0); + vmalef ($ACC1,$I1,$R0,$ACC1); + vmalef ($ACC2,$I1,$R1,$ACC2); + vmalef ($ACC3,$I1,$R2,$ACC3); + vmalef ($ACC4,$I1,$R3,$ACC4); + + vaf ($H0,$H0,$I0); + vaf ($H1,$H1,$I1); + vaf ($H2,$H2,$I2); + vaf ($H3,$H3,$I3); + vaf ($H4,$H4,$I4); + + vmalef ($ACC0,$I2,$S3,$ACC0); + vmalef ($ACC1,$I2,$S4,$ACC1); + vmalef ($ACC2,$I2,$R0,$ACC2); + vmalef ($ACC3,$I2,$R1,$ACC3); + vmalef ($ACC4,$I2,$R2,$ACC4); + + vmalef ($ACC0,$I3,$S2,$ACC0); + vmalef ($ACC1,$I3,$S3,$ACC1); + vmalef ($ACC2,$I3,$S4,$ACC2); + vmalef ($ACC3,$I3,$R0,$ACC3); + vmalef ($ACC4,$I3,$R1,$ACC4); + + vmalef ($ACC0,$I4,$S1,$ACC0); + vmalef ($ACC1,$I4,$S2,$ACC1); + vmalef ($ACC2,$I4,$S3,$ACC2); + vmalef ($ACC3,$I4,$S4,$ACC3); + vmalef ($ACC4,$I4,$R0,$ACC4); + + vmalof ($ACC0,$H0,$R0,$ACC0); + vmalof ($ACC1,$H0,$R1,$ACC1); + vmalof ($ACC2,$H0,$R2,$ACC2); + vmalof ($ACC3,$H0,$R3,$ACC3); + vmalof ($ACC4,$H0,$R4,$ACC4); + + vmalof ($ACC0,$H1,$S4,$ACC0); + vmalof ($ACC1,$H1,$R0,$ACC1); + vmalof ($ACC2,$H1,$R1,$ACC2); + vmalof ($ACC3,$H1,$R2,$ACC3); + vmalof ($ACC4,$H1,$R3,$ACC4); + + vmalof ($ACC0,$H2,$S3,$ACC0); + vmalof ($ACC1,$H2,$S4,$ACC1); + vmalof ($ACC2,$H2,$R0,$ACC2); + vmalof ($ACC3,$H2,$R1,$ACC3); + vmalof ($ACC4,$H2,$R2,$ACC4); + + vmalof ($ACC0,$H3,$S2,$ACC0); + vmalof ($ACC1,$H3,$S3,$ACC1); + vmalof ($ACC2,$H3,$S4,$ACC2); + vmalof ($ACC3,$H3,$R0,$ACC3); + vmalof ($ACC4,$H3,$R1,$ACC4); + + vmalof ($ACC0,$H4,$S1,$ACC0); + vmalof ($ACC1,$H4,$S2,$ACC1); + vmalof ($ACC2,$H4,$S3,$ACC2); + vmalof ($ACC3,$H4,$S4,$ACC3); + vmalof ($ACC4,$H4,$R0,$ACC4); + + ################################################################ + # horizontal addition + + vzero ($H0); + vsumqg ($ACC0,$ACC0,$H0); + vsumqg ($ACC1,$ACC1,$H0); + vsumqg ($ACC2,$ACC2,$H0); + vsumqg ($ACC3,$ACC3,$H0); + vsumqg ($ACC4,$ACC4,$H0); + + ################################################################ + # lazy reduction + + vesrlg ($H4,$ACC3,26); + vesrlg ($H1,$ACC0,26); + vn ($H3,$ACC3,$mask26); + vn ($H0,$ACC0,$mask26); + vag ($H4,$H4,$ACC4); # h3 -> h4 + vag ($H1,$H1,$ACC1); # h0 -> h1 + + vesrlg ($ACC4,$H4,26); + vesrlg ($ACC1,$H1,26); + vn ($H4,$H4,$mask26); + vn ($H1,$H1,$mask26); + vag ($H0,$H0,$ACC4); + vag ($H2,$ACC2,$ACC1); # h1 -> h2 + + veslg ($ACC4,$ACC4,2); # <<2 + vesrlg ($ACC2,$H2,26); + vn ($H2,$H2,$mask26); + vag ($H0,$H0,$ACC4); # h4 -> h0 + vag ($H3,$H3,$ACC2); # h2 -> h3 + + vesrlg ($ACC0,$H0,26); + vesrlg ($ACC3,$H3,26); + vn ($H0,$H0,$mask26); + vn ($H3,$H3,$mask26); + vag ($H1,$H1,$ACC0); # h0 -> h1 + vag 
($H4,$H4,$ACC3); # h3 -> h4 + +&{$z? \&clgfi:\&clfi} ($len,0); + je (".Ldone"); + + vlm ($T1,$T4,"0x00($inp)"); # load last partial block + vgmg ($mask26,6,31); + vgmf ($I4,5,5); # padbit<<2 + + vperm ($I0,$T3,$T4,$bswaplo); + vperm ($I2,$T3,$T4,$bswapmi); + vperm ($T3,$T3,$T4,$bswaphi); + + vl ($ACC0,"0x30($len,%r1)"); # borrow $ACC0,1 + vl ($ACC1,"0x60($len,%r1)"); + + verimg ($I1,$I0,$mask26,6); # >>26 + veslg ($I0,$I0,32); + veslg ($I2,$I2,28); # >>4 + verimg ($I3,$T3,$mask26,18); # >>14 + verimg ($I4,$T3,$mask26,58); # >>38 + vn ($I0,$I0,$mask26); + vn ($I2,$I2,$mask26); + vesrlf ($I4,$I4,2); # >>2 + + vgmg ($mask26,38,63); + vperm ($T3,$T1,$T2,$bswaplo); + vperm ($T4,$T1,$T2,$bswaphi); + vperm ($T2,$T1,$T2,$bswapmi); + + verimg ($I0,$T3,$mask26,0); + verimg ($I1,$T3,$mask26,38); # >>26 + verimg ($I2,$T2,$mask26,60); # >>4 + verimg ($I3,$T4,$mask26,50); # >>14 + vesrlg ($T4,$T4,40); + vo ($I4,$I4,$T4); + + vperm ($H0,$H0,$H0,$ACC0); # move hash to right lane + vn ($I0,$I0,$ACC1); # mask redundant lane[s] + vperm ($H1,$H1,$H1,$ACC0); + vn ($I1,$I1,$ACC1); + vperm ($H2,$H2,$H2,$ACC0); + vn ($I2,$I2,$ACC1); + vperm ($H3,$H3,$H3,$ACC0); + vn ($I3,$I3,$ACC1); + vperm ($H4,$H4,$H4,$ACC0); + vn ($I4,$I4,$ACC1); + + vaf ($I0,$I0,$H0); # accumulate hash + vzero ($H0); # wipe hash value + vaf ($I1,$I1,$H1); + vzero ($H1); + vaf ($I2,$I2,$H2); + vzero ($H2); + vaf ($I3,$I3,$H3); + vzero ($H3); + vaf ($I4,$I4,$H4); + vzero ($H4); + +&{$z? \&lghi:\&lhi} ($len,0); + j (".Last"); + # I don't bother to tell apart cases when only one multiplication + # pass is sufficient, because I argue that mispredicted branch + # penalties are comparable to overhead of sometimes redundant + # multiplication pass... + +LABEL (".Ldone"); + vstef ($H0,"0($ctx)",3); # store hash base 2^26 + vstef ($H1,"4($ctx)",3); + vstef ($H2,"8($ctx)",3); + vstef ($H3,"12($ctx)",3); + vstef ($H4,"16($ctx)",3); + +if ($z) { + ld ("%f8","$stdframe+0*8($sp)"); + ld ("%f9","$stdframe+1*8($sp)"); + ld ("%f10","$stdframe+2*8($sp)"); + ld ("%f11","$stdframe+3*8($sp)"); + ld ("%f12","$stdframe+4*8($sp)"); + ld ("%f13","$stdframe+5*8($sp)"); + ld ("%f14","$stdframe+6*8($sp)"); + ld ("%f15","$stdframe+7*8($sp)"); +&{$z? \&lmg:\&lm} ("%r10","%r15","$stdframe+8*8+10*$SIZE_T($sp)"); +} else { + ld ("%f4","$stdframe+16*$SIZE_T+2*8($sp)"); + ld ("%f6","$stdframe+16*$SIZE_T+3*8($sp)"); +&{$z? \&lmg:\&lm} ("%r10","%r15","$stdframe+10*$SIZE_T($sp)"); +} + br ("%r14"); +SIZE ("__poly1305_blocks_vx",".-__poly1305_blocks_vx"); } + +################ +# static void poly1305_emit(void *ctx, unsigned char mac[16], +# const u32 nonce[4]) { my ($mac,$nonce)=($inp,$len); -my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9)); - -$code.=<<___; -.globl poly1305_emit -.type poly1305_emit,\@function -.align 16 -poly1305_emit: - stm${g} %r6,%r9,`6*$SIZE_T`($sp) - - lg $h0,0($ctx) - lg $h1,8($ctx) - lg $h2,16($ctx) - - lghi %r0,5 - lghi %r1,0 - lgr $d0,$h0 - lgr $d1,$h1 - - algr $h0,%r0 # compare to modulus - alcgr $h1,%r1 - alcgr $h2,%r1 - - srlg $h2,$h2,2 # did it borrow/carry? 
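################
# Editor's sketch (illustration only, not part of the diff): what the
# "compare to modulus" sequence in poly1305_emit (both the old and the new
# code around this point) computes.  Adding 5 to the accumulated hash
# carries out of bit 130 exactly when h >= 2^130 - 5, so that carry selects
# between h and h - p before the nonce is added modulo 2^128.  The helper
# name poly1305_tag is made up for this example.

use strict;
use warnings;
use Math::BigInt;

sub poly1305_tag {
    my ($h, $nonce) = @_;                             # Math::BigInt values
    my $p = Math::BigInt->new(2)->bpow(130)->bsub(5); # p = 2^130 - 5
    $h = $h->copy;
    # carry out of bit 130 of h+5  <=>  h >= p, in which case reduce once
    $h->bsub($p) unless $h->copy->badd(5)->brsft(130)->is_zero();
    # the tag is the low 128 bits of h + nonce
    return $h->badd($nonce)->bmod(Math::BigInt->new(2)->bpow(128));
}

# h just above p reduces to 2; adding nonce 7 gives a tag value of 0x9
print poly1305_tag(Math::BigInt->new(2)->bpow(130)->bsub(3),
                   Math::BigInt->new(7))->as_hex, "\n";
################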
- slgr %r1,$h2 # 0-$h2>>2 - lg $h2,0($nonce) # load nonce - lghi %r0,-1 - lg $ctx,8($nonce) - xgr %r0,%r1 # ~%r1 - - ngr $h0,%r1 - ngr $d0,%r0 - ngr $h1,%r1 - ngr $d1,%r0 - ogr $h0,$d0 - rllg $d0,$h2,32 # flip nonce words - ogr $h1,$d1 - rllg $d1,$ctx,32 - - algr $h0,$d0 # accumulate nonce - alcgr $h1,$d1 - - strvg $h0,0($mac) # write little-endian result - strvg $h1,8($mac) - - lm${g} %r6,%r9,`6*$SIZE_T`($sp) - br %r14 -.size poly1305_emit,.-poly1305_emit - -.string "Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>" -___ +my ($h0,$h1,$h2,$d0,$d1,$d2)=map("%r$_",(5..10)); + +GLOBL ("poly1305_emit"); +TYPE ("poly1305_emit","\@function"); +ALIGN (16); +LABEL ("poly1305_emit"); +LABEL (".Lpoly1305_emit"); +&{$z? \&stmg:\&stm} ("%r6","%r10","6*$SIZE_T($sp)"); + + lg ($d0,"0($ctx)"); + lg ($d1,"8($ctx)"); + lg ($d2,"16($ctx)"); + + llgfr ("%r0",$d0); # base 2^26 -> base 2^64 + srlg ($h0,$d0,32); + llgfr ("%r1",$d1); + srlg ($h1,$d1,32); + srlg ($h2,$d2,32); + + sllg ("%r0","%r0",26); + algr ($h0,"%r0"); + sllg ("%r0",$h1,52); + srlg ($h1,$h1,12); + sllg ("%r1","%r1",14); + algr ($h0,"%r0"); + alcgr ($h1,"%r1"); + sllg ("%r0",$h2,40); + srlg ($h2,$h2,24); + lghi ("%r1",0); + algr ($h1,"%r0"); + alcgr ($h2,"%r1"); + + llgf ("%r0","24($ctx)"); # is_base2_26 + lcgr ("%r0","%r0"); + + xgr ($h0,$d0); # choose between radixes + xgr ($h1,$d1); + xgr ($h2,$d2); + ngr ($h0,"%r0"); + ngr ($h1,"%r0"); + ngr ($h2,"%r0"); + xgr ($h0,$d0); + xgr ($h1,$d1); + xgr ($h2,$d2); + + lghi ("%r0",5); + lgr ($d0,$h0); + lgr ($d1,$h1); + + algr ($h0,"%r0"); # compare to modulus + alcgr ($h1,"%r1"); + alcgr ($h2,"%r1"); + + srlg ($h2,$h2,2); # did it borrow/carry? + slgr ("%r1",$h2); # 0-$h2>>2 + lg ($d2,"0($nonce)"); # load nonce + lg ($ctx,"8($nonce)"); + + xgr ($h0,$d0); + xgr ($h1,$d1); + ngr ($h0,"%r1"); + ngr ($h1,"%r1"); + xgr ($h0,$d0); + rllg ($d0,$d2,32); # flip nonce words + xgr ($h1,$d1); + rllg ($d1,$ctx,32); + + algr ($h0,$d0); # accumulate nonce + alcgr ($h1,$d1); + + strvg ($h0,"0($mac)"); # write little-endian result + strvg ($h1,"8($mac)"); + +&{$z? \&lmg:\&lm} ("%r6","%r10","6*$SIZE_T($sp)"); + br ("%r14"); +SIZE ("poly1305_emit",".-poly1305_emit"); } -$code =~ s/\`([^\`]*)\`/eval $1/gem; -$code =~ s/\b(srlg\s+)(%r[0-9]+\s*,)\s*([0-9]+)/$1$2$2$3/gm; +################ + +ALIGN (16); +LABEL (".Lconst"); +LONG (0x04050607,0x14151617,0x0c0d0e0f,0x1c1d1e1f); # merge odd +LONG (0x07060504,0x03020100,0x17161514,0x13121110); # byte swap masks +LONG (0x0f0e0d0c,0x0b0a0908,0x1f1e1d1c,0x1b1a1918); +LONG (0x00000000,0x09080706,0x00000000,0x19181716); + +LONG (0x00000000,0x00000000,0x00000000,0x0c0d0e0f); # magic tail masks +LONG (0x0c0d0e0f,0x00000000,0x00000000,0x00000000); +LONG (0x00000000,0x00000000,0x0c0d0e0f,0x00000000); + +LONG (0xffffffff,0x00000000,0xffffffff,0xffffffff); +LONG (0xffffffff,0x00000000,0xffffffff,0x00000000); +LONG (0x00000000,0x00000000,0xffffffff,0x00000000); + +STRING ("\"Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>\""); -print $code; -close STDOUT or die "error closing STDOUT: $!"; +PERLASM_END(); diff --git a/crypto/poly1305/asm/poly1305-sparcv9.pl b/crypto/poly1305/asm/poly1305-sparcv9.pl index 997e0d8344c6..dc592a07acac 100755 --- a/crypto/poly1305/asm/poly1305-sparcv9.pl +++ b/crypto/poly1305/asm/poly1305-sparcv9.pl @@ -1,7 +1,7 @@ #! /usr/bin/env perl -# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. +# Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved. # -# Licensed under the OpenSSL license (the "License"). 
You may not use +# Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html @@ -41,19 +41,21 @@ # (***) Multi-process benchmark saturates at ~12.5x single-process # result on 8-core processor, or ~21GBps per 2.85GHz socket. -my $output = pop; -open STDOUT,">$output"; +# $output is the last argument if it looks like a file (it has an extension) +my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; + +open STDOUT,">$output" if $output; my ($ctx,$inp,$len,$padbit,$shl,$shr) = map("%i$_",(0..5)); my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4) = map("%l$_",(0..7)); my ($h0,$h1,$h2,$h3, $t0,$t1,$t2) = map("%o$_",(0..5,7)); my ($d0,$d1,$d2,$d3) = map("%g$_",(1..4)); -my $output = pop; -open STDOUT,">$stdout"; - $code.=<<___; -#include "sparc_arch.h" +#ifndef __ASSEMBLER__ +# define __ASSEMBLER__ 1 +#endif +#include "crypto/sparc_arch.h" #ifdef __arch64__ .register %g2,#scratch diff --git a/crypto/poly1305/asm/poly1305-x86.pl b/crypto/poly1305/asm/poly1305-x86.pl index 2ae16a230b66..c91d01fb3ba4 100755 --- a/crypto/poly1305/asm/poly1305-x86.pl +++ b/crypto/poly1305/asm/poly1305-x86.pl @@ -1,7 +1,7 @@ #! /usr/bin/env perl # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. # -# Licensed under the OpenSSL license (the "License"). You may not use +# Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html @@ -47,8 +47,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; -$output=pop; -open STDOUT,">$output"; +$output=pop and open STDOUT,">$output"; &asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl index 5f834d8faf2a..4cddca1c514c 100755 --- a/crypto/poly1305/asm/poly1305-x86_64.pl +++ b/crypto/poly1305/asm/poly1305-x86_64.pl @@ -1,7 +1,7 @@ #! /usr/bin/env perl -# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. +# Copyright 2016-2023 The OpenSSL Project Authors. All Rights Reserved. # -# Licensed under the OpenSSL license (the "License"). You may not use +# Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html @@ -63,9 +63,10 @@ # (***) strangely enough performance seems to vary from core to core, # listed result is best case; -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); @@ -94,7 +95,8 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0 $avx = ($2>=3.0) + ($2>3.0); } -open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; *STDOUT=*OUT; my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); @@ -193,7 +195,7 @@ $code.=<<___ if ($avx>1); bt \$`5+32`,%r9 # AVX2? cmovc %rax,%r10 ___ -$code.=<<___ if ($avx>3); +$code.=<<___ if ($avx>3 && !$win64); mov \$`(1<<31|1<<21|1<<16)`,%rax shr \$32,%r9 and %rax,%r9 @@ -2722,7 +2724,7 @@ $code.=<<___; .cfi_endproc .size poly1305_blocks_avx512,.-poly1305_blocks_avx512 ___ -if ($avx>3) { +if ($avx>3 && !$win64) { ######################################################################## # VPMADD52 version using 2^44 radix. # @@ -2806,6 +2808,7 @@ $code.=<<___; .align 32 poly1305_blocks_vpmadd52: .cfi_startproc + endbranch shr \$4,$len jz .Lno_data_vpmadd52 # too short @@ -3739,6 +3742,7 @@ $code.=<<___; .align 32 poly1305_emit_base2_44: .cfi_startproc + endbranch mov 0($ctx),%r8 # load hash value mov 8($ctx),%r9 mov 16($ctx),%r10 |
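################
# Editor's sketch (illustration only, not part of the diff): the argument
# handling this patch introduces across the generators.  The last argument
# is taken as the output file iff it has an extension, and the first
# argument as the perlasm flavour iff it does not look like a file; either
# may be absent.  The helper name parse_args and the sample argv values are
# made up for this example.

use strict;
use warnings;

sub parse_args {
    my @argv = @_;
    # $output is the last argument if it looks like a file (it has an extension)
    my $output  = @argv && $argv[-1] =~ m|\.\w+$| ? pop(@argv)   : undef;
    # $flavour is the first argument if it doesn't look like a file
    my $flavour = @argv && $argv[0]  !~ m|\.|     ? shift(@argv) : undef;
    return ($flavour, $output);
}

for my $case (["linux64le", "poly1305-ppc.s"],    # flavour and output
              ["poly1305-armv4.S"],               # output only
              ["void"]) {                         # flavour only
    my ($flavour, $output) = parse_args(@$case);
    printf "argv=(%s) -> flavour=%s, output=%s\n",
           join(" ", @$case), $flavour // "undef", $output // "undef";
}
################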