path: root/crypto/poly1305/asm
Diffstat (limited to 'crypto/poly1305/asm')
-rwxr-xr-x  crypto/poly1305/asm/poly1305-armv4.pl       53
-rwxr-xr-x  crypto/poly1305/asm/poly1305-armv8.pl       69
-rwxr-xr-x  crypto/poly1305/asm/poly1305-c64xplus.pl     5
-rw-r--r--  crypto/poly1305/asm/poly1305-ia64.S        365
-rwxr-xr-x  crypto/poly1305/asm/poly1305-mips.pl        10
-rwxr-xr-x  crypto/poly1305/asm/poly1305-ppc.pl       1564
-rwxr-xr-x  crypto/poly1305/asm/poly1305-ppcfp.pl       10
-rwxr-xr-x  crypto/poly1305/asm/poly1305-s390x.pl     1123
-rwxr-xr-x  crypto/poly1305/asm/poly1305-sparcv9.pl     18
-rwxr-xr-x  crypto/poly1305/asm/poly1305-x86.pl          5
-rwxr-xr-x  crypto/poly1305/asm/poly1305-x86_64.pl      20
11 files changed, 2854 insertions, 388 deletions
diff --git a/crypto/poly1305/asm/poly1305-armv4.pl b/crypto/poly1305/asm/poly1305-armv4.pl
index 70f46cd140aa..041bfd46e699 100755
--- a/crypto/poly1305/asm/poly1305-armv4.pl
+++ b/crypto/poly1305/asm/poly1305-armv4.pl
@@ -1,7 +1,7 @@
#! /usr/bin/env perl
-# Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
-# Licensed under the OpenSSL license (the "License"). You may not use
+# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
@@ -28,9 +28,10 @@
# the cost of 15/12% regression on Cortex-A5/A7, it's even possible
# to improve Cortex-A9 result, but then A5/A7 loose more than 20%;
-$flavour = shift;
-if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
@@ -38,9 +39,10 @@ if ($flavour && $flavour ne "void") {
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
- open STDOUT,"| \"$^X\" $xlate $flavour $output";
+ open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
+ or die "can't call $xlate: $!";
} else {
- open STDOUT,">$output";
+ $output and open STDOUT,">$output";
}
($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
@@ -48,7 +50,6 @@ if ($flavour && $flavour ne "void") {
$code.=<<___;
#include "arm_arch.h"
-.text
#if defined(__thumb2__)
.syntax unified
.thumb
@@ -56,6 +57,8 @@ $code.=<<___;
.code 32
#endif
+.text
+
.globl poly1305_emit
.globl poly1305_blocks
.globl poly1305_init
@@ -100,8 +103,10 @@ poly1305_init:
and r4,r4,r10
#if __ARM_MAX_ARCH__>=7
+# if !defined(_WIN32)
ldr r12,[r11,r12] @ OPENSSL_armcap_P
-# ifdef __APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
ldr r12,[r12]
# endif
#endif
@@ -116,32 +121,22 @@ poly1305_init:
#if __ARM_MAX_ARCH__>=7
tst r12,#ARMV7_NEON @ check for NEON
-# ifdef __APPLE__
- adr r9,poly1305_blocks_neon
- adr r11,poly1305_blocks
-# ifdef __thumb2__
- it ne
-# endif
+# ifdef __thumb2__
+ adr r9,.Lpoly1305_blocks_neon
+ adr r11,.Lpoly1305_blocks
+ adr r12,.Lpoly1305_emit
+ adr r10,.Lpoly1305_emit_neon
+ itt ne
movne r11,r9
- adr r12,poly1305_emit
- adr r10,poly1305_emit_neon
-# ifdef __thumb2__
- it ne
-# endif
movne r12,r10
+ orr r11,r11,#1 @ thumb-ify address
+ orr r12,r12,#1
# else
-# ifdef __thumb2__
- itete eq
-# endif
addeq r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
addne r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
-# ifdef __thumb2__
- orr r12,r12,#1 @ thumb-ify address
- orr r11,r11,#1
-# endif
#endif
ldrb r9,[$inp,#11]
orr r6,r6,r7,lsl#8
@@ -1232,7 +1227,11 @@ poly1305_emit_neon:
.Lzeros:
.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
.word OPENSSL_armcap_P-.Lpoly1305_init
+# endif
#endif
___
} }
diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl
index 2a42b64a929c..dc39f4053fe6 100755
--- a/crypto/poly1305/asm/poly1305-armv8.pl
+++ b/crypto/poly1305/asm/poly1305-armv8.pl
@@ -1,7 +1,7 @@
#! /usr/bin/env perl
-# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2016-2023 The OpenSSL Project Authors. All Rights Reserved.
#
-# Licensed under the OpenSSL license (the "License"). You may not use
+# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
@@ -29,20 +29,24 @@
# X-Gene 2.13/+68% 2.27
# Mongoose 1.77/+75% 1.12
# Kryo 2.70/+55% 1.13
+# ThunderX2 1.17/+95% 1.36
#
# (*) estimate based on resources availability is less than 1.0,
# i.e. measured result is worse than expected, presumably binary
# translator is not almighty;
-$flavour=shift;
-$output=shift;
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+ or die "can't call $xlate: $!";
*STDOUT=*OUT;
my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
@@ -75,18 +79,13 @@ poly1305_init:
csel x0,xzr,x0,eq
b.eq .Lno_key
-#ifdef __ILP32__
- ldrsw $t1,.LOPENSSL_armcap_P
-#else
- ldr $t1,.LOPENSSL_armcap_P
-#endif
- adr $t0,.LOPENSSL_armcap_P
+ adrp x17,OPENSSL_armcap_P
+ ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
ldp $r0,$r1,[$inp] // load key
mov $s1,#0xfffffffc0fffffff
movk $s1,#0x0fff,lsl#48
- ldr w17,[$t0,$t1]
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev $r0,$r0 // flip bytes
rev $r1,$r1
#endif
@@ -97,10 +96,10 @@ poly1305_init:
tst w17,#ARMV7_NEON
- adr $d0,poly1305_blocks
- adr $r0,poly1305_blocks_neon
- adr $d1,poly1305_emit
- adr $r1,poly1305_emit_neon
+ adr $d0,.Lpoly1305_blocks
+ adr $r0,.Lpoly1305_blocks_neon
+ adr $d1,.Lpoly1305_emit
+ adr $r1,.Lpoly1305_emit_neon
csel $d0,$d0,$r0,eq
csel $d1,$d1,$r1,eq
@@ -119,6 +118,7 @@ poly1305_init:
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
+.Lpoly1305_blocks:
ands $len,$len,#-16
b.eq .Lno_data
@@ -132,7 +132,7 @@ poly1305_blocks:
.Loop:
ldp $t0,$t1,[$inp],#16 // load input
sub $len,$len,#16
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev $t0,$t0
rev $t1,$t1
#endif
@@ -183,6 +183,7 @@ poly1305_blocks:
.type poly1305_emit,%function
.align 5
poly1305_emit:
+.Lpoly1305_emit:
ldp $h0,$h1,[$ctx] // load hash base 2^64
ldr $h2,[$ctx,#16]
ldp $t0,$t1,[$nonce] // load nonce
@@ -196,13 +197,13 @@ poly1305_emit:
csel $h0,$h0,$d0,eq
csel $h1,$h1,$d1,eq
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
ror $t0,$t0,#32 // flip nonce words
ror $t1,$t1,#32
#endif
adds $h0,$h0,$t0 // accumulate nonce
adc $h1,$h1,$t1
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev $h0,$h0 // flip output bytes
rev $h1,$h1
#endif
@@ -289,10 +290,11 @@ poly1305_splat:
.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
ldr $is_base2_26,[$ctx,#24]
cmp $len,#128
b.hs .Lblocks_neon
- cbz $is_base2_26,poly1305_blocks
+ cbz $is_base2_26,.Lpoly1305_blocks
.Lblocks_neon:
.inst 0xd503233f // paciasp
@@ -333,7 +335,7 @@ poly1305_blocks_neon:
adcs $h1,$h1,xzr
adc $h2,$h2,xzr
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev $d0,$d0
rev $d1,$d1
#endif
@@ -379,7 +381,7 @@ poly1305_blocks_neon:
ldp $d0,$d1,[$inp],#16 // load input
sub $len,$len,#16
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev $d0,$d0
rev $d1,$d1
#endif
@@ -435,7 +437,7 @@ poly1305_blocks_neon:
csel $in2,$zeros,$in2,lo
mov x4,#1
- str x4,[$ctx,#-24] // set is_base2_26
+ stur x4,[$ctx,#-24] // set is_base2_26
sub $ctx,$ctx,#48 // restore original $ctx
b .Ldo_neon
@@ -464,7 +466,7 @@ poly1305_blocks_neon:
lsl $padbit,$padbit,#24
add x15,$ctx,#48
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
@@ -500,7 +502,7 @@ poly1305_blocks_neon:
ld1 {$S2,$R3,$S3,$R4},[x15],#64
ld1 {$S4},[x15]
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
@@ -561,7 +563,7 @@ poly1305_blocks_neon:
umull $ACC1,$IN23_0,${R1}[2]
ldp x9,x13,[$in2],#48
umull $ACC0,$IN23_0,${R0}[2]
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
@@ -626,7 +628,7 @@ poly1305_blocks_neon:
umlal $ACC4,$IN01_2,${R2}[0]
umlal $ACC1,$IN01_2,${S4}[0]
umlal $ACC2,$IN01_2,${R0}[0]
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
@@ -872,6 +874,7 @@ poly1305_blocks_neon:
.type poly1305_emit_neon,%function
.align 5
poly1305_emit_neon:
+.Lpoly1305_emit_neon:
ldr $is_base2_26,[$ctx,#24]
cbz $is_base2_26,poly1305_emit
@@ -906,13 +909,13 @@ poly1305_emit_neon:
csel $h0,$h0,$d0,eq
csel $h1,$h1,$d1,eq
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
ror $t0,$t0,#32 // flip nonce words
ror $t1,$t1,#32
#endif
adds $h0,$h0,$t0 // accumulate nonce
adc $h1,$h1,$t1
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev $h0,$h0 // flip output bytes
rev $h1,$h1
#endif
@@ -924,12 +927,6 @@ poly1305_emit_neon:
.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0
-.LOPENSSL_armcap_P:
-#ifdef __ILP32__
-.long OPENSSL_armcap_P-.
-#else
-.quad OPENSSL_armcap_P-.
-#endif
.asciz "Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
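
Editor's note: both ARM diffs above keep the same runtime-dispatch idea in poly1305_init, namely that the OPENSSL_armcap_P capability word is read and, when the ARMV7_NEON bit is set, the NEON entry points are handed back instead of the scalar ones. The following is a rough C sketch of that selection, not part of the patch; the function-pointer types and the ARMV7_NEON value are illustrative assumptions, only the symbol names come from the diff.

#include <stddef.h>

extern unsigned int OPENSSL_armcap_P;
#define ARMV7_NEON (1 << 0)                 /* placeholder value; the real constant lives in arm_arch.h */

typedef void (*poly1305_blocks_f)(void *ctx, const unsigned char *inp,
                                  size_t len, unsigned int padbit);
typedef void (*poly1305_emit_f)(void *ctx, unsigned char mac[16],
                                const unsigned int nonce[4]);

void poly1305_blocks(void *, const unsigned char *, size_t, unsigned int);
void poly1305_blocks_neon(void *, const unsigned char *, size_t, unsigned int);
void poly1305_emit(void *, unsigned char [16], const unsigned int [4]);
void poly1305_emit_neon(void *, unsigned char [16], const unsigned int [4]);

/* Sketch of the selection poly1305_init performs before returning the
 * function table to the caller. */
static void poly1305_pick(poly1305_blocks_f *blocks, poly1305_emit_f *emit)
{
    if (OPENSSL_armcap_P & ARMV7_NEON) {    /* NEON present: vector paths */
        *blocks = poly1305_blocks_neon;
        *emit   = poly1305_emit_neon;
    } else {                                /* otherwise the scalar paths */
        *blocks = poly1305_blocks;
        *emit   = poly1305_emit;
    }
}
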
diff --git a/crypto/poly1305/asm/poly1305-c64xplus.pl b/crypto/poly1305/asm/poly1305-c64xplus.pl
index 93fef37e605b..2bcdced7f45c 100755
--- a/crypto/poly1305/asm/poly1305-c64xplus.pl
+++ b/crypto/poly1305/asm/poly1305-c64xplus.pl
@@ -1,7 +1,7 @@
#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
-# Licensed under the OpenSSL license (the "License"). You may not use
+# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
@@ -26,8 +26,7 @@
# time dependent on input length. This module on the other hand is free
# from such limitation.
-$output=pop;
-open STDOUT,">$output";
+$output=pop and open STDOUT,">$output";
($CTXA,$INPB,$LEN,$PADBIT)=("A4","B4","A6","B6");
($H0,$H1,$H2,$H3,$H4,$H4a)=("A8","B8","A10","B10","B2",$LEN);
diff --git a/crypto/poly1305/asm/poly1305-ia64.S b/crypto/poly1305/asm/poly1305-ia64.S
new file mode 100644
index 000000000000..54d6454f0322
--- /dev/null
+++ b/crypto/poly1305/asm/poly1305-ia64.S
@@ -0,0 +1,365 @@
+// ====================================================================
+// Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL
+// project.
+// ====================================================================
+//
+// Poly1305 for Itanium.
+//
+// January 2019
+//
+// Performance was reported to be ~2.1 cycles per byte on Itanium 2.
+// With exception for processors in 95xx family, which have higher
+// floating-point instructions' latencies and deliver ~2.6 cpb.
+// Comparison to compiler-generated code is not exactly fair, because
+// of different radixes. But just for reference, it was observed to be
+// >3x faster. Originally it was argued that floating-point base 2^32
+// implementation would be optimal. Upon closer look estimate for below
+// integer base 2^64 implementation turned to be approximately same on
+// Itanium 2. But floating-point code would be larger, and have higher
+// overhead, which would negatively affect small-block performance...
+
+#if defined(_HPUX_SOURCE)
+# if !defined(_LP64)
+# define ADDP addp4
+# else
+# define ADDP add
+# endif
+# define RUM rum
+# define SUM sum
+#else
+# define ADDP add
+# define RUM nop
+# define SUM nop
+#endif
+
+.text
+.explicit
+
+.global poly1305_init#
+.proc poly1305_init#
+.align 64
+poly1305_init:
+ .prologue
+ .save ar.pfs,r2
+{ .mmi; alloc r2=ar.pfs,2,0,0,0
+ cmp.eq p6,p7=0,r33 } // key == NULL?
+{ .mmi; ADDP r9=8,r32
+ ADDP r10=16,r32
+ ADDP r32=0,r32 };;
+ .body
+{ .mmi; st8 [r32]=r0,24 // ctx->h0 = 0
+ st8 [r9]=r0 // ctx->h1 = 0
+(p7) ADDP r8=0,r33 }
+{ .mib; st8 [r10]=r0 // ctx->h2 = 0
+(p6) mov r8=0
+(p6) br.ret.spnt b0 };;
+
+{ .mmi; ADDP r9=1,r33
+ ADDP r10=2,r33
+ ADDP r11=3,r33 };;
+{ .mmi; ld1 r16=[r8],4 // load key, little-endian
+ ld1 r17=[r9],4 }
+{ .mmi; ld1 r18=[r10],4
+ ld1 r19=[r11],4 };;
+{ .mmi; ld1 r20=[r8],4
+ ld1 r21=[r9],4 }
+{ .mmi; ld1 r22=[r10],4
+ ld1 r23=[r11],4
+ and r19=15,r19 };;
+{ .mmi; ld1 r24=[r8],4
+ ld1 r25=[r9],4
+ and r20=-4,r20 }
+{ .mmi; ld1 r26=[r10],4
+ ld1 r27=[r11],4
+ and r23=15,r23 };;
+{ .mmi; ld1 r28=[r8],4
+ ld1 r29=[r9],4
+ and r24=-4,r24 }
+{ .mmi; ld1 r30=[r10],4
+ ld1 r31=[r11],4
+ and r27=15,r27 };;
+
+{ .mii; and r28=-4,r28
+ dep r16=r17,r16,8,8
+ dep r18=r19,r18,8,8 };;
+{ .mii; and r31=15,r31
+ dep r16=r18,r16,16,16
+ dep r20=r21,r20,8,8 };;
+{ .mii; dep r16=r20,r16,32,16
+ dep r22=r23,r22,8,8 };;
+{ .mii; dep r16=r22,r16,48,16
+ dep r24=r25,r24,8,8 };;
+{ .mii; dep r26=r27,r26,8,8
+ dep r28=r29,r28,8,8 };;
+{ .mii; dep r24=r26,r24,16,16
+ dep r30=r31,r30,8,8 };;
+{ .mii; st8 [r32]=r16,8 // ctx->r0
+ dep r24=r28,r24,32,16;;
+ dep r24=r30,r24,48,16 };;
+{ .mii; st8 [r32]=r24,8 // ctx->r1
+ shr.u r25=r24,2;;
+ add r25=r25,r24 };;
+{ .mib; st8 [r32]=r25 // ctx->s1
+ mov r8=0
+ br.ret.sptk b0 };;
+.endp poly1305_init#
+
+h0=r17; h1=r18; h2=r19;
+i0=r20; i1=r21;
+HF0=f8; HF1=f9; HF2=f10;
+RF0=f11; RF1=f12; SF1=f13;
+
+.global poly1305_blocks#
+.proc poly1305_blocks#
+.align 64
+poly1305_blocks:
+ .prologue
+ .save ar.pfs,r2
+{ .mii; alloc r2=ar.pfs,4,1,0,0
+ .save ar.lc,r3
+ mov r3=ar.lc
+ .save pr,r36
+ mov r36=pr }
+
+ .body
+{ .mmi; ADDP r8=0,r32
+ ADDP r9=8,r32
+ and r29=7,r33 };;
+{ .mmi; ld8 h0=[r8],16
+ ld8 h1=[r9],16
+ and r33=-8,r33 };;
+{ .mmi; ld8 h2=[r8],16
+ ldf8 RF0=[r9],16
+ shr.u r34=r34,4 };;
+{ .mmi; ldf8 RF1=[r8],-32
+ ldf8 SF1=[r9],-32
+ cmp.ltu p16,p17=1,r34 };;
+{ .mmi;
+(p16) add r34=-2,r34
+(p17) mov r34=0
+ ADDP r10=0,r33 }
+{ .mii; ADDP r11=8,r33
+(p16) mov ar.ec=2
+(p17) mov ar.ec=1 };;
+{ .mib; RUM 1<<1 // go little-endian
+ mov ar.lc=r34
+ brp.loop.imp .Loop,.Lcend-16 }
+
+{ .mmi; cmp.eq p8,p7=0,r29
+ cmp.eq p9,p0=1,r29
+ cmp.eq p10,p0=2,r29 }
+{ .mmi; cmp.eq p11,p0=3,r29
+ cmp.eq p12,p0=4,r29
+ cmp.eq p13,p0=5,r29 }
+{ .mmi; cmp.eq p14,p0=6,r29
+ cmp.eq p15,p0=7,r29
+ add r16=16,r10 };;
+
+{ .mmb;
+(p8) ld8 i0=[r10],16 // aligned input
+(p8) ld8 i1=[r11],16
+(p8) br.cond.sptk .Loop };;
+
+ // align first block
+ .pred.rel "mutex",p8,p9,p10,p11,p12,p13,p14,p15
+{ .mmi; (p7) ld8 r14=[r10],24
+ (p7) ld8 r15=[r11],24 }
+
+{ .mii; (p7) ld8 r16=[r16]
+ nop.i 0;;
+ (p15) shrp i0=r15,r14,56 }
+{ .mii; (p15) shrp i1=r16,r15,56
+ (p14) shrp i0=r15,r14,48 }
+{ .mii; (p14) shrp i1=r16,r15,48
+ (p13) shrp i0=r15,r14,40 }
+{ .mii; (p13) shrp i1=r16,r15,40
+ (p12) shrp i0=r15,r14,32 }
+{ .mii; (p12) shrp i1=r16,r15,32
+ (p11) shrp i0=r15,r14,24 }
+{ .mii; (p11) shrp i1=r16,r15,24
+ (p10) shrp i0=r15,r14,16 }
+{ .mii; (p10) shrp i1=r16,r15,16
+ (p9) shrp i0=r15,r14,8 }
+{ .mii; (p9) shrp i1=r16,r15,8
+ mov r14=r16 };;
+
+.Loop:
+ .pred.rel "mutex",p8,p9,p10,p11,p12,p13,p14,p15
+{ .mmi; add h0=h0,i0
+ add h1=h1,i1
+ add h2=h2,r35 };;
+{ .mmi; setf.sig HF0=h0
+ cmp.ltu p6,p0=h0,i0
+ cmp.ltu p7,p0=h1,i1 };;
+{ .mmi; (p6) add h1=1,h1;;
+ setf.sig HF1=h1
+ (p6) cmp.eq.or p7,p0=0,h1 };;
+{ .mmi; (p7) add h2=1,h2;;
+ setf.sig HF2=h2 };;
+
+{ .mfi; (p16) ld8 r15=[r10],16
+ xmpy.lu f32=HF0,RF0 }
+{ .mfi; (p16) ld8 r16=[r11],16
+ xmpy.hu f33=HF0,RF0 }
+{ .mfi; xmpy.lu f36=HF0,RF1 }
+{ .mfi; xmpy.hu f37=HF0,RF1 };;
+{ .mfi; xmpy.lu f34=HF1,SF1
+ (p15) shrp i0=r15,r14,56 }
+{ .mfi; xmpy.hu f35=HF1,SF1 }
+{ .mfi; xmpy.lu f38=HF1,RF0
+ (p15) shrp i1=r16,r15,56 }
+{ .mfi; xmpy.hu f39=HF1,RF0 }
+{ .mfi; xmpy.lu f40=HF2,SF1
+ (p14) shrp i0=r15,r14,48 }
+{ .mfi; xmpy.lu f41=HF2,RF0 };;
+
+{ .mmi; getf.sig r22=f32
+ getf.sig r23=f33
+ (p14) shrp i1=r16,r15,48 }
+{ .mmi; getf.sig r24=f34
+ getf.sig r25=f35
+ (p13) shrp i0=r15,r14,40 }
+{ .mmi; getf.sig r26=f36
+ getf.sig r27=f37
+ (p13) shrp i1=r16,r15,40 }
+{ .mmi; getf.sig r28=f38
+ getf.sig r29=f39
+ (p12) shrp i0=r15,r14,32 }
+{ .mmi; getf.sig r30=f40
+ getf.sig r31=f41 };;
+
+{ .mmi; add h0=r22,r24
+ add r23=r23,r25
+ (p12) shrp i1=r16,r15,32 }
+{ .mmi; add h1=r26,r28
+ add r27=r27,r29
+ (p11) shrp i0=r15,r14,24 };;
+{ .mmi; cmp.ltu p6,p0=h0,r24
+ cmp.ltu p7,p0=h1,r28
+ add r23=r23,r30 };;
+{ .mmi; (p6) add r23=1,r23
+ (p7) add r27=1,r27
+ (p11) shrp i1=r16,r15,24 };;
+{ .mmi; add h1=h1,r23;;
+ cmp.ltu p6,p7=h1,r23
+ (p10) shrp i0=r15,r14,16 };;
+{ .mmi; (p6) add h2=r31,r27,1
+ (p7) add h2=r31,r27
+ (p10) shrp i1=r16,r15,16 };;
+
+{ .mmi; (p8) mov i0=r15
+ and r22=-4,h2
+ shr.u r23=h2,2 };;
+{ .mmi; add r22=r22,r23
+ and h2=3,h2
+ (p9) shrp i0=r15,r14,8 };;
+
+{ .mmi; add h0=h0,r22;;
+ cmp.ltu p6,p0=h0,r22
+ (p9) shrp i1=r16,r15,8 };;
+{ .mmi; (p8) mov i1=r16
+ (p6) cmp.eq.unc p7,p0=-1,h1
+ (p6) add h1=1,h1 };;
+{ .mmb; (p7) add h2=1,h2
+ mov r14=r16
+ br.ctop.sptk .Loop };;
+.Lcend:
+
+{ .mii; SUM 1<<1 // back to big-endian
+ mov ar.lc=r3 };;
+
+{ .mmi; st8 [r8]=h0,16
+ st8 [r9]=h1
+ mov pr=r36,0x1ffff };;
+{ .mmb; st8 [r8]=h2
+ rum 1<<5
+ br.ret.sptk b0 };;
+.endp poly1305_blocks#
+
+.global poly1305_emit#
+.proc poly1305_emit#
+.align 64
+poly1305_emit:
+ .prologue
+ .save ar.pfs,r2
+{ .mmi; alloc r2=ar.pfs,3,0,0,0
+ ADDP r8=0,r32
+ ADDP r9=8,r32 };;
+
+ .body
+{ .mmi; ld8 r16=[r8],16 // load hash
+ ld8 r17=[r9]
+ ADDP r10=0,r34 };;
+{ .mmi; ld8 r18=[r8]
+ ld4 r24=[r10],8 // load nonce
+ ADDP r11=4,r34 };;
+
+{ .mmi; ld4 r25=[r11],8
+ ld4 r26=[r10]
+ add r20=5,r16 };;
+
+{ .mmi; ld4 r27=[r11]
+ cmp.ltu p6,p7=r20,r16
+ shl r25=r25,32 };;
+{ .mmi;
+(p6) add r21=1,r17
+(p7) add r21=0,r17
+(p6) cmp.eq.or.andcm p6,p7=-1,r17 };;
+{ .mmi;
+(p6) add r22=1,r18
+(p7) add r22=0,r18
+ shl r27=r27,32 };;
+{ .mmi; or r24=r24,r25
+ or r26=r26,r27
+ cmp.leu p6,p7=4,r22 };;
+{ .mmi;
+(p6) add r16=r20,r24
+(p7) add r16=r16,r24
+(p6) add r17=r21,r26 };;
+{ .mii;
+(p7) add r17=r17,r26
+ cmp.ltu p6,p7=r16,r24;;
+(p6) add r17=1,r17 };;
+
+{ .mmi; ADDP r8=0,r33
+ ADDP r9=4,r33
+ shr.u r20=r16,32 }
+{ .mmi; ADDP r10=8,r33
+ ADDP r11=12,r33
+ shr.u r21=r17,32 };;
+
+{ .mmi; st1 [r8]=r16,1 // write mac, little-endian
+ st1 [r9]=r20,1
+ shr.u r16=r16,8 }
+{ .mii; st1 [r10]=r17,1
+ shr.u r20=r20,8
+ shr.u r17=r17,8 }
+{ .mmi; st1 [r11]=r21,1
+ shr.u r21=r21,8 };;
+
+{ .mmi; st1 [r8]=r16,1
+ st1 [r9]=r20,1
+ shr.u r16=r16,8 }
+{ .mii; st1 [r10]=r17,1
+ shr.u r20=r20,8
+ shr.u r17=r17,8 }
+{ .mmi; st1 [r11]=r21,1
+ shr.u r21=r21,8 };;
+
+{ .mmi; st1 [r8]=r16,1
+ st1 [r9]=r20,1
+ shr.u r16=r16,8 }
+{ .mii; st1 [r10]=r17,1
+ shr.u r20=r20,8
+ shr.u r17=r17,8 }
+{ .mmi; st1 [r11]=r21,1
+ shr.u r21=r21,8 };;
+
+{ .mmi; st1 [r8]=r16
+ st1 [r9]=r20 }
+{ .mmb; st1 [r10]=r17
+ st1 [r11]=r21
+ br.ret.sptk b0 };;
+.endp poly1305_emit#
+
+stringz "Poly1305 for IA64, CRYPTOGAMS by \@dot-asm"
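
Editor's note: the IA64 loop above (the "and r22=-4,h2; shr.u r23=h2,2; add r22=r22,r23" sequence) and the PPC __poly1305_mul further down use the same partial reduction: the hash is held as h0 + h1*2^64 + h2*2^128, and everything at bit 130 and above is folded back as a multiple of 5, because 2^130 is congruent to 5 mod 2^130 - 5. A hedged C sketch, not part of the patch, assuming a compiler with unsigned __int128:

#include <stdint.h>

/* h = h0 + h1*2^64 + h2*2^128; fold bits >= 130 back as 5*q,
 * where q = h >> 130, since 2^130 == 5 (mod 2^130 - 5). */
static void poly1305_fold(uint64_t *h0, uint64_t *h1, uint64_t *h2)
{
    uint64_t c = *h2 & ~(uint64_t)3;               /* 4*q                   */
    unsigned __int128 t;

    *h2 &= 3;                                      /* keep bits 128..129    */
    t    = (unsigned __int128)*h0 + c + (c >> 2);  /* h0 += 5*q = 4q + q    */
    *h0  = (uint64_t)t;
    t    = (t >> 64) + *h1;                        /* propagate the carries */
    *h1  = (uint64_t)t;
    *h2 += (uint64_t)(t >> 64);
}
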
diff --git a/crypto/poly1305/asm/poly1305-mips.pl b/crypto/poly1305/asm/poly1305-mips.pl
index 965825dc3eda..6c0b3292d07c 100755
--- a/crypto/poly1305/asm/poly1305-mips.pl
+++ b/crypto/poly1305/asm/poly1305-mips.pl
@@ -1,7 +1,7 @@
#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
-# Licensed under the OpenSSL license (the "License"). You may not use
+# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
@@ -56,7 +56,11 @@
#
######################################################################
-$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+# supported flavours are o32,n32,64,nubi32,nubi64, default is o32
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : "o32";
die "MIPS64 only" unless ($flavour =~ /64|n32/i);
@@ -431,7 +435,7 @@ poly1305_emit:
___
}
-$output=pop and open STDOUT,">$output";
+$output and open STDOUT,">$output";
print $code;
close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/poly1305/asm/poly1305-ppc.pl b/crypto/poly1305/asm/poly1305-ppc.pl
index e5d6933ac4d5..9f9b27cac336 100755
--- a/crypto/poly1305/asm/poly1305-ppc.pl
+++ b/crypto/poly1305/asm/poly1305-ppc.pl
@@ -1,17 +1,17 @@
#! /usr/bin/env perl
-# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2016-2024 The OpenSSL Project Authors. All Rights Reserved.
#
-# Licensed under the OpenSSL license (the "License"). You may not use
+# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
+# Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL
+# project. The module is dual licensed under OpenSSL and CRYPTOGAMS
+# licenses depending on where you obtain it. For further details see
+# https://github.com/dot-asm/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for PowerPC.
@@ -44,8 +44,18 @@
#
# On side note, Power ISA 2.07 enables vector base 2^26 implementation,
# and POWER8 might have capacity to break 1.0 cycle per byte barrier...
+#
+# January 2019
+#
+# ... Unfortunately not:-( Estimate was a projection of ARM result,
+# but ARM has vector multiply-n-add instruction, while PowerISA does
+# not, not one usable in the context. Improvement is ~40% over -m64
+# result above and is ~1.43 on little-endian systems.
-$flavour = shift;
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
if ($flavour =~ /64/) {
$SIZE_T =8;
@@ -72,7 +82,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+open STDOUT,"| $^X $xlate $flavour \"$output\""
+ or die "can't call $xlate: $!";
$FRAME=24*$SIZE_T;
@@ -99,6 +110,7 @@ $code.=<<___;
std r0,0($ctx) # zero hash value
std r0,8($ctx)
std r0,16($ctx)
+ stw r0,24($ctx) # clear is_base2_26
$UCMP $inp,r0
beq- Lno_key
@@ -140,6 +152,7 @@ Lno_key:
.globl .poly1305_blocks
.align 4
.poly1305_blocks:
+Lpoly1305_blocks:
srdi. $len,$len,4
beq- Labort
@@ -238,60 +251,120 @@ Labort:
.long 0
.byte 0,12,4,1,0x80,5,4,0
.size .poly1305_blocks,.-.poly1305_blocks
+___
+{
+my ($h0,$h1,$h2,$h3,$h4,$t0) = map("r$_",(7..12));
+$code.=<<___;
.globl .poly1305_emit
-.align 4
+.align 5
.poly1305_emit:
- ld $h0,0($ctx) # load hash
- ld $h1,8($ctx)
- ld $h2,16($ctx)
- ld $padbit,0($nonce) # load nonce
- ld $nonce,8($nonce)
-
- addic $d0,$h0,5 # compare to modulus
- addze $d1,$h1
- addze $d2,$h2
-
- srdi $mask,$d2,2 # did it carry/borrow?
- neg $mask,$mask
+ lwz $h0,0($ctx) # load hash value base 2^26
+ lwz $h1,4($ctx)
+ lwz $h2,8($ctx)
+ lwz $h3,12($ctx)
+ lwz $h4,16($ctx)
+ lwz r0,24($ctx) # is_base2_26
+
+ sldi $h1,$h1,26 # base 2^26 -> base 2^64
+ sldi $t0,$h2,52
+ srdi $h2,$h2,12
+ sldi $h3,$h3,14
+ add $h0,$h0,$h1
+ addc $h0,$h0,$t0
+ sldi $t0,$h4,40
+ srdi $h4,$h4,24
+ adde $h1,$h2,$h3
+ addc $h1,$h1,$t0
+ addze $h2,$h4
+
+ ld $h3,0($ctx) # load hash value base 2^64
+ ld $h4,8($ctx)
+ ld $t0,16($ctx)
+
+ neg r0,r0
+ xor $h0,$h0,$h3 # choose between radixes
+ xor $h1,$h1,$h4
+ xor $h2,$h2,$t0
+ and $h0,$h0,r0
+ and $h1,$h1,r0
+ and $h2,$h2,r0
+ xor $h0,$h0,$h3
+ xor $h1,$h1,$h4
+ xor $h2,$h2,$t0
+
+ addic $h3,$h0,5 # compare to modulus
+ addze $h4,$h1
+ addze $t0,$h2
+
+ srdi $t0,$t0,2 # see if it carried/borrowed
+ neg $t0,$t0
+
+ andc $h0,$h0,$t0
+ and $h3,$h3,$t0
+ andc $h1,$h1,$t0
+ and $h4,$h4,$t0
+ or $h0,$h0,$h3
+ or $h1,$h1,$h4
+
+ lwz $t0,4($nonce)
+ lwz $h2,12($nonce)
+ lwz $h3,0($nonce)
+ lwz $h4,8($nonce)
+
+ insrdi $h3,$t0,32,0
+ insrdi $h4,$h2,32,0
+
+ addc $h0,$h0,$h3 # accumulate nonce
+ adde $h1,$h1,$h4
+
+ addi $ctx,$mac,-1
+ addi $mac,$mac,7
+
+ stbu $h0,1($ctx) # write [little-endian] result
+ srdi $h0,$h0,8
+ stbu $h1,1($mac)
+ srdi $h1,$h1,8
+
+ stbu $h0,1($ctx)
+ srdi $h0,$h0,8
+ stbu $h1,1($mac)
+ srdi $h1,$h1,8
+
+ stbu $h0,1($ctx)
+ srdi $h0,$h0,8
+ stbu $h1,1($mac)
+ srdi $h1,$h1,8
+
+ stbu $h0,1($ctx)
+ srdi $h0,$h0,8
+ stbu $h1,1($mac)
+ srdi $h1,$h1,8
+
+ stbu $h0,1($ctx)
+ srdi $h0,$h0,8
+ stbu $h1,1($mac)
+ srdi $h1,$h1,8
+
+ stbu $h0,1($ctx)
+ srdi $h0,$h0,8
+ stbu $h1,1($mac)
+ srdi $h1,$h1,8
+
+ stbu $h0,1($ctx)
+ srdi $h0,$h0,8
+ stbu $h1,1($mac)
+ srdi $h1,$h1,8
+
+ stbu $h0,1($ctx)
+ stbu $h1,1($mac)
- andc $h0,$h0,$mask
- and $d0,$d0,$mask
- andc $h1,$h1,$mask
- and $d1,$d1,$mask
- or $h0,$h0,$d0
- or $h1,$h1,$d1
-___
-$code.=<<___ if (!$LITTLE_ENDIAN);
- rotldi $padbit,$padbit,32 # flip nonce words
- rotldi $nonce,$nonce,32
-___
-$code.=<<___;
- addc $h0,$h0,$padbit # accumulate nonce
- adde $h1,$h1,$nonce
-___
-$code.=<<___ if ($LITTLE_ENDIAN);
- std $h0,0($mac) # write result
- std $h1,8($mac)
-___
-$code.=<<___ if (!$LITTLE_ENDIAN);
- extrdi r0,$h0,32,0
- li $d0,4
- stwbrx $h0,0,$mac # write result
- extrdi $h0,$h1,32,0
- li $d1,8
- stwbrx r0,$d0,$mac
- li $d2,12
- stwbrx $h1,$d1,$mac
- stwbrx $h0,$d2,$mac
-___
-$code.=<<___;
blr
.long 0
.byte 0,12,0x14,0,0,0,3,0
.size .poly1305_emit,.-.poly1305_emit
___
- } else {
+} } else {
###############################################################################
# base 2^32 implementation
@@ -309,6 +382,7 @@ $code.=<<___;
stw r0,8($ctx)
stw r0,12($ctx)
stw r0,16($ctx)
+ stw r0,24($ctx) # clear is_base2_26
$UCMP $inp,r0
beq- Lno_key
@@ -353,6 +427,7 @@ Lno_key:
.globl .poly1305_blocks
.align 4
.poly1305_blocks:
+Lpoly1305_blocks:
srwi. $len,$len,4
beq- Labort
@@ -560,17 +635,389 @@ Labort:
.long 0
.byte 0,12,4,1,0x80,18,4,0
.size .poly1305_blocks,.-.poly1305_blocks
+___
+{
+my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(6..12));
+$code.=<<___;
.globl .poly1305_emit
-.align 4
+.align 5
.poly1305_emit:
- $STU $sp,-$FRAME($sp)
+ lwz r0,24($ctx) # is_base2_26
+ lwz $h0,0($ctx) # load hash value
+ lwz $h1,4($ctx)
+ lwz $h2,8($ctx)
+ lwz $h3,12($ctx)
+ lwz $h4,16($ctx)
+ cmplwi r0,0
+ beq Lemit_base2_32
+
+ slwi $t0,$h1,26 # base 2^26 -> base 2^32
+ srwi $h1,$h1,6
+ slwi $t1,$h2,20
+ srwi $h2,$h2,12
+ addc $h0,$h0,$t0
+ slwi $t0,$h3,14
+ srwi $h3,$h3,18
+ adde $h1,$h1,$t1
+ slwi $t1,$h4,8
+ srwi $h4,$h4,24
+ adde $h2,$h2,$t0
+ adde $h3,$h3,$t1
+ addze $h4,$h4
+
+Lemit_base2_32:
+ addic r0,$h0,5 # compare to modulus
+ addze r0,$h1
+ addze r0,$h2
+ addze r0,$h3
+ addze r0,$h4
+
+ srwi r0,r0,2 # see if it carried/borrowed
+ neg r0,r0
+ andi. r0,r0,5
+
+ addc $h0,$h0,r0
+ lwz r0,0($nonce)
+ addze $h1,$h1
+ lwz $t0,4($nonce)
+ addze $h2,$h2
+ lwz $t1,8($nonce)
+ addze $h3,$h3
+ lwz $h4,12($nonce)
+
+ addc $h0,$h0,r0 # accumulate nonce
+ adde $h1,$h1,$t0
+ adde $h2,$h2,$t1
+ adde $h3,$h3,$h4
+
+ addi $ctx,$mac,-1
+ addi $mac,$mac,7
+
+ stbu $h0,1($ctx) # write [little-endian] result
+ srwi $h0,$h0,8
+ stbu $h2,1($mac)
+ srwi $h2,$h2,8
+
+ stbu $h0,1($ctx)
+ srwi $h0,$h0,8
+ stbu $h2,1($mac)
+ srwi $h2,$h2,8
+
+ stbu $h0,1($ctx)
+ srwi $h0,$h0,8
+ stbu $h2,1($mac)
+ srwi $h2,$h2,8
+
+ stbu $h0,1($ctx)
+ stbu $h2,1($mac)
+
+ stbu $h1,1($ctx)
+ srwi $h1,$h1,8
+ stbu $h3,1($mac)
+ srwi $h3,$h3,8
+
+ stbu $h1,1($ctx)
+ srwi $h1,$h1,8
+ stbu $h3,1($mac)
+ srwi $h3,$h3,8
+
+ stbu $h1,1($ctx)
+ srwi $h1,$h1,8
+ stbu $h3,1($mac)
+ srwi $h3,$h3,8
+
+ stbu $h1,1($ctx)
+ stbu $h3,1($mac)
+
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,3,0
+.size .poly1305_emit,.-.poly1305_emit
+___
+} }
+{{{
+########################################################################
+# PowerISA 2.07/VSX section #
+########################################################################
+
+my $LOCALS= 6*$SIZE_T;
+my $VSXFRAME = $LOCALS + 6*$SIZE_T;
+ $VSXFRAME += 128; # local variables
+ $VSXFRAME += 12*16; # v20-v31 offload
+
+my $BIG_ENDIAN = ($flavour !~ /le/) ? 4 : 0;
+
+########################################################################
+# Layout of opaque area is following:
+#
+# unsigned __int32 h[5]; # current hash value base 2^26
+# unsigned __int32 pad;
+# unsigned __int32 is_base2_26, pad;
+# unsigned __int64 r[2]; # key value base 2^64
+# struct { unsigned __int32 r^2, r^4, r^1, r^3; } r[9];
+#
+# where r^n are base 2^26 digits of powers of multiplier key. There are
+# 5 digits, but last four are interleaved with multiples of 5, totalling
+# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. Order of
+# powers is as they appear in register, not memory.
+
+my ($H0, $H1, $H2, $H3, $H4) = map("v$_",(0..4));
+my ($I0, $I1, $I2, $I3, $I4) = map("v$_",(5..9));
+my ($R0, $R1, $S1, $R2, $S2) = map("v$_",(10..14));
+my ($R3, $S3, $R4, $S4) = ($R1, $S1, $R2, $S2);
+my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("v$_",(15..19));
+my ($T0, $T1, $T2, $T3, $T4) = map("v$_",(20..24));
+my ($_26,$_4,$_40,$_14,$mask26,$padbits,$I2perm) = map("v$_",(25..31));
+my ($x00,$x60,$x70,$x10,$x20,$x30,$x40,$x50) = (0, map("r$_",(7,8,27..31)));
+my ($ctx_,$_ctx,$const) = map("r$_",(10..12));
+
+ if ($flavour =~ /64/) {
+###############################################################################
+# setup phase of poly1305_blocks_vsx is different on 32- and 64-bit platforms,
+# but the base 2^26 computational part is same...
+
+my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(6..11,27..31));
+my $mask = "r0";
+
+$code.=<<___;
+.globl .poly1305_blocks_vsx
+.align 5
+.poly1305_blocks_vsx:
+ lwz r7,24($ctx) # is_base2_26
+ cmpldi $len,128
+ bge __poly1305_blocks_vsx
+
+ neg r0,r7 # is_base2_26 as mask
+ lwz r7,0($ctx) # load hash base 2^26
+ lwz r8,4($ctx)
+ lwz r9,8($ctx)
+ lwz r10,12($ctx)
+ lwz r11,16($ctx)
+
+ sldi r8,r8,26 # base 2^26 -> base 2^64
+ sldi r12,r9,52
+ add r7,r7,r8
+ srdi r9,r9,12
+ sldi r10,r10,14
+ addc r7,r7,r12
+ sldi r8,r11,40
+ adde r9,r9,r10
+ srdi r11,r11,24
+ addc r9,r9,r8
+ addze r11,r11
+
+ ld r8,0($ctx) # load hash base 2^64
+ ld r10,8($ctx)
+ ld r12,16($ctx)
+
+ xor r7,r7,r8 # select between radixes
+ xor r9,r9,r10
+ xor r11,r11,r12
+ and r7,r7,r0
+ and r9,r9,r0
+ and r11,r11,r0
+ xor r7,r7,r8
+ xor r9,r9,r10
+ xor r11,r11,r12
+
+ li r0,0
+ std r7,0($ctx) # store hash base 2^64
+ std r9,8($ctx)
+ std r11,16($ctx)
+ stw r0,24($ctx) # clear is_base2_26
+
+ b Lpoly1305_blocks
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+.size .poly1305_blocks_vsx,.-.poly1305_blocks_vsx
+
+.align 5
+__poly1305_mul:
+ mulld $d0,$h0,$r0 # h0*r0
+ mulhdu $d1,$h0,$r0
+
+ mulld $t0,$h1,$s1 # h1*5*r1
+ mulhdu $t1,$h1,$s1
+ addc $d0,$d0,$t0
+ adde $d1,$d1,$t1
+
+ mulld $t0,$h0,$r1 # h0*r1
+ mulhdu $d2,$h0,$r1
+ addc $d1,$d1,$t0
+ addze $d2,$d2
+
+ mulld $t0,$h1,$r0 # h1*r0
+ mulhdu $t1,$h1,$r0
+ addc $d1,$d1,$t0
+ adde $d2,$d2,$t1
+
+ mulld $t0,$h2,$s1 # h2*5*r1
+ mulld $t1,$h2,$r0 # h2*r0
+ addc $d1,$d1,$t0
+ adde $d2,$d2,$t1
+
+ andc $t0,$d2,$mask # final reduction step
+ and $h2,$d2,$mask
+ srdi $t1,$t0,2
+ add $t0,$t0,$t1
+ addc $h0,$d0,$t0
+ addze $h1,$d1
+ addze $h2,$h2
+
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+.size __poly1305_mul,.-__poly1305_mul
+
+.align 5
+__poly1305_splat:
+ extrdi $d0,$h0,26,38
+ extrdi $d1,$h0,26,12
+ stw $d0,0x00($t1)
+
+ extrdi $d2,$h0,12,0
+ slwi $d0,$d1,2
+ stw $d1,0x10($t1)
+ add $d0,$d0,$d1 # * 5
+ stw $d0,0x20($t1)
+
+ insrdi $d2,$h1,14,38
+ slwi $d0,$d2,2
+ stw $d2,0x30($t1)
+ add $d0,$d0,$d2 # * 5
+ stw $d0,0x40($t1)
+
+ extrdi $d1,$h1,26,24
+ extrdi $d2,$h1,24,0
+ slwi $d0,$d1,2
+ stw $d1,0x50($t1)
+ add $d0,$d0,$d1 # * 5
+ stw $d0,0x60($t1)
+
+ insrdi $d2,$h2,3,37
+ slwi $d0,$d2,2
+ stw $d2,0x70($t1)
+ add $d0,$d0,$d2 # * 5
+ stw $d0,0x80($t1)
+
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+.size __poly1305_splat,.-__poly1305_splat
+
+.align 5
+__poly1305_blocks_vsx:
+ $STU $sp,-$VSXFRAME($sp)
mflr r0
- $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
- $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
- $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
- $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
- $PUSH r0,`$FRAME+$LRSAVE`($sp)
+ li r10,`15+$LOCALS+128`
+ li r11,`31+$LOCALS+128`
+ mfspr r12,256
+ stvx v20,r10,$sp
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ stw r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# save vrsave
+ li r12,-1
+ mtspr 256,r12 # preserve all AltiVec registers
+ $PUSH r27,`$VSXFRAME-$SIZE_T*5`($sp)
+ $PUSH r28,`$VSXFRAME-$SIZE_T*4`($sp)
+ $PUSH r29,`$VSXFRAME-$SIZE_T*3`($sp)
+ $PUSH r30,`$VSXFRAME-$SIZE_T*2`($sp)
+ $PUSH r31,`$VSXFRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$VSXFRAME+$LRSAVE`($sp)
+
+ bl LPICmeup
+
+ li $x10,0x10
+ li $x20,0x20
+ li $x30,0x30
+ li $x40,0x40
+ li $x50,0x50
+ lvx_u $mask26,$x00,$const
+ lvx_u $_26,$x10,$const
+ lvx_u $_40,$x20,$const
+ lvx_u $I2perm,$x30,$const
+ lvx_u $padbits,$x40,$const
+
+ cmplwi r7,0 # is_base2_26?
+ bne Lskip_init_vsx
+
+ ld $r0,32($ctx) # load key base 2^64
+ ld $r1,40($ctx)
+ srdi $s1,$r1,2
+ li $mask,3
+ add $s1,$s1,$r1 # s1 = r1 + r1>>2
+
+ mr $h0,$r0 # "calculate" r^1
+ mr $h1,$r1
+ li $h2,0
+ addi $t1,$ctx,`48+(12^$BIG_ENDIAN)`
+ bl __poly1305_splat
+
+ bl __poly1305_mul # calculate r^2
+ addi $t1,$ctx,`48+(4^$BIG_ENDIAN)`
+ bl __poly1305_splat
+
+ bl __poly1305_mul # calculate r^3
+ addi $t1,$ctx,`48+(8^$BIG_ENDIAN)`
+ bl __poly1305_splat
+
+ bl __poly1305_mul # calculate r^4
+ addi $t1,$ctx,`48+(0^$BIG_ENDIAN)`
+ bl __poly1305_splat
+
+ ld $h0,0($ctx) # load hash
+ ld $h1,8($ctx)
+ ld $h2,16($ctx)
+
+ extrdi $d0,$h0,26,38 # base 2^64 -> base 2^26
+ extrdi $d1,$h0,26,12
+ extrdi $d2,$h0,12,0
+ mtvrwz $H0,$d0
+ insrdi $d2,$h1,14,38
+ mtvrwz $H1,$d1
+ extrdi $d1,$h1,26,24
+ mtvrwz $H2,$d2
+ extrdi $d2,$h1,24,0
+ mtvrwz $H3,$d1
+ insrdi $d2,$h2,3,37
+ mtvrwz $H4,$d2
+___
+ } else {
+###############################################################################
+# 32-bit initialization
+
+my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(7..11,0,12));
+my ($R3,$S3,$R4,$S4)=($I1,$I2,$I3,$I4);
+
+$code.=<<___;
+.globl .poly1305_blocks_vsx
+.align 5
+.poly1305_blocks_vsx:
+ lwz r7,24($ctx) # is_base2_26
+ cmplwi $len,128
+ bge __poly1305_blocks_vsx
+ cmplwi r7,0
+ beq Lpoly1305_blocks
lwz $h0,0($ctx) # load hash
lwz $h1,4($ctx)
@@ -578,68 +1025,957 @@ Labort:
lwz $h3,12($ctx)
lwz $h4,16($ctx)
- addic $d0,$h0,5 # compare to modulus
- addze $d1,$h1
- addze $d2,$h2
- addze $d3,$h3
- addze $mask,$h4
+ slwi $t0,$h1,26 # base 2^26 -> base 2^32
+ srwi $h1,$h1,6
+ slwi $t1,$h2,20
+ srwi $h2,$h2,12
+ addc $h0,$h0,$t0
+ slwi $t0,$h3,14
+ srwi $h3,$h3,18
+ adde $h1,$h1,$t1
+ slwi $t1,$h4,8
+ srwi $h4,$h4,24
+ adde $h2,$h2,$t0
+ li $t0,0
+ adde $h3,$h3,$t1
+ addze $h4,$h4
- srwi $mask,$mask,2 # did it carry/borrow?
- neg $mask,$mask
+ stw $h0,0($ctx) # store hash base 2^32
+ stw $h1,4($ctx)
+ stw $h2,8($ctx)
+ stw $h3,12($ctx)
+ stw $h4,16($ctx)
+ stw $t0,24($ctx) # clear is_base2_26
- andc $h0,$h0,$mask
- and $d0,$d0,$mask
- andc $h1,$h1,$mask
- and $d1,$d1,$mask
- or $h0,$h0,$d0
- lwz $d0,0($nonce) # load nonce
- andc $h2,$h2,$mask
- and $d2,$d2,$mask
- or $h1,$h1,$d1
- lwz $d1,4($nonce)
- andc $h3,$h3,$mask
- and $d3,$d3,$mask
- or $h2,$h2,$d2
- lwz $d2,8($nonce)
- or $h3,$h3,$d3
- lwz $d3,12($nonce)
-
- addc $h0,$h0,$d0 # accumulate nonce
- adde $h1,$h1,$d1
- adde $h2,$h2,$d2
- adde $h3,$h3,$d3
-___
-$code.=<<___ if ($LITTLE_ENDIAN);
- stw $h0,0($mac) # write result
- stw $h1,4($mac)
- stw $h2,8($mac)
- stw $h3,12($mac)
-___
-$code.=<<___ if (!$LITTLE_ENDIAN);
- li $d1,4
- stwbrx $h0,0,$mac # write result
- li $d2,8
- stwbrx $h1,$d1,$mac
- li $d3,12
- stwbrx $h2,$d2,$mac
- stwbrx $h3,$d3,$mac
+ b Lpoly1305_blocks
+ .long 0
+ .byte 0,12,0x14,0,0,0,4,0
+.size .poly1305_blocks_vsx,.-.poly1305_blocks_vsx
+
+.align 5
+__poly1305_mul:
+ vmulouw $ACC0,$H0,$R0
+ vmulouw $ACC1,$H1,$R0
+ vmulouw $ACC2,$H2,$R0
+ vmulouw $ACC3,$H3,$R0
+ vmulouw $ACC4,$H4,$R0
+
+ vmulouw $T0,$H4,$S1
+ vaddudm $ACC0,$ACC0,$T0
+ vmulouw $T0,$H0,$R1
+ vaddudm $ACC1,$ACC1,$T0
+ vmulouw $T0,$H1,$R1
+ vaddudm $ACC2,$ACC2,$T0
+ vmulouw $T0,$H2,$R1
+ vaddudm $ACC3,$ACC3,$T0
+ vmulouw $T0,$H3,$R1
+ vaddudm $ACC4,$ACC4,$T0
+
+ vmulouw $T0,$H3,$S2
+ vaddudm $ACC0,$ACC0,$T0
+ vmulouw $T0,$H4,$S2
+ vaddudm $ACC1,$ACC1,$T0
+ vmulouw $T0,$H0,$R2
+ vaddudm $ACC2,$ACC2,$T0
+ vmulouw $T0,$H1,$R2
+ vaddudm $ACC3,$ACC3,$T0
+ vmulouw $T0,$H2,$R2
+ vaddudm $ACC4,$ACC4,$T0
+
+ vmulouw $T0,$H2,$S3
+ vaddudm $ACC0,$ACC0,$T0
+ vmulouw $T0,$H3,$S3
+ vaddudm $ACC1,$ACC1,$T0
+ vmulouw $T0,$H4,$S3
+ vaddudm $ACC2,$ACC2,$T0
+ vmulouw $T0,$H0,$R3
+ vaddudm $ACC3,$ACC3,$T0
+ vmulouw $T0,$H1,$R3
+ vaddudm $ACC4,$ACC4,$T0
+
+ vmulouw $T0,$H1,$S4
+ vaddudm $ACC0,$ACC0,$T0
+ vmulouw $T0,$H2,$S4
+ vaddudm $ACC1,$ACC1,$T0
+ vmulouw $T0,$H3,$S4
+ vaddudm $ACC2,$ACC2,$T0
+ vmulouw $T0,$H4,$S4
+ vaddudm $ACC3,$ACC3,$T0
+ vmulouw $T0,$H0,$R4
+ vaddudm $ACC4,$ACC4,$T0
+
+ ################################################################
+ # lazy reduction
+
+ vspltisb $T0,2
+ vsrd $H4,$ACC3,$_26
+ vsrd $H1,$ACC0,$_26
+ vand $H3,$ACC3,$mask26
+ vand $H0,$ACC0,$mask26
+ vaddudm $H4,$H4,$ACC4 # h3 -> h4
+ vaddudm $H1,$H1,$ACC1 # h0 -> h1
+
+ vsrd $ACC4,$H4,$_26
+ vsrd $ACC1,$H1,$_26
+ vand $H4,$H4,$mask26
+ vand $H1,$H1,$mask26
+ vaddudm $H0,$H0,$ACC4
+ vaddudm $H2,$ACC2,$ACC1 # h1 -> h2
+
+ vsld $ACC4,$ACC4,$T0 # <<2
+ vsrd $ACC2,$H2,$_26
+ vand $H2,$H2,$mask26
+ vaddudm $H0,$H0,$ACC4 # h4 -> h0
+ vaddudm $H3,$H3,$ACC2 # h2 -> h3
+
+ vsrd $ACC0,$H0,$_26
+ vsrd $ACC3,$H3,$_26
+ vand $H0,$H0,$mask26
+ vand $H3,$H3,$mask26
+ vaddudm $H1,$H1,$ACC0 # h0 -> h1
+ vaddudm $H4,$H4,$ACC3 # h3 -> h4
+
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+.size __poly1305_mul,.-__poly1305_mul
+
+.align 5
+__poly1305_blocks_vsx:
+ $STU $sp,-$VSXFRAME($sp)
+ mflr r0
+ li r10,`15+$LOCALS+128`
+ li r11,`31+$LOCALS+128`
+ mfspr r12,256
+ stvx v20,r10,$sp
+ addi r10,r10,32
+ stvx v21,r11,$sp
+ addi r11,r11,32
+ stvx v22,r10,$sp
+ addi r10,r10,32
+ stvx v23,r11,$sp
+ addi r11,r11,32
+ stvx v24,r10,$sp
+ addi r10,r10,32
+ stvx v25,r11,$sp
+ addi r11,r11,32
+ stvx v26,r10,$sp
+ addi r10,r10,32
+ stvx v27,r11,$sp
+ addi r11,r11,32
+ stvx v28,r10,$sp
+ addi r10,r10,32
+ stvx v29,r11,$sp
+ addi r11,r11,32
+ stvx v30,r10,$sp
+ stvx v31,r11,$sp
+ stw r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# save vrsave
+ li r12,-1
+ mtspr 256,r12 # preserve all AltiVec registers
+ $PUSH r27,`$VSXFRAME-$SIZE_T*5`($sp)
+ $PUSH r28,`$VSXFRAME-$SIZE_T*4`($sp)
+ $PUSH r29,`$VSXFRAME-$SIZE_T*3`($sp)
+ $PUSH r30,`$VSXFRAME-$SIZE_T*2`($sp)
+ $PUSH r31,`$VSXFRAME-$SIZE_T*1`($sp)
+ $PUSH r0,`$VSXFRAME+$LRSAVE`($sp)
+
+ bl LPICmeup
+
+ li $x10,0x10
+ li $x20,0x20
+ li $x30,0x30
+ li $x40,0x40
+ li $x50,0x50
+ lvx_u $mask26,$x00,$const
+ lvx_u $_26,$x10,$const
+ lvx_u $_40,$x20,$const
+ lvx_u $I2perm,$x30,$const
+ lvx_u $padbits,$x40,$const
+
+ cmplwi r7,0 # is_base2_26?
+ bne Lskip_init_vsx
+
+ lwz $h1,32($ctx) # load key base 2^32
+ lwz $h2,36($ctx)
+ lwz $h3,40($ctx)
+ lwz $h4,44($ctx)
+
+ extrwi $h0,$h1,26,6 # base 2^32 -> base 2^26
+ extrwi $h1,$h1,6,0
+ insrwi $h1,$h2,20,6
+ extrwi $h2,$h2,12,0
+ insrwi $h2,$h3,14,6
+ extrwi $h3,$h3,18,0
+ insrwi $h3,$h4,8,6
+ extrwi $h4,$h4,24,0
+
+ mtvrwz $R0,$h0
+ slwi $h0,$h1,2
+ mtvrwz $R1,$h1
+ add $h1,$h1,$h0
+ mtvrwz $S1,$h1
+ slwi $h1,$h2,2
+ mtvrwz $R2,$h2
+ add $h2,$h2,$h1
+ mtvrwz $S2,$h2
+ slwi $h2,$h3,2
+ mtvrwz $R3,$h3
+ add $h3,$h3,$h2
+ mtvrwz $S3,$h3
+ slwi $h3,$h4,2
+ mtvrwz $R4,$h4
+ add $h4,$h4,$h3
+ mtvrwz $S4,$h4
+
+ vmr $H0,$R0
+ vmr $H1,$R1
+ vmr $H2,$R2
+ vmr $H3,$R3
+ vmr $H4,$R4
+
+ bl __poly1305_mul # r^1:- * r^1:-
+
+ vpermdi $R0,$H0,$R0,0b00
+ vpermdi $R1,$H1,$R1,0b00
+ vpermdi $R2,$H2,$R2,0b00
+ vpermdi $R3,$H3,$R3,0b00
+ vpermdi $R4,$H4,$R4,0b00
+ vpermdi $H0,$H0,$H0,0b00
+ vpermdi $H1,$H1,$H1,0b00
+ vpermdi $H2,$H2,$H2,0b00
+ vpermdi $H3,$H3,$H3,0b00
+ vpermdi $H4,$H4,$H4,0b00
+ vsld $S1,$R1,$T0 # <<2
+ vsld $S2,$R2,$T0
+ vsld $S3,$R3,$T0
+ vsld $S4,$R4,$T0
+ vaddudm $S1,$S1,$R1
+ vaddudm $S2,$S2,$R2
+ vaddudm $S3,$S3,$R3
+ vaddudm $S4,$S4,$R4
+
+ bl __poly1305_mul # r^2:r^2 * r^2:r^1
+
+ addi $h0,$ctx,0x60
+ lwz $h1,0($ctx) # load hash
+ lwz $h2,4($ctx)
+ lwz $h3,8($ctx)
+ lwz $h4,12($ctx)
+ lwz $t0,16($ctx)
+
+ vmrgow $R0,$R0,$H0 # r^2:r^4:r^1:r^3
+ vmrgow $R1,$R1,$H1
+ vmrgow $R2,$R2,$H2
+ vmrgow $R3,$R3,$H3
+ vmrgow $R4,$R4,$H4
+ vslw $S1,$R1,$T0 # <<2
+ vslw $S2,$R2,$T0
+ vslw $S3,$R3,$T0
+ vslw $S4,$R4,$T0
+ vadduwm $S1,$S1,$R1
+ vadduwm $S2,$S2,$R2
+ vadduwm $S3,$S3,$R3
+ vadduwm $S4,$S4,$R4
+
+ stvx_u $R0,$x30,$ctx
+ stvx_u $R1,$x40,$ctx
+ stvx_u $S1,$x50,$ctx
+ stvx_u $R2,$x00,$h0
+ stvx_u $S2,$x10,$h0
+ stvx_u $R3,$x20,$h0
+ stvx_u $S3,$x30,$h0
+ stvx_u $R4,$x40,$h0
+ stvx_u $S4,$x50,$h0
+
+ extrwi $h0,$h1,26,6 # base 2^32 -> base 2^26
+ extrwi $h1,$h1,6,0
+ mtvrwz $H0,$h0
+ insrwi $h1,$h2,20,6
+ extrwi $h2,$h2,12,0
+ mtvrwz $H1,$h1
+ insrwi $h2,$h3,14,6
+ extrwi $h3,$h3,18,0
+ mtvrwz $H2,$h2
+ insrwi $h3,$h4,8,6
+ extrwi $h4,$h4,24,0
+ mtvrwz $H3,$h3
+ insrwi $h4,$t0,3,5
+ mtvrwz $H4,$h4
___
+ }
$code.=<<___;
- $POP r28,`$FRAME-$SIZE_T*4`($sp)
- $POP r29,`$FRAME-$SIZE_T*3`($sp)
- $POP r30,`$FRAME-$SIZE_T*2`($sp)
- $POP r31,`$FRAME-$SIZE_T*1`($sp)
- addi $sp,$sp,$FRAME
+ li r0,1
+ stw r0,24($ctx) # set is_base2_26
+ b Loaded_vsx
+
+.align 4
+Lskip_init_vsx:
+ li $x10,4
+ li $x20,8
+ li $x30,12
+ li $x40,16
+ lvwzx_u $H0,$x00,$ctx
+ lvwzx_u $H1,$x10,$ctx
+ lvwzx_u $H2,$x20,$ctx
+ lvwzx_u $H3,$x30,$ctx
+ lvwzx_u $H4,$x40,$ctx
+
+Loaded_vsx:
+ li $x10,0x10
+ li $x20,0x20
+ li $x30,0x30
+ li $x40,0x40
+ li $x50,0x50
+ li $x60,0x60
+ li $x70,0x70
+ addi $ctx_,$ctx,64 # &ctx->r[1]
+ addi $_ctx,$sp,`$LOCALS+15` # &ctx->r[1], r^2:r^4 shadow
+
+ vxor $T0,$T0,$T0 # ensure second half is zero
+ vpermdi $H0,$H0,$T0,0b00
+ vpermdi $H1,$H1,$T0,0b00
+ vpermdi $H2,$H2,$T0,0b00
+ vpermdi $H3,$H3,$T0,0b00
+ vpermdi $H4,$H4,$T0,0b00
+
+ be?lvx_u $_4,$x50,$const # byte swap mask
+ lvx_u $T1,$x00,$inp # load first input block
+ lvx_u $T2,$x10,$inp
+ lvx_u $T3,$x20,$inp
+ lvx_u $T4,$x30,$inp
+ be?vperm $T1,$T1,$T1,$_4
+ be?vperm $T2,$T2,$T2,$_4
+ be?vperm $T3,$T3,$T3,$_4
+ be?vperm $T4,$T4,$T4,$_4
+
+ vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26
+ vspltisb $_4,4
+ vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011
+ vspltisb $_14,14
+ vpermdi $I3,$T1,$T2,0b11
+
+ vsrd $I1,$I0,$_26
+ vsrd $I2,$I2,$_4
+ vsrd $I4,$I3,$_40
+ vsrd $I3,$I3,$_14
+ vand $I0,$I0,$mask26
+ vand $I1,$I1,$mask26
+ vand $I2,$I2,$mask26
+ vand $I3,$I3,$mask26
+
+ vpermdi $T1,$T3,$T4,0b00
+ vperm $T2,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011
+ vpermdi $T3,$T3,$T4,0b11
+
+ vsrd $T0,$T1,$_26
+ vsrd $T2,$T2,$_4
+ vsrd $T4,$T3,$_40
+ vsrd $T3,$T3,$_14
+ vand $T1,$T1,$mask26
+ vand $T0,$T0,$mask26
+ vand $T2,$T2,$mask26
+ vand $T3,$T3,$mask26
+
+ # inp[2]:inp[0]:inp[3]:inp[1]
+ vmrgow $I4,$T4,$I4
+ vmrgow $I0,$T1,$I0
+ vmrgow $I1,$T0,$I1
+ vmrgow $I2,$T2,$I2
+ vmrgow $I3,$T3,$I3
+ vor $I4,$I4,$padbits
+
+ lvx_splt $R0,$x30,$ctx # taking lvx_vsplt out of loop
+ lvx_splt $R1,$x00,$ctx_ # gives ~8% improvement
+ lvx_splt $S1,$x10,$ctx_
+ lvx_splt $R2,$x20,$ctx_
+ lvx_splt $S2,$x30,$ctx_
+ lvx_splt $T1,$x40,$ctx_
+ lvx_splt $T2,$x50,$ctx_
+ lvx_splt $T3,$x60,$ctx_
+ lvx_splt $T4,$x70,$ctx_
+ stvx $R1,$x00,$_ctx
+ stvx $S1,$x10,$_ctx
+ stvx $R2,$x20,$_ctx
+ stvx $S2,$x30,$_ctx
+ stvx $T1,$x40,$_ctx
+ stvx $T2,$x50,$_ctx
+ stvx $T3,$x60,$_ctx
+ stvx $T4,$x70,$_ctx
+
+ addi $inp,$inp,0x40
+ addi $const,$const,0x50
+ addi r0,$len,-64
+ srdi r0,r0,6
+ mtctr r0
+ b Loop_vsx
+
+.align 4
+Loop_vsx:
+ ################################################################
+ ## ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ ## ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ ## \___________________/
+ ##
+ ## Note that we start with inp[2:3]*r^2. This is because it
+ ## doesn't depend on reduction in previous iteration.
+ ################################################################
+ ## d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ ## d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ ## d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ ## d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ ## d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ vmuleuw $ACC0,$I0,$R0
+ vmuleuw $ACC1,$I0,$R1
+ vmuleuw $ACC2,$I0,$R2
+ vmuleuw $ACC3,$I1,$R2
+
+ vmuleuw $T0,$I1,$R0
+ vaddudm $ACC1,$ACC1,$T0
+ vmuleuw $T0,$I1,$R1
+ vaddudm $ACC2,$ACC2,$T0
+ vmuleuw $ACC4,$I2,$R2
+ vmuleuw $T0,$I4,$S1
+ vaddudm $ACC0,$ACC0,$T0
+ vmuleuw $T0,$I2,$R1
+ vaddudm $ACC3,$ACC3,$T0
+ lvx $S3,$x50,$_ctx
+ vmuleuw $T0,$I3,$R1
+ vaddudm $ACC4,$ACC4,$T0
+ lvx $R3,$x40,$_ctx
+
+ vaddudm $H2,$H2,$I2
+ vaddudm $H0,$H0,$I0
+ vaddudm $H3,$H3,$I3
+ vaddudm $H1,$H1,$I1
+ vaddudm $H4,$H4,$I4
+
+ vmuleuw $T0,$I3,$S2
+ vaddudm $ACC0,$ACC0,$T0
+ vmuleuw $T0,$I4,$S2
+ vaddudm $ACC1,$ACC1,$T0
+ vmuleuw $T0,$I2,$R0
+ vaddudm $ACC2,$ACC2,$T0
+ vmuleuw $T0,$I3,$R0
+ vaddudm $ACC3,$ACC3,$T0
+ lvx $S4,$x70,$_ctx
+ vmuleuw $T0,$I4,$R0
+ vaddudm $ACC4,$ACC4,$T0
+ lvx $R4,$x60,$_ctx
+
+ vmuleuw $T0,$I2,$S3
+ vaddudm $ACC0,$ACC0,$T0
+ vmuleuw $T0,$I3,$S3
+ vaddudm $ACC1,$ACC1,$T0
+ vmuleuw $T0,$I4,$S3
+ vaddudm $ACC2,$ACC2,$T0
+ vmuleuw $T0,$I0,$R3
+ vaddudm $ACC3,$ACC3,$T0
+ vmuleuw $T0,$I1,$R3
+ vaddudm $ACC4,$ACC4,$T0
+
+ be?lvx_u $_4,$x00,$const # byte swap mask
+ lvx_u $T1,$x00,$inp # load next input block
+ lvx_u $T2,$x10,$inp
+ lvx_u $T3,$x20,$inp
+ lvx_u $T4,$x30,$inp
+ be?vperm $T1,$T1,$T1,$_4
+ be?vperm $T2,$T2,$T2,$_4
+ be?vperm $T3,$T3,$T3,$_4
+ be?vperm $T4,$T4,$T4,$_4
+
+ vmuleuw $T0,$I1,$S4
+ vaddudm $ACC0,$ACC0,$T0
+ vmuleuw $T0,$I2,$S4
+ vaddudm $ACC1,$ACC1,$T0
+ vmuleuw $T0,$I3,$S4
+ vaddudm $ACC2,$ACC2,$T0
+ vmuleuw $T0,$I4,$S4
+ vaddudm $ACC3,$ACC3,$T0
+ vmuleuw $T0,$I0,$R4
+ vaddudm $ACC4,$ACC4,$T0
+
+ vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26
+ vspltisb $_4,4
+ vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011
+ vpermdi $I3,$T1,$T2,0b11
+
+ # (hash + inp[0:1]) * r^4
+ vmulouw $T0,$H0,$R0
+ vaddudm $ACC0,$ACC0,$T0
+ vmulouw $T0,$H1,$R0
+ vaddudm $ACC1,$ACC1,$T0
+ vmulouw $T0,$H2,$R0
+ vaddudm $ACC2,$ACC2,$T0
+ vmulouw $T0,$H3,$R0
+ vaddudm $ACC3,$ACC3,$T0
+ vmulouw $T0,$H4,$R0
+ vaddudm $ACC4,$ACC4,$T0
+
+ vpermdi $T1,$T3,$T4,0b00
+ vperm $T2,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011
+ vpermdi $T3,$T3,$T4,0b11
+
+ vmulouw $T0,$H2,$S3
+ vaddudm $ACC0,$ACC0,$T0
+ vmulouw $T0,$H3,$S3
+ vaddudm $ACC1,$ACC1,$T0
+ vmulouw $T0,$H4,$S3
+ vaddudm $ACC2,$ACC2,$T0
+ vmulouw $T0,$H0,$R3
+ vaddudm $ACC3,$ACC3,$T0
+ lvx $S1,$x10,$_ctx
+ vmulouw $T0,$H1,$R3
+ vaddudm $ACC4,$ACC4,$T0
+ lvx $R1,$x00,$_ctx
+
+ vsrd $I1,$I0,$_26
+ vsrd $I2,$I2,$_4
+ vsrd $I4,$I3,$_40
+ vsrd $I3,$I3,$_14
+
+ vmulouw $T0,$H1,$S4
+ vaddudm $ACC0,$ACC0,$T0
+ vmulouw $T0,$H2,$S4
+ vaddudm $ACC1,$ACC1,$T0
+ vmulouw $T0,$H3,$S4
+ vaddudm $ACC2,$ACC2,$T0
+ vmulouw $T0,$H4,$S4
+ vaddudm $ACC3,$ACC3,$T0
+ lvx $S2,$x30,$_ctx
+ vmulouw $T0,$H0,$R4
+ vaddudm $ACC4,$ACC4,$T0
+ lvx $R2,$x20,$_ctx
+
+ vand $I0,$I0,$mask26
+ vand $I1,$I1,$mask26
+ vand $I2,$I2,$mask26
+ vand $I3,$I3,$mask26
+
+ vmulouw $T0,$H4,$S1
+ vaddudm $ACC0,$ACC0,$T0
+ vmulouw $T0,$H0,$R1
+ vaddudm $ACC1,$ACC1,$T0
+ vmulouw $T0,$H1,$R1
+ vaddudm $ACC2,$ACC2,$T0
+ vmulouw $T0,$H2,$R1
+ vaddudm $ACC3,$ACC3,$T0
+ vmulouw $T0,$H3,$R1
+ vaddudm $ACC4,$ACC4,$T0
+
+ vsrd $T2,$T2,$_4
+ vsrd $_4,$T1,$_26
+ vsrd $T4,$T3,$_40
+ vsrd $T3,$T3,$_14
+
+ vmulouw $T0,$H3,$S2
+ vaddudm $ACC0,$ACC0,$T0
+ vmulouw $T0,$H4,$S2
+ vaddudm $ACC1,$ACC1,$T0
+ vmulouw $T0,$H0,$R2
+ vaddudm $ACC2,$ACC2,$T0
+ vmulouw $T0,$H1,$R2
+ vaddudm $ACC3,$ACC3,$T0
+ vmulouw $T0,$H2,$R2
+ vaddudm $ACC4,$ACC4,$T0
+
+ vand $T1,$T1,$mask26
+ vand $_4,$_4,$mask26
+ vand $T2,$T2,$mask26
+ vand $T3,$T3,$mask26
+
+ ################################################################
+ # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ # and P. Schwabe
+
+ vspltisb $T0,2
+ vsrd $H4,$ACC3,$_26
+ vsrd $H1,$ACC0,$_26
+ vand $H3,$ACC3,$mask26
+ vand $H0,$ACC0,$mask26
+ vaddudm $H4,$H4,$ACC4 # h3 -> h4
+ vaddudm $H1,$H1,$ACC1 # h0 -> h1
+
+ vmrgow $I4,$T4,$I4
+ vmrgow $I0,$T1,$I0
+ vmrgow $I1,$_4,$I1
+ vmrgow $I2,$T2,$I2
+ vmrgow $I3,$T3,$I3
+ vor $I4,$I4,$padbits
+
+ vsrd $ACC4,$H4,$_26
+ vsrd $ACC1,$H1,$_26
+ vand $H4,$H4,$mask26
+ vand $H1,$H1,$mask26
+ vaddudm $H0,$H0,$ACC4
+ vaddudm $H2,$ACC2,$ACC1 # h1 -> h2
+
+ vsld $ACC4,$ACC4,$T0 # <<2
+ vsrd $ACC2,$H2,$_26
+ vand $H2,$H2,$mask26
+ vaddudm $H0,$H0,$ACC4 # h4 -> h0
+ vaddudm $H3,$H3,$ACC2 # h2 -> h3
+
+ vsrd $ACC0,$H0,$_26
+ vsrd $ACC3,$H3,$_26
+ vand $H0,$H0,$mask26
+ vand $H3,$H3,$mask26
+ vaddudm $H1,$H1,$ACC0 # h0 -> h1
+ vaddudm $H4,$H4,$ACC3 # h3 -> h4
+
+ addi $inp,$inp,0x40
+ bdnz Loop_vsx
+
+ neg $len,$len
+ andi. $len,$len,0x30
+ sub $inp,$inp,$len
+
+ lvx_u $R0,$x30,$ctx # load all powers
+ lvx_u $R1,$x00,$ctx_
+ lvx_u $S1,$x10,$ctx_
+ lvx_u $R2,$x20,$ctx_
+ lvx_u $S2,$x30,$ctx_
+
+Last_vsx:
+ vmuleuw $ACC0,$I0,$R0
+ vmuleuw $ACC1,$I1,$R0
+ vmuleuw $ACC2,$I2,$R0
+ vmuleuw $ACC3,$I3,$R0
+ vmuleuw $ACC4,$I4,$R0
+
+ vmuleuw $T0,$I4,$S1
+ vaddudm $ACC0,$ACC0,$T0
+ vmuleuw $T0,$I0,$R1
+ vaddudm $ACC1,$ACC1,$T0
+ vmuleuw $T0,$I1,$R1
+ vaddudm $ACC2,$ACC2,$T0
+ vmuleuw $T0,$I2,$R1
+ vaddudm $ACC3,$ACC3,$T0
+ lvx_u $S3,$x50,$ctx_
+ vmuleuw $T0,$I3,$R1
+ vaddudm $ACC4,$ACC4,$T0
+ lvx_u $R3,$x40,$ctx_
+
+ vaddudm $H2,$H2,$I2
+ vaddudm $H0,$H0,$I0
+ vaddudm $H3,$H3,$I3
+ vaddudm $H1,$H1,$I1
+ vaddudm $H4,$H4,$I4
+
+ vmuleuw $T0,$I3,$S2
+ vaddudm $ACC0,$ACC0,$T0
+ vmuleuw $T0,$I4,$S2
+ vaddudm $ACC1,$ACC1,$T0
+ vmuleuw $T0,$I0,$R2
+ vaddudm $ACC2,$ACC2,$T0
+ vmuleuw $T0,$I1,$R2
+ vaddudm $ACC3,$ACC3,$T0
+ lvx_u $S4,$x70,$ctx_
+ vmuleuw $T0,$I2,$R2
+ vaddudm $ACC4,$ACC4,$T0
+ lvx_u $R4,$x60,$ctx_
+
+ vmuleuw $T0,$I2,$S3
+ vaddudm $ACC0,$ACC0,$T0
+ vmuleuw $T0,$I3,$S3
+ vaddudm $ACC1,$ACC1,$T0
+ vmuleuw $T0,$I4,$S3
+ vaddudm $ACC2,$ACC2,$T0
+ vmuleuw $T0,$I0,$R3
+ vaddudm $ACC3,$ACC3,$T0
+ vmuleuw $T0,$I1,$R3
+ vaddudm $ACC4,$ACC4,$T0
+
+ vmuleuw $T0,$I1,$S4
+ vaddudm $ACC0,$ACC0,$T0
+ vmuleuw $T0,$I2,$S4
+ vaddudm $ACC1,$ACC1,$T0
+ vmuleuw $T0,$I3,$S4
+ vaddudm $ACC2,$ACC2,$T0
+ vmuleuw $T0,$I4,$S4
+ vaddudm $ACC3,$ACC3,$T0
+ vmuleuw $T0,$I0,$R4
+ vaddudm $ACC4,$ACC4,$T0
+
+ # (hash + inp[0:1]) * r^4
+ vmulouw $T0,$H0,$R0
+ vaddudm $ACC0,$ACC0,$T0
+ vmulouw $T0,$H1,$R0
+ vaddudm $ACC1,$ACC1,$T0
+ vmulouw $T0,$H2,$R0
+ vaddudm $ACC2,$ACC2,$T0
+ vmulouw $T0,$H3,$R0
+ vaddudm $ACC3,$ACC3,$T0
+ vmulouw $T0,$H4,$R0
+ vaddudm $ACC4,$ACC4,$T0
+
+ vmulouw $T0,$H2,$S3
+ vaddudm $ACC0,$ACC0,$T0
+ vmulouw $T0,$H3,$S3
+ vaddudm $ACC1,$ACC1,$T0
+ vmulouw $T0,$H4,$S3
+ vaddudm $ACC2,$ACC2,$T0
+ vmulouw $T0,$H0,$R3
+ vaddudm $ACC3,$ACC3,$T0
+ lvx_u $S1,$x10,$ctx_
+ vmulouw $T0,$H1,$R3
+ vaddudm $ACC4,$ACC4,$T0
+ lvx_u $R1,$x00,$ctx_
+
+ vmulouw $T0,$H1,$S4
+ vaddudm $ACC0,$ACC0,$T0
+ vmulouw $T0,$H2,$S4
+ vaddudm $ACC1,$ACC1,$T0
+ vmulouw $T0,$H3,$S4
+ vaddudm $ACC2,$ACC2,$T0
+ vmulouw $T0,$H4,$S4
+ vaddudm $ACC3,$ACC3,$T0
+ lvx_u $S2,$x30,$ctx_
+ vmulouw $T0,$H0,$R4
+ vaddudm $ACC4,$ACC4,$T0
+ lvx_u $R2,$x20,$ctx_
+
+ vmulouw $T0,$H4,$S1
+ vaddudm $ACC0,$ACC0,$T0
+ vmulouw $T0,$H0,$R1
+ vaddudm $ACC1,$ACC1,$T0
+ vmulouw $T0,$H1,$R1
+ vaddudm $ACC2,$ACC2,$T0
+ vmulouw $T0,$H2,$R1
+ vaddudm $ACC3,$ACC3,$T0
+ vmulouw $T0,$H3,$R1
+ vaddudm $ACC4,$ACC4,$T0
+
+ vmulouw $T0,$H3,$S2
+ vaddudm $ACC0,$ACC0,$T0
+ vmulouw $T0,$H4,$S2
+ vaddudm $ACC1,$ACC1,$T0
+ vmulouw $T0,$H0,$R2
+ vaddudm $ACC2,$ACC2,$T0
+ vmulouw $T0,$H1,$R2
+ vaddudm $ACC3,$ACC3,$T0
+ vmulouw $T0,$H2,$R2
+ vaddudm $ACC4,$ACC4,$T0
+
+ ################################################################
+ # horizontal addition
+
+ vpermdi $H0,$ACC0,$ACC0,0b10
+ vpermdi $H1,$ACC1,$ACC1,0b10
+ vpermdi $H2,$ACC2,$ACC2,0b10
+ vpermdi $H3,$ACC3,$ACC3,0b10
+ vpermdi $H4,$ACC4,$ACC4,0b10
+ vaddudm $ACC0,$ACC0,$H0
+ vaddudm $ACC1,$ACC1,$H1
+ vaddudm $ACC2,$ACC2,$H2
+ vaddudm $ACC3,$ACC3,$H3
+ vaddudm $ACC4,$ACC4,$H4
+
+ ################################################################
+ # lazy reduction
+
+ vspltisb $T0,2
+ vsrd $H4,$ACC3,$_26
+ vsrd $H1,$ACC0,$_26
+ vand $H3,$ACC3,$mask26
+ vand $H0,$ACC0,$mask26
+ vaddudm $H4,$H4,$ACC4 # h3 -> h4
+ vaddudm $H1,$H1,$ACC1 # h0 -> h1
+
+ vsrd $ACC4,$H4,$_26
+ vsrd $ACC1,$H1,$_26
+ vand $H4,$H4,$mask26
+ vand $H1,$H1,$mask26
+ vaddudm $H0,$H0,$ACC4
+ vaddudm $H2,$ACC2,$ACC1 # h1 -> h2
+
+ vsld $ACC4,$ACC4,$T0 # <<2
+ vsrd $ACC2,$H2,$_26
+ vand $H2,$H2,$mask26
+ vaddudm $H0,$H0,$ACC4 # h4 -> h0
+ vaddudm $H3,$H3,$ACC2 # h2 -> h3
+
+ vsrd $ACC0,$H0,$_26
+ vsrd $ACC3,$H3,$_26
+ vand $H0,$H0,$mask26
+ vand $H3,$H3,$mask26
+ vaddudm $H1,$H1,$ACC0 # h0 -> h1
+ vaddudm $H4,$H4,$ACC3 # h3 -> h4
+
+ beq Ldone_vsx
+
+ add r6,$const,$len
+
+ be?lvx_u $_4,$x00,$const # byte swap mask
+ lvx_u $T1,$x00,$inp # load last partial input block
+ lvx_u $T2,$x10,$inp
+ lvx_u $T3,$x20,$inp
+ lvx_u $T4,$x30,$inp
+ be?vperm $T1,$T1,$T1,$_4
+ be?vperm $T2,$T2,$T2,$_4
+ be?vperm $T3,$T3,$T3,$_4
+ be?vperm $T4,$T4,$T4,$_4
+
+ vpermdi $I0,$T1,$T2,0b00 # smash input to base 2^26
+ vspltisb $_4,4
+ vperm $I2,$T1,$T2,$I2perm # 0x...0e0f0001...1e1f1011
+ vpermdi $I3,$T1,$T2,0b11
+
+ vsrd $I1,$I0,$_26
+ vsrd $I2,$I2,$_4
+ vsrd $I4,$I3,$_40
+ vsrd $I3,$I3,$_14
+ vand $I0,$I0,$mask26
+ vand $I1,$I1,$mask26
+ vand $I2,$I2,$mask26
+ vand $I3,$I3,$mask26
+
+ vpermdi $T0,$T3,$T4,0b00
+ vperm $T1,$T3,$T4,$I2perm # 0x...0e0f0001...1e1f1011
+ vpermdi $T2,$T3,$T4,0b11
+
+ lvx_u $ACC0,$x00,r6
+ lvx_u $ACC1,$x30,r6
+
+ vsrd $T3,$T0,$_26
+ vsrd $T1,$T1,$_4
+ vsrd $T4,$T2,$_40
+ vsrd $T2,$T2,$_14
+ vand $T0,$T0,$mask26
+ vand $T3,$T3,$mask26
+ vand $T1,$T1,$mask26
+ vand $T2,$T2,$mask26
+
+ # inp[2]:inp[0]:inp[3]:inp[1]
+ vmrgow $I4,$T4,$I4
+ vmrgow $I0,$T0,$I0
+ vmrgow $I1,$T3,$I1
+ vmrgow $I2,$T1,$I2
+ vmrgow $I3,$T2,$I3
+ vor $I4,$I4,$padbits
+
+ vperm $H0,$H0,$H0,$ACC0 # move hash to right lane
+ vand $I0,$I0, $ACC1 # mask redundant input lane[s]
+ vperm $H1,$H1,$H1,$ACC0
+ vand $I1,$I1, $ACC1
+ vperm $H2,$H2,$H2,$ACC0
+ vand $I2,$I2, $ACC1
+ vperm $H3,$H3,$H3,$ACC0
+ vand $I3,$I3, $ACC1
+ vperm $H4,$H4,$H4,$ACC0
+ vand $I4,$I4, $ACC1
+
+ vaddudm $I0,$I0,$H0 # accumulate hash
+ vxor $H0,$H0,$H0 # wipe hash value
+ vaddudm $I1,$I1,$H1
+ vxor $H1,$H1,$H1
+ vaddudm $I2,$I2,$H2
+ vxor $H2,$H2,$H2
+ vaddudm $I3,$I3,$H3
+ vxor $H3,$H3,$H3
+ vaddudm $I4,$I4,$H4
+ vxor $H4,$H4,$H4
+
+ xor. $len,$len,$len
+ b Last_vsx
+
+.align 4
+Ldone_vsx:
+ $POP r0,`$VSXFRAME+$LRSAVE`($sp)
+ li $x10,4
+ li $x20,8
+ li $x30,12
+ li $x40,16
+ stvwx_u $H0,$x00,$ctx # store hash
+ stvwx_u $H1,$x10,$ctx
+ stvwx_u $H2,$x20,$ctx
+ stvwx_u $H3,$x30,$ctx
+ stvwx_u $H4,$x40,$ctx
+
+ lwz r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# pull vrsave
+ mtlr r0
+ li r10,`15+$LOCALS+128`
+ li r11,`31+$LOCALS+128`
+ mtspr 256,r12 # restore vrsave
+ lvx v20,r10,$sp
+ addi r10,r10,32
+ lvx v21,r11,$sp
+ addi r11,r11,32
+ lvx v22,r10,$sp
+ addi r10,r10,32
+ lvx v23,r11,$sp
+ addi r11,r11,32
+ lvx v24,r10,$sp
+ addi r10,r10,32
+ lvx v25,r11,$sp
+ addi r11,r11,32
+ lvx v26,r10,$sp
+ addi r10,r10,32
+ lvx v27,r11,$sp
+ addi r11,r11,32
+ lvx v28,r10,$sp
+ addi r10,r10,32
+ lvx v29,r11,$sp
+ addi r11,r11,32
+ lvx v30,r10,$sp
+ lvx v31,r11,$sp
+ $POP r27,`$VSXFRAME-$SIZE_T*5`($sp)
+ $POP r28,`$VSXFRAME-$SIZE_T*4`($sp)
+ $POP r29,`$VSXFRAME-$SIZE_T*3`($sp)
+ $POP r30,`$VSXFRAME-$SIZE_T*2`($sp)
+ $POP r31,`$VSXFRAME-$SIZE_T*1`($sp)
+ addi $sp,$sp,$VSXFRAME
blr
.long 0
- .byte 0,12,4,1,0x80,4,3,0
-.size .poly1305_emit,.-.poly1305_emit
+ .byte 0,12,0x04,1,0x80,5,4,0
+ .long 0
+.size __poly1305_blocks_vsx,.-__poly1305_blocks_vsx
+
+.align 6
+LPICmeup:
+ mflr r0
+ bcl 20,31,\$+4
+ mflr $const # vvvvvv "distance" between . and 1st data entry
+ addi $const,$const,`64-8`
+ mtlr r0
+ blr
+ .long 0
+ .byte 0,12,0x14,0,0,0,0,0
+ .space `64-9*4`
+
+.quad 0x0000000003ffffff,0x0000000003ffffff # mask26
+.quad 0x000000000000001a,0x000000000000001a # _26
+.quad 0x0000000000000028,0x0000000000000028 # _40
+.quad 0x000000000e0f0001,0x000000001e1f1011 # I2perm
+.quad 0x0100000001000000,0x0100000001000000 # padbits
+.quad 0x0706050403020100,0x0f0e0d0c0b0a0908 # byte swap for big-endian
+
+.quad 0x0000000000000000,0x0000000004050607 # magic tail masks
+.quad 0x0405060700000000,0x0000000000000000
+.quad 0x0000000000000000,0x0405060700000000
+
+.quad 0xffffffff00000000,0xffffffffffffffff
+.quad 0xffffffff00000000,0xffffffff00000000
+.quad 0x0000000000000000,0xffffffff00000000
___
- }
+}}}
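
The .quad table just above holds the constants for the radix-2^26 split of each 16-byte input block: the 0x3ffffff mask and the 26/40-bit shift counts (limb 2 is extracted with the I2perm byte permute followed by a 4-bit shift instead of a cross-word shift). A standalone Perl sketch of the same split, not part of the patch and using made-up block values, could look like this (assumes a 64-bit perl):

#!/usr/bin/env perl
# Standalone sketch, not part of the patch: split one 16-byte block,
# given as two little-endian 64-bit words, into five 26-bit limbs the
# way the mask26/_26/_40 constants above are used.
use strict;
use warnings;

sub base2_26_split {
    my ($lo, $hi) = @_;
    my $mask26 = 0x3ffffff;
    return (
         $lo                                   & $mask26,  # bits   0..25
        ($lo >> 26)                            & $mask26,  # bits  26..51
        (($lo >> 52) | (($hi & 0x3fff) << 12)) & $mask26,  # bits  52..77
        ($hi >> 14)                            & $mask26,  # bits  78..103
         $hi >> 40,                                        # bits 104..127
    );
}

my ($lo, $hi) = (0x0123456789abcdef, 0x0fedcba987654321);  # made-up block
my @l = base2_26_split($lo, $hi);
printf "limb %d = %07x\n", $_, $l[$_] for 0 .. 4;

# round-trip back to the two words as a sanity check
my $back_lo =  $l[0] | ($l[1] << 26) | (($l[2] & 0xfff) << 52);
my $back_hi = ($l[2] >> 12) | ($l[3] << 14) | ($l[4] << 40);
print "round-trip ok\n" if $back_lo == $lo && $back_hi == $hi;
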
$code.=<<___;
-.asciz "Poly1305 for PPC, CRYPTOGAMS by <appro\@openssl.org>"
+.asciz "Poly1305 for PPC, CRYPTOGAMS by \@dot-asm"
___
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-print $code;
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+
+ # instructions prefixed with '?' are endian-specific and need
+ # to be adjusted accordingly...
+ if ($flavour !~ /le$/) { # big-endian
+ s/be\?// or
+ s/le\?/#le#/
+ } else { # little-endian
+ s/le\?// or
+ s/be\?/#be#/
+ }
+
+ print $_,"\n";
+}
close STDOUT or die "error closing STDOUT: $!";
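
As the comment in the output loop notes, the 'be?'/'le?' prefixes mark endian-specific instructions: the matching ones have the prefix stripped, the others are turned into comments. A small standalone demonstration of that rewriting, not part of the patch (the flavour string and sample instruction lines are made up):

#!/usr/bin/env perl
# Standalone demonstration of the be?/le? prefix handling in the output
# loop above.  The flavour string and instruction lines are made up.
use strict;
use warnings;

my $flavour = "linux64le";
my @lines = (
    'be?lvx_u	$T1,0,$inp	# only emitted on big-endian',
    'le?li	r0,0		# only emitted on little-endian',
    '	vand	$H0,$ACC0,$mask26',
);

for (@lines) {
    if ($flavour !~ /le$/) {	# big-endian
	s/be\?// or s/le\?/#le#/;
    } else {			# little-endian
	s/le\?// or s/be\?/#be#/;
    }
    print "$_\n";
}
# for "linux64le" the be? line comes out commented as "#be#...", the
# le? and unprefixed lines are emitted as plain instructions
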
diff --git a/crypto/poly1305/asm/poly1305-ppcfp.pl b/crypto/poly1305/asm/poly1305-ppcfp.pl
index a9ab20714697..218708a46257 100755
--- a/crypto/poly1305/asm/poly1305-ppcfp.pl
+++ b/crypto/poly1305/asm/poly1305-ppcfp.pl
@@ -1,7 +1,7 @@
#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
-# Licensed under the OpenSSL license (the "License"). You may not use
+# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
@@ -27,7 +27,10 @@
# POWER7 3.50/+30%
# POWER8 3.75/+10%
-$flavour = shift;
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
if ($flavour =~ /64/) {
$SIZE_T =8;
@@ -54,7 +57,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+open STDOUT,"| $^X $xlate $flavour \"$output\""
+ or die "can't call $xlate: $!";
$LOCALS=6*$SIZE_T;
$FRAME=$LOCALS+6*8+18*8;
diff --git a/crypto/poly1305/asm/poly1305-s390x.pl b/crypto/poly1305/asm/poly1305-s390x.pl
index bcc8fd3b886a..4a93064ff663 100755
--- a/crypto/poly1305/asm/poly1305-s390x.pl
+++ b/crypto/poly1305/asm/poly1305-s390x.pl
@@ -1,7 +1,7 @@
#! /usr/bin/env perl
-# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
-# Licensed under the OpenSSL license (the "License"). You may not use
+# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
@@ -24,204 +24,961 @@
#
# On side note, z13 enables vector base 2^26 implementation...
-$flavour = shift;
+#
+# January 2019
+#
+# Add vx code path (base 2^26).
+#
+# Copyright IBM Corp. 2019
+# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
+#
+# January 2019
+#
+# Add vector base 2^26 implementation. It's problematic to accurately
+# measure performance, because the reference system is hardly ever idle.
+# But it's sub-cycle, i.e. less than 1 cycle per processed byte; it's
+# >=20% faster than IBM's submission on long inputs, and much faster on
+# short ones, because calculation of the key powers is postponed until
+# the input is known to be long enough to justify the additional overhead.
+
+use strict;
+use FindBin qw($Bin);
+use lib "$Bin/../..";
+use perlasm::s390x qw(:DEFAULT :GE :EI :MI1 :VX AUTOLOAD LABEL INCLUDE);
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+my ($z,$SIZE_T);
if ($flavour =~ /3[12]/) {
+ $z=0; # S/390 ABI
$SIZE_T=4;
- $g="";
} else {
+ $z=1; # zSeries ABI
$SIZE_T=8;
- $g="g";
}
-while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-$sp="%r15";
+my $stdframe=16*$SIZE_T+4*8;
+my $sp="%r15";
my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));
-$code.=<<___;
-.text
-
-.globl poly1305_init
-.type poly1305_init,\@function
-.align 16
-poly1305_init:
- lghi %r0,0
- lghi %r1,-1
- stg %r0,0($ctx) # zero hash value
- stg %r0,8($ctx)
- stg %r0,16($ctx)
-
- cl${g}r $inp,%r0
- je .Lno_key
-
- lrvg %r4,0($inp) # load little-endian key
- lrvg %r5,8($inp)
-
- nihl %r1,0xffc0 # 0xffffffc0ffffffff
- srlg %r0,%r1,4 # 0x0ffffffc0fffffff
- srlg %r1,%r1,4
- nill %r1,0xfffc # 0x0ffffffc0ffffffc
-
- ngr %r4,%r0
- ngr %r5,%r1
-
- stg %r4,32($ctx)
- stg %r5,40($ctx)
-
-.Lno_key:
- lghi %r2,0
- br %r14
-.size poly1305_init,.-poly1305_init
-___
+PERLASM_BEGIN($output);
+
+INCLUDE ("s390x_arch.h");
+TEXT ();
+
+################
+# static void poly1305_init(void *ctx, const unsigned char key[16])
+{
+GLOBL ("poly1305_init");
+TYPE ("poly1305_init","\@function");
+ALIGN (16);
+LABEL ("poly1305_init");
+ lghi ("%r0",0);
+ lghi ("%r1",-1);
+ stg ("%r0","0($ctx)"); # zero hash value
+ stg ("%r0","8($ctx)");
+ stg ("%r0","16($ctx)");
+ st ("%r0","24($ctx)"); # clear is_base2_26
+ lgr ("%r5",$ctx); # reassign $ctx
+ lghi ("%r2",0);
+
+&{$z? \&clgr:\&clr} ($inp,"%r0");
+ je (".Lno_key");
+
+ lrvg ("%r2","0($inp)"); # load little-endian key
+ lrvg ("%r3","8($inp)");
+
+ nihl ("%r1",0xffc0); # 0xffffffc0ffffffff
+ srlg ("%r0","%r1",4); # 0x0ffffffc0fffffff
+ srlg ("%r1","%r1",4);
+ nill ("%r1",0xfffc); # 0x0ffffffc0ffffffc
+
+ ngr ("%r2","%r0");
+ ngr ("%r3","%r1");
+
+ stmg ("%r2","%r3","32(%r5)");
+
+ larl ("%r1","OPENSSL_s390xcap_P");
+ lg ("%r0","16(%r1)");
+ srlg ("%r0","%r0",62);
+ nill ("%r0",1); # extract vx bit
+ lcgr ("%r0","%r0");
+ larl ("%r1",".Lpoly1305_blocks");
+ larl ("%r2",".Lpoly1305_blocks_vx");
+ larl ("%r3",".Lpoly1305_emit");
+&{$z? \&xgr:\&xr} ("%r2","%r1"); # select between scalar and vector
+&{$z? \&ngr:\&nr} ("%r2","%r0");
+&{$z? \&xgr:\&xr} ("%r2","%r1");
+&{$z? \&stmg:\&stm} ("%r2","%r3","0(%r4)");
+ lghi ("%r2",1);
+LABEL (".Lno_key");
+ br ("%r14");
+SIZE ("poly1305_init",".-poly1305_init");
+}
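
poly1305_init clamps the two little-endian key halves with the masks 0x0ffffffc0fffffff and 0x0ffffffc0ffffffc, both derived from -1 by the lghi/nihl/srlg/nill sequence above. A standalone Perl sketch of the same derivation, not part of the patch and applied to made-up key words (assumes a 64-bit perl):

#!/usr/bin/env perl
# Standalone sketch: build the Poly1305 key-clamping masks the way
# poly1305_init above does, then apply them to sample key halves.
use strict;
use warnings;

my $m = ~0;                     # lghi %r1,-1
$m &= ~(0x3f << 32);            # nihl %r1,0xffc0 -> 0xffffffc0ffffffff
my $mask0 = $m >> 4;            # srlg %r0,%r1,4  -> 0x0ffffffc0fffffff
my $mask1 = ($m >> 4) & ~0x3;   # srlg + nill     -> 0x0ffffffc0ffffffc

printf "mask0 = %016x\n", $mask0;
printf "mask1 = %016x\n", $mask1;

# clamp the two 64-bit halves of r (sample values only)
my ($r0, $r1) = (0x0123456789abcdef, 0xfedcba9876543210);
printf "clamped r0 = %016x\n", $r0 & $mask0;
printf "clamped r1 = %016x\n", $r1 & $mask1;
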
+
+################
+# static void poly1305_blocks(void *ctx, const unsigned char *inp,
+# size_t len, u32 padbit)
{
my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
my ($r0,$r1,$s1) = map("%r$_",(0..2));
-$code.=<<___;
-.globl poly1305_blocks
-.type poly1305_blocks,\@function
-.align 16
-poly1305_blocks:
- srl${g} $len,4 # fixed-up in 64-bit build
- lghi %r0,0
- cl${g}r $len,%r0
- je .Lno_data
-
- stm${g} %r6,%r14,`6*$SIZE_T`($sp)
-
- llgfr $padbit,$padbit # clear upper half, much needed with
- # non-64-bit ABI
- lg $r0,32($ctx) # load key
- lg $r1,40($ctx)
-
- lg $h0,0($ctx) # load hash value
- lg $h1,8($ctx)
- lg $h2,16($ctx)
-
- st$g $ctx,`2*$SIZE_T`($sp) # off-load $ctx
- srlg $s1,$r1,2
- algr $s1,$r1 # s1 = r1 + r1>>2
- j .Loop
-
-.align 16
-.Loop:
- lrvg $d0lo,0($inp) # load little-endian input
- lrvg $d1lo,8($inp)
- la $inp,16($inp)
-
- algr $d0lo,$h0 # accumulate input
- alcgr $d1lo,$h1
-
- lgr $h0,$d0lo
- mlgr $d0hi,$r0 # h0*r0 -> $d0hi:$d0lo
- lgr $h1,$d1lo
- mlgr $d1hi,$s1 # h1*5*r1 -> $d1hi:$d1lo
-
- mlgr $t0,$r1 # h0*r1 -> $t0:$h0
- mlgr $t1,$r0 # h1*r0 -> $t1:$h1
- alcgr $h2,$padbit
-
- algr $d0lo,$d1lo
- lgr $d1lo,$h2
- alcgr $d0hi,$d1hi
- lghi $d1hi,0
-
- algr $h1,$h0
- alcgr $t1,$t0
-
- msgr $d1lo,$s1 # h2*s1
- msgr $h2,$r0 # h2*r0
-
- algr $h1,$d1lo
- alcgr $t1,$d1hi # $d1hi is zero
-
- algr $h1,$d0hi
- alcgr $h2,$t1
-
- lghi $h0,-4 # final reduction step
- ngr $h0,$h2
- srlg $t0,$h2,2
- algr $h0,$t0
- lghi $t1,3
- ngr $h2,$t1
-
- algr $h0,$d0lo
- alcgr $h1,$d1hi # $d1hi is still zero
- alcgr $h2,$d1hi # $d1hi is still zero
-
- brct$g $len,.Loop
-
- l$g $ctx,`2*$SIZE_T`($sp) # restore $ctx
-
- stg $h0,0($ctx) # store hash value
- stg $h1,8($ctx)
- stg $h2,16($ctx)
-
- lm${g} %r6,%r14,`6*$SIZE_T`($sp)
-.Lno_data:
- br %r14
-.size poly1305_blocks,.-poly1305_blocks
-___
+GLOBL ("poly1305_blocks");
+TYPE ("poly1305_blocks","\@function");
+ALIGN (16);
+LABEL ("poly1305_blocks");
+LABEL (".Lpoly1305_blocks");
+&{$z? \&ltgr:\&ltr} ("%r0",$len);
+ jz (".Lno_data");
+
+&{$z? \&stmg:\&stm} ("%r6","%r14","6*$SIZE_T($sp)");
+
+ lg ($h0,"0($ctx)"); # load hash value
+ lg ($h1,"8($ctx)");
+ lg ($h2,"16($ctx)");
+
+LABEL (".Lpoly1305_blocks_entry");
+if ($z) {
+ srlg ($len,$len,4);
+} else {
+ srl ($len,4);
+}
+ llgfr ($padbit,$padbit); # clear upper half, much needed with
+ # non-64-bit ABI
+ lg ($r0,"32($ctx)"); # load key
+ lg ($r1,"40($ctx)");
+
+&{$z? \&stg:\&st} ($ctx,"2*$SIZE_T($sp)"); # off-load $ctx
+ srlg ($s1,$r1,2);
+ algr ($s1,$r1); # s1 = r1 + r1>>2
+ j (".Loop");
+
+ALIGN (16);
+LABEL (".Loop");
+ lrvg ($d0lo,"0($inp)"); # load little-endian input
+ lrvg ($d1lo,"8($inp)");
+ la ($inp,"16($inp)");
+
+ algr ($d0lo,$h0); # accumulate input
+ alcgr ($d1lo,$h1);
+ alcgr ($h2,$padbit);
+
+ lgr ($h0,$d0lo);
+ mlgr ($d0hi,$r0); # h0*r0 -> $d0hi:$d0lo
+ lgr ($h1,$d1lo);
+ mlgr ($d1hi,$s1); # h1*5*r1 -> $d1hi:$d1lo
+
+ mlgr ($t0,$r1); # h0*r1 -> $t0:$h0
+ mlgr ($t1,$r0); # h1*r0 -> $t1:$h1
+
+ algr ($d0lo,$d1lo);
+ lgr ($d1lo,$h2);
+ alcgr ($d0hi,$d1hi);
+ lghi ($d1hi,0);
+
+ algr ($h1,$h0);
+ alcgr ($t1,$t0);
+
+ msgr ($d1lo,$s1); # h2*s1
+ msgr ($h2,$r0); # h2*r0
+
+ algr ($h1,$d1lo);
+ alcgr ($t1,$d1hi); # $d1hi is zero
+
+ algr ($h1,$d0hi);
+ alcgr ($h2,$t1);
+
+ lghi ($h0,-4); # final reduction step
+ ngr ($h0,$h2);
+ srlg ($t0,$h2,2);
+ algr ($h0,$t0);
+ lghi ($t1,3);
+ ngr ($h2,$t1);
+
+ algr ($h0,$d0lo);
+ alcgr ($h1,$d1hi); # $d1hi is still zero
+ alcgr ($h2,$d1hi); # $d1hi is still zero
+
+&{$z? \&brctg:\&brct} ($len,".Loop");
+
+&{$z? \&lg:\&l} ($ctx,"2*$SIZE_T($sp)");# restore $ctx
+
+ stg ($h0,"0($ctx)"); # store hash value
+ stg ($h1,"8($ctx)");
+ stg ($h2,"16($ctx)");
+
+&{$z? \&lmg:\&lm} ("%r6","%r14","6*$SIZE_T($sp)");
+LABEL (".Lno_data");
+ br ("%r14");
+SIZE ("poly1305_blocks",".-poly1305_blocks");
+}
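
The "final reduction step" in the loop above folds the accumulator bits at and above 2^130 back into the low part multiplied by 5, computing 5*(h2>>2) as (h2 & -4) + (h2>>2) and keeping only h2 & 3. A standalone Math::BigInt check, not part of the patch, of the congruence that justifies the fold:

#!/usr/bin/env perl
# Standalone check: adding 5*(bits >= 2^130) back into the low 130 bits
# is reduction modulo p = 2^130 - 5, because 2^130 is congruent to 5
# modulo p.  The accumulator value below is made up.
use strict;
use warnings;
use Math::BigInt;

my $p    = Math::BigInt->new(2)->bpow(130)->bsub(5);
my $r130 = Math::BigInt->new(2)->bpow(130);

my $h    = Math::BigInt->new("0x" . "d" x 33);   # made-up ~132-bit value
my $lo   = $h->copy->bmod($r130);                # bits 0..129
my $top  = $h->copy->brsft(130);                 # bits 130 and up
my $fold = $lo->badd($top->bmul(5));             # lo + 5*top

print "congruence holds\n"
    if $fold->bmod($p)->bcmp($h->copy->bmod($p)) == 0;
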
+
+################
+# static void poly1305_blocks_vx(void *ctx, const unsigned char *inp,
+# size_t len, u32 padbit)
+{
+my ($H0, $H1, $H2, $H3, $H4) = map("%v$_",(0..4));
+my ($I0, $I1, $I2, $I3, $I4) = map("%v$_",(5..9));
+my ($R0, $R1, $S1, $R2, $S2) = map("%v$_",(10..14));
+my ($R3, $S3, $R4, $S4) = map("%v$_",(15..18));
+my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("%v$_",(19..23));
+my ($T1, $T2, $T3, $T4) = map("%v$_",(24..27));
+my ($mask26,$bswaplo,$bswaphi,$bswapmi) = map("%v$_",(28..31));
+
+my ($d2,$d0,$h0,$d1,$h1,$h2)=map("%r$_",(9..14));
+
+TYPE ("poly1305_blocks_vx","\@function");
+ALIGN (16);
+LABEL ("poly1305_blocks_vx");
+LABEL (".Lpoly1305_blocks_vx");
+&{$z? \&clgfi:\&clfi} ($len,128);
+ jhe ("__poly1305_blocks_vx");
+
+&{$z? \&stmg:\&stm} ("%r6","%r14","6*$SIZE_T($sp)");
+
+ lg ($d0,"0($ctx)");
+ lg ($d1,"8($ctx)");
+ lg ($d2,"16($ctx)");
+
+ llgfr ("%r0",$d0); # base 2^26 -> base 2^64
+ srlg ($h0,$d0,32);
+ llgfr ("%r1",$d1);
+ srlg ($h1,$d1,32);
+ srlg ($h2,$d2,32);
+
+ sllg ("%r0","%r0",26);
+ algr ($h0,"%r0");
+ sllg ("%r0",$h1,52);
+ srlg ($h1,$h1,12);
+ sllg ("%r1","%r1",14);
+ algr ($h0,"%r0");
+ alcgr ($h1,"%r1");
+ sllg ("%r0",$h2,40);
+ srlg ($h2,$h2,24);
+ lghi ("%r1",0);
+ algr ($h1,"%r0");
+ alcgr ($h2,"%r1");
+
+ llgf ("%r0","24($ctx)"); # is_base2_26
+ lcgr ("%r0","%r0");
+
+ xgr ($h0,$d0); # choose between radixes
+ xgr ($h1,$d1);
+ xgr ($h2,$d2);
+ ngr ($h0,"%r0");
+ ngr ($h1,"%r0");
+ ngr ($h2,"%r0");
+ xgr ($h0,$d0);
+ xgr ($h1,$d1);
+ xgr ($h2,$d2);
+
+ lhi ("%r0",0);
+ st ("%r0","24($ctx)"); # clear is_base2_26
+
+ j (".Lpoly1305_blocks_entry");
+SIZE ("poly1305_blocks_vx",".-poly1305_blocks_vx");
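
poly1305_blocks_vx converts the stored base-2^26 hash to base 2^64 and then picks between the converted and the stored value with the branchless xor/and/xor pattern ("choose between radixes"); poly1305_init uses the same pattern to select the blocks entry point. A standalone Perl sketch of that constant-time select, not part of the patch (assumes a 64-bit perl):

#!/usr/bin/env perl
# Standalone sketch of the branchless select used above: with
# mask = 0 - flag (all ones when flag is 1, zero when it is 0),
# ((a ^ b) & mask) ^ b yields a or b without a branch.
use strict;
use warnings;

sub ct_select {
    my ($flag, $a, $b) = @_;                         # $a if $flag, else $b
    my $mask = (0 - $flag) & 0xffffffffffffffff;     # 0x00..0 or 0xff..f
    return (($a ^ $b) & $mask) ^ $b;
}

my ($converted, $stored) = (0x1111111111111111, 0x2222222222222222);
printf "%x\n", ct_select(1, $converted, $stored);    # 1111111111111111
printf "%x\n", ct_select(0, $converted, $stored);    # 2222222222222222
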
+
+TYPE ("__poly1305_mul","\@function");
+ALIGN (16);
+LABEL ("__poly1305_mul");
+ vmlof ($ACC0,$H0,$R0);
+ vmlof ($ACC1,$H0,$R1);
+ vmlof ($ACC2,$H0,$R2);
+ vmlof ($ACC3,$H0,$R3);
+ vmlof ($ACC4,$H0,$R4);
+
+ vmalof ($ACC0,$H1,$S4,$ACC0);
+ vmalof ($ACC1,$H1,$R0,$ACC1);
+ vmalof ($ACC2,$H1,$R1,$ACC2);
+ vmalof ($ACC3,$H1,$R2,$ACC3);
+ vmalof ($ACC4,$H1,$R3,$ACC4);
+
+ vmalof ($ACC0,$H2,$S3,$ACC0);
+ vmalof ($ACC1,$H2,$S4,$ACC1);
+ vmalof ($ACC2,$H2,$R0,$ACC2);
+ vmalof ($ACC3,$H2,$R1,$ACC3);
+ vmalof ($ACC4,$H2,$R2,$ACC4);
+
+ vmalof ($ACC0,$H3,$S2,$ACC0);
+ vmalof ($ACC1,$H3,$S3,$ACC1);
+ vmalof ($ACC2,$H3,$S4,$ACC2);
+ vmalof ($ACC3,$H3,$R0,$ACC3);
+ vmalof ($ACC4,$H3,$R1,$ACC4);
+
+ vmalof ($ACC0,$H4,$S1,$ACC0);
+ vmalof ($ACC1,$H4,$S2,$ACC1);
+ vmalof ($ACC2,$H4,$S3,$ACC2);
+ vmalof ($ACC3,$H4,$S4,$ACC3);
+ vmalof ($ACC4,$H4,$R0,$ACC4);
+
+ ################################################################
+ # lazy reduction
+
+ vesrlg ($H4,$ACC3,26);
+ vesrlg ($H1,$ACC0,26);
+ vn ($H3,$ACC3,$mask26);
+ vn ($H0,$ACC0,$mask26);
+ vag ($H4,$H4,$ACC4); # h3 -> h4
+ vag ($H1,$H1,$ACC1); # h0 -> h1
+
+ vesrlg ($ACC4,$H4,26);
+ vesrlg ($ACC1,$H1,26);
+ vn ($H4,$H4,$mask26);
+ vn ($H1,$H1,$mask26);
+ vag ($H0,$H0,$ACC4);
+ vag ($H2,$ACC2,$ACC1); # h1 -> h2
+
+ veslg ($ACC4,$ACC4,2); # <<2
+ vesrlg ($ACC2,$H2,26);
+ vn ($H2,$H2,$mask26);
+ vag ($H0,$H0,$ACC4); # h4 -> h0
+ vag ($H3,$H3,$ACC2); # h2 -> h3
+
+ vesrlg ($ACC0,$H0,26);
+ vesrlg ($ACC3,$H3,26);
+ vn ($H0,$H0,$mask26);
+ vn ($H3,$H3,$mask26);
+ vag ($H1,$H1,$ACC0); # h0 -> h1
+ vag ($H4,$H4,$ACC3); # h3 -> h4
+ br ("%r14");
+SIZE ("__poly1305_mul",".-__poly1305_mul");
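
__poly1305_mul ends with the lazy reduction: one pass of carry propagation over the five 26-bit limbs, with the carry out of the top limb re-entering limb 0 multiplied by 5 (a shift left by 2 plus the carry itself). A standalone scalar Perl restatement of that pass, not part of the patch and using made-up limb values (assumes a 64-bit perl):

#!/usr/bin/env perl
# Standalone sketch of the lazy reduction above: limb sums exceed 26
# bits after a multiplication, so carries are propagated once, and the
# carry out of limb 4 is folded into limb 0 times 5 (2^130 = 5 mod p).
# One pass leaves each limb small enough, at most a little over 26
# bits, to feed the next multiplication.
use strict;
use warnings;

sub lazy_reduce {
    my @h = @_;                       # five limb accumulators
    my $mask26 = 0x3ffffff;
    my $c;

    $c = $h[0] >> 26; $h[0] &= $mask26; $h[1] += $c;             # h0 -> h1
    $c = $h[3] >> 26; $h[3] &= $mask26; $h[4] += $c;             # h3 -> h4
    $c = $h[1] >> 26; $h[1] &= $mask26; $h[2] += $c;             # h1 -> h2
    $c = $h[4] >> 26; $h[4] &= $mask26; $h[0] += ($c << 2) + $c; # h4 -> h0, *5
    $c = $h[2] >> 26; $h[2] &= $mask26; $h[3] += $c;             # h2 -> h3
    $c = $h[0] >> 26; $h[0] &= $mask26; $h[1] += $c;             # h0 -> h1
    $c = $h[3] >> 26; $h[3] &= $mask26; $h[4] += $c;             # h3 -> h4
    return @h;
}

# made-up limb values of roughly the magnitude a multiply produces
my @h = lazy_reduce(map { (1 << 57) + $_ } 0 .. 4);
printf "h%d = %x\n", $_, $h[$_] for 0 .. 4;
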
+
+TYPE ("__poly1305_blocks_vx","\@function");
+ALIGN (16);
+LABEL ("__poly1305_blocks_vx");
+&{$z? \&lgr:\&lr} ("%r0",$sp);
+&{$z? \&stmg:\&stm} ("%r10","%r15","10*$SIZE_T($sp)");
+if (!$z) {
+ std ("%f4","16*$SIZE_T+2*8($sp)");
+ std ("%f6","16*$SIZE_T+3*8($sp)");
+ ahi ($sp,-$stdframe);
+ st ("%r0","0($sp)"); # back-chain
+
+ llgfr ($len,$len); # so that srlg works on $len
+} else {
+ aghi ($sp,"-($stdframe+8*8)");
+ stg ("%r0","0($sp)"); # back-chain
+
+ std ("%f8","$stdframe+0*8($sp)");
+ std ("%f9","$stdframe+1*8($sp)");
+ std ("%f10","$stdframe+2*8($sp)");
+ std ("%f11","$stdframe+3*8($sp)");
+ std ("%f12","$stdframe+4*8($sp)");
+ std ("%f13","$stdframe+5*8($sp)");
+ std ("%f14","$stdframe+6*8($sp)");
+ std ("%f15","$stdframe+7*8($sp)");
+}
+ larl ("%r1",".Lconst");
+ vgmg ($mask26,38,63);
+ vlm ($bswaplo,$bswapmi,"16(%r1)");
+
+ &lt ("%r0","24($ctx)"); # is_base2_26?
+ jnz (".Lskip_init");
+
+ lg ($h0,"32($ctx)"); # load key base 2^64
+ lg ($h1,"40($ctx)");
+
+ risbg ($d0,$h0,38,0x80+63,38); # base 2^64 -> 2^26
+ srlg ($d1,$h0,52);
+ risbg ($h0,$h0,38,0x80+63,0);
+ vlvgg ($R0,$h0,0);
+ risbg ($d1,$h1,38,51,12);
+ vlvgg ($R1,$d0,0);
+ risbg ($d0,$h1,38,63,50);
+ vlvgg ($R2,$d1,0);
+ srlg ($d1,$h1,40);
+ vlvgg ($R3,$d0,0);
+ vlvgg ($R4,$d1,0);
+
+ veslg ($S1,$R1,2);
+ veslg ($S2,$R2,2);
+ veslg ($S3,$R3,2);
+ veslg ($S4,$R4,2);
+ vlr ($H0,$R0);
+ vlr ($H1,$R1);
+ vlr ($H2,$R2);
+ vlr ($H3,$R3);
+ vlr ($H4,$R4);
+ vag ($S1,$S1,$R1); # * 5
+ vag ($S2,$S2,$R2);
+ vag ($S3,$S3,$R3);
+ vag ($S4,$S4,$R4);
+
+ brasl ("%r14","__poly1305_mul"); # r^1:- * r^1:-
+
+ vpdi ($R0,$H0,$R0,0); # r^2:r^1
+ vpdi ($R1,$H1,$R1,0);
+ vpdi ($R2,$H2,$R2,0);
+ vpdi ($R3,$H3,$R3,0);
+ vpdi ($R4,$H4,$R4,0);
+ vpdi ($H0,$H0,$H0,0); # r^2:r^2
+ vpdi ($H1,$H1,$H1,0);
+ vpdi ($H2,$H2,$H2,0);
+ vpdi ($H3,$H3,$H3,0);
+ vpdi ($H4,$H4,$H4,0);
+ veslg ($S1,$R1,2);
+ veslg ($S2,$R2,2);
+ veslg ($S3,$R3,2);
+ veslg ($S4,$R4,2);
+ vag ($S1,$S1,$R1); # * 5
+ vag ($S2,$S2,$R2);
+ vag ($S3,$S3,$R3);
+ vag ($S4,$S4,$R4);
+
+	brasl	("%r14","__poly1305_mul");	# r^2:r^2 * r^2:r^1
+
+ vl ($I0,"0(%r1)"); # borrow $I0
+ vperm ($R0,$R0,$H0,$I0); # r^2:r^4:r^1:r^3
+ vperm ($R1,$R1,$H1,$I0);
+ vperm ($R2,$R2,$H2,$I0);
+ vperm ($R3,$R3,$H3,$I0);
+ vperm ($R4,$R4,$H4,$I0);
+ veslf ($S1,$R1,2);
+ veslf ($S2,$R2,2);
+ veslf ($S3,$R3,2);
+ veslf ($S4,$R4,2);
+ vaf ($S1,$S1,$R1); # * 5
+ vaf ($S2,$S2,$R2);
+ vaf ($S3,$S3,$R3);
+ vaf ($S4,$S4,$R4);
+
+ lg ($h0,"0($ctx)"); # load hash base 2^64
+ lg ($h1,"8($ctx)");
+ lg ($h2,"16($ctx)");
+
+ vzero ($H0);
+ vzero ($H1);
+ vzero ($H2);
+ vzero ($H3);
+ vzero ($H4);
+
+ risbg ($d0,$h0,38,0x80+63,38); # base 2^64 -> 2^26
+ srlg ($d1,$h0,52);
+ risbg ($h0,$h0,38,0x80+63,0);
+ vlvgg ($H0,$h0,0);
+ risbg ($d1,$h1,38,51,12);
+ vlvgg ($H1,$d0,0);
+ risbg ($d0,$h1,38,63,50);
+ vlvgg ($H2,$d1,0);
+ srlg ($d1,$h1,40);
+ vlvgg ($H3,$d0,0);
+ risbg ($d1,$h2,37,39,24);
+ vlvgg ($H4,$d1,0);
+
+ lhi ("%r0",1);
+ st ("%r0","24($ctx)"); # set is_base2_26
+
+ vstm ($R0,$S4,"48($ctx)"); # save key schedule base 2^26
+
+ vpdi ($R0,$R0,$R0,0); # broadcast r^2:r^4
+ vpdi ($R1,$R1,$R1,0);
+ vpdi ($S1,$S1,$S1,0);
+ vpdi ($R2,$R2,$R2,0);
+ vpdi ($S2,$S2,$S2,0);
+ vpdi ($R3,$R3,$R3,0);
+ vpdi ($S3,$S3,$S3,0);
+ vpdi ($R4,$R4,$R4,0);
+ vpdi ($S4,$S4,$S4,0);
+
+ j (".Loaded_hash");
+
+ALIGN (16);
+LABEL (".Lskip_init");
+ vllezf ($H0,"0($ctx)"); # load hash base 2^26
+ vllezf ($H1,"4($ctx)");
+ vllezf ($H2,"8($ctx)");
+ vllezf ($H3,"12($ctx)");
+ vllezf ($H4,"16($ctx)");
+
+ vlrepg ($R0,"0x30($ctx)"); # broadcast r^2:r^4
+ vlrepg ($R1,"0x40($ctx)");
+ vlrepg ($S1,"0x50($ctx)");
+ vlrepg ($R2,"0x60($ctx)");
+ vlrepg ($S2,"0x70($ctx)");
+ vlrepg ($R3,"0x80($ctx)");
+ vlrepg ($S3,"0x90($ctx)");
+ vlrepg ($R4,"0xa0($ctx)");
+ vlrepg ($S4,"0xb0($ctx)");
+
+LABEL (".Loaded_hash");
+ vzero ($I1);
+ vzero ($I3);
+
+ vlm ($T1,$T4,"0x00($inp)"); # load first input block
+ la ($inp,"0x40($inp)");
+ vgmg ($mask26,6,31);
+ vgmf ($I4,5,5); # padbit<<2
+
+ vperm ($I0,$T3,$T4,$bswaplo);
+ vperm ($I2,$T3,$T4,$bswapmi);
+ vperm ($T3,$T3,$T4,$bswaphi);
+
+ verimg ($I1,$I0,$mask26,6); # >>26
+ veslg ($I0,$I0,32);
+ veslg ($I2,$I2,28); # >>4
+ verimg ($I3,$T3,$mask26,18); # >>14
+ verimg ($I4,$T3,$mask26,58); # >>38
+ vn ($I0,$I0,$mask26);
+ vn ($I2,$I2,$mask26);
+ vesrlf ($I4,$I4,2); # >>2
+
+ vgmg ($mask26,38,63);
+ vperm ($T3,$T1,$T2,$bswaplo);
+ vperm ($T4,$T1,$T2,$bswaphi);
+ vperm ($T2,$T1,$T2,$bswapmi);
+
+ verimg ($I0,$T3,$mask26,0);
+ verimg ($I1,$T3,$mask26,38); # >>26
+ verimg ($I2,$T2,$mask26,60); # >>4
+ verimg ($I3,$T4,$mask26,50); # >>14
+ vesrlg ($T4,$T4,40);
+ vo ($I4,$I4,$T4);
+
+ srlg ("%r0",$len,6);
+&{$z? \&aghi:\&ahi} ("%r0",-1);
+
+ALIGN (16);
+LABEL (".Loop_vx");
+ vmlef ($ACC0,$I0,$R0);
+ vmlef ($ACC1,$I0,$R1);
+ vmlef ($ACC2,$I0,$R2);
+ vmlef ($ACC3,$I0,$R3);
+ vmlef ($ACC4,$I0,$R4);
+
+ vmalef ($ACC0,$I1,$S4,$ACC0);
+ vmalef ($ACC1,$I1,$R0,$ACC1);
+ vmalef ($ACC2,$I1,$R1,$ACC2);
+ vmalef ($ACC3,$I1,$R2,$ACC3);
+ vmalef ($ACC4,$I1,$R3,$ACC4);
+
+ vaf ($H2,$H2,$I2);
+ vaf ($H0,$H0,$I0);
+ vaf ($H3,$H3,$I3);
+ vaf ($H1,$H1,$I1);
+ vaf ($H4,$H4,$I4);
+
+ vmalef ($ACC0,$I2,$S3,$ACC0);
+ vmalef ($ACC1,$I2,$S4,$ACC1);
+ vmalef ($ACC2,$I2,$R0,$ACC2);
+ vmalef ($ACC3,$I2,$R1,$ACC3);
+ vmalef ($ACC4,$I2,$R2,$ACC4);
+
+ vlm ($T1,$T4,"0x00($inp)"); # load next input block
+ la ($inp,"0x40($inp)");
+ vgmg ($mask26,6,31);
+
+ vmalef ($ACC0,$I3,$S2,$ACC0);
+ vmalef ($ACC1,$I3,$S3,$ACC1);
+ vmalef ($ACC2,$I3,$S4,$ACC2);
+ vmalef ($ACC3,$I3,$R0,$ACC3);
+ vmalef ($ACC4,$I3,$R1,$ACC4);
+
+ vperm ($I0,$T3,$T4,$bswaplo);
+ vperm ($I2,$T3,$T4,$bswapmi);
+ vperm ($T3,$T3,$T4,$bswaphi);
+
+ vmalef ($ACC0,$I4,$S1,$ACC0);
+ vmalef ($ACC1,$I4,$S2,$ACC1);
+ vmalef ($ACC2,$I4,$S3,$ACC2);
+ vmalef ($ACC3,$I4,$S4,$ACC3);
+ vmalef ($ACC4,$I4,$R0,$ACC4);
+
+ verimg ($I1,$I0,$mask26,6); # >>26
+ veslg ($I0,$I0,32);
+ veslg ($I2,$I2,28); # >>4
+ verimg ($I3,$T3,$mask26,18); # >>14
+
+ vmalof ($ACC0,$H0,$R0,$ACC0);
+ vmalof ($ACC1,$H0,$R1,$ACC1);
+ vmalof ($ACC2,$H0,$R2,$ACC2);
+ vmalof ($ACC3,$H0,$R3,$ACC3);
+ vmalof ($ACC4,$H0,$R4,$ACC4);
+
+ vgmf ($I4,5,5); # padbit<<2
+ verimg ($I4,$T3,$mask26,58); # >>38
+ vn ($I0,$I0,$mask26);
+ vn ($I2,$I2,$mask26);
+ vesrlf ($I4,$I4,2); # >>2
+
+ vmalof ($ACC0,$H1,$S4,$ACC0);
+ vmalof ($ACC1,$H1,$R0,$ACC1);
+ vmalof ($ACC2,$H1,$R1,$ACC2);
+ vmalof ($ACC3,$H1,$R2,$ACC3);
+ vmalof ($ACC4,$H1,$R3,$ACC4);
+
+ vgmg ($mask26,38,63);
+ vperm ($T3,$T1,$T2,$bswaplo);
+ vperm ($T4,$T1,$T2,$bswaphi);
+ vperm ($T2,$T1,$T2,$bswapmi);
+
+ vmalof ($ACC0,$H2,$S3,$ACC0);
+ vmalof ($ACC1,$H2,$S4,$ACC1);
+ vmalof ($ACC2,$H2,$R0,$ACC2);
+ vmalof ($ACC3,$H2,$R1,$ACC3);
+ vmalof ($ACC4,$H2,$R2,$ACC4);
+
+ verimg ($I0,$T3,$mask26,0);
+ verimg ($I1,$T3,$mask26,38); # >>26
+ verimg ($I2,$T2,$mask26,60); # >>4
+
+ vmalof ($ACC0,$H3,$S2,$ACC0);
+ vmalof ($ACC1,$H3,$S3,$ACC1);
+ vmalof ($ACC2,$H3,$S4,$ACC2);
+ vmalof ($ACC3,$H3,$R0,$ACC3);
+ vmalof ($ACC4,$H3,$R1,$ACC4);
+
+ verimg ($I3,$T4,$mask26,50); # >>14
+ vesrlg ($T4,$T4,40);
+ vo ($I4,$I4,$T4);
+
+ vmalof ($ACC0,$H4,$S1,$ACC0);
+ vmalof ($ACC1,$H4,$S2,$ACC1);
+ vmalof ($ACC2,$H4,$S3,$ACC2);
+ vmalof ($ACC3,$H4,$S4,$ACC3);
+ vmalof ($ACC4,$H4,$R0,$ACC4);
+
+ ################################################################
+ # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ # and P. Schwabe
+
+ vesrlg ($H4,$ACC3,26);
+ vesrlg ($H1,$ACC0,26);
+ vn ($H3,$ACC3,$mask26);
+ vn ($H0,$ACC0,$mask26);
+ vag ($H4,$H4,$ACC4); # h3 -> h4
+ vag ($H1,$H1,$ACC1); # h0 -> h1
+
+ vesrlg ($ACC4,$H4,26);
+ vesrlg ($ACC1,$H1,26);
+ vn ($H4,$H4,$mask26);
+ vn ($H1,$H1,$mask26);
+ vag ($H0,$H0,$ACC4);
+ vag ($H2,$ACC2,$ACC1); # h1 -> h2
+
+ veslg ($ACC4,$ACC4,2); # <<2
+ vesrlg ($ACC2,$H2,26);
+ vn ($H2,$H2,$mask26);
+ vag ($H0,$H0,$ACC4); # h4 -> h0
+ vag ($H3,$H3,$ACC2); # h2 -> h3
+
+ vesrlg ($ACC0,$H0,26);
+ vesrlg ($ACC3,$H3,26);
+ vn ($H0,$H0,$mask26);
+ vn ($H3,$H3,$mask26);
+ vag ($H1,$H1,$ACC0); # h0 -> h1
+ vag ($H4,$H4,$ACC3); # h3 -> h4
+
+&{$z? \&brctg:\&brct} ("%r0",".Loop_vx");
+
+ vlm ($R0,$S4,"48($ctx)"); # load all powers
+
+ lghi ("%r0",0x30);
+&{$z? \&lcgr:\&lcr} ($len,$len);
+&{$z? \&ngr:\&nr} ($len,"%r0");
+&{$z? \&slgr:\&slr} ($inp,$len);
+
+LABEL (".Last");
+ vmlef ($ACC0,$I0,$R0);
+ vmlef ($ACC1,$I0,$R1);
+ vmlef ($ACC2,$I0,$R2);
+ vmlef ($ACC3,$I0,$R3);
+ vmlef ($ACC4,$I0,$R4);
+
+ vmalef ($ACC0,$I1,$S4,$ACC0);
+ vmalef ($ACC1,$I1,$R0,$ACC1);
+ vmalef ($ACC2,$I1,$R1,$ACC2);
+ vmalef ($ACC3,$I1,$R2,$ACC3);
+ vmalef ($ACC4,$I1,$R3,$ACC4);
+
+ vaf ($H0,$H0,$I0);
+ vaf ($H1,$H1,$I1);
+ vaf ($H2,$H2,$I2);
+ vaf ($H3,$H3,$I3);
+ vaf ($H4,$H4,$I4);
+
+ vmalef ($ACC0,$I2,$S3,$ACC0);
+ vmalef ($ACC1,$I2,$S4,$ACC1);
+ vmalef ($ACC2,$I2,$R0,$ACC2);
+ vmalef ($ACC3,$I2,$R1,$ACC3);
+ vmalef ($ACC4,$I2,$R2,$ACC4);
+
+ vmalef ($ACC0,$I3,$S2,$ACC0);
+ vmalef ($ACC1,$I3,$S3,$ACC1);
+ vmalef ($ACC2,$I3,$S4,$ACC2);
+ vmalef ($ACC3,$I3,$R0,$ACC3);
+ vmalef ($ACC4,$I3,$R1,$ACC4);
+
+ vmalef ($ACC0,$I4,$S1,$ACC0);
+ vmalef ($ACC1,$I4,$S2,$ACC1);
+ vmalef ($ACC2,$I4,$S3,$ACC2);
+ vmalef ($ACC3,$I4,$S4,$ACC3);
+ vmalef ($ACC4,$I4,$R0,$ACC4);
+
+ vmalof ($ACC0,$H0,$R0,$ACC0);
+ vmalof ($ACC1,$H0,$R1,$ACC1);
+ vmalof ($ACC2,$H0,$R2,$ACC2);
+ vmalof ($ACC3,$H0,$R3,$ACC3);
+ vmalof ($ACC4,$H0,$R4,$ACC4);
+
+ vmalof ($ACC0,$H1,$S4,$ACC0);
+ vmalof ($ACC1,$H1,$R0,$ACC1);
+ vmalof ($ACC2,$H1,$R1,$ACC2);
+ vmalof ($ACC3,$H1,$R2,$ACC3);
+ vmalof ($ACC4,$H1,$R3,$ACC4);
+
+ vmalof ($ACC0,$H2,$S3,$ACC0);
+ vmalof ($ACC1,$H2,$S4,$ACC1);
+ vmalof ($ACC2,$H2,$R0,$ACC2);
+ vmalof ($ACC3,$H2,$R1,$ACC3);
+ vmalof ($ACC4,$H2,$R2,$ACC4);
+
+ vmalof ($ACC0,$H3,$S2,$ACC0);
+ vmalof ($ACC1,$H3,$S3,$ACC1);
+ vmalof ($ACC2,$H3,$S4,$ACC2);
+ vmalof ($ACC3,$H3,$R0,$ACC3);
+ vmalof ($ACC4,$H3,$R1,$ACC4);
+
+ vmalof ($ACC0,$H4,$S1,$ACC0);
+ vmalof ($ACC1,$H4,$S2,$ACC1);
+ vmalof ($ACC2,$H4,$S3,$ACC2);
+ vmalof ($ACC3,$H4,$S4,$ACC3);
+ vmalof ($ACC4,$H4,$R0,$ACC4);
+
+ ################################################################
+ # horizontal addition
+
+ vzero ($H0);
+ vsumqg ($ACC0,$ACC0,$H0);
+ vsumqg ($ACC1,$ACC1,$H0);
+ vsumqg ($ACC2,$ACC2,$H0);
+ vsumqg ($ACC3,$ACC3,$H0);
+ vsumqg ($ACC4,$ACC4,$H0);
+
+ ################################################################
+ # lazy reduction
+
+ vesrlg ($H4,$ACC3,26);
+ vesrlg ($H1,$ACC0,26);
+ vn ($H3,$ACC3,$mask26);
+ vn ($H0,$ACC0,$mask26);
+ vag ($H4,$H4,$ACC4); # h3 -> h4
+ vag ($H1,$H1,$ACC1); # h0 -> h1
+
+ vesrlg ($ACC4,$H4,26);
+ vesrlg ($ACC1,$H1,26);
+ vn ($H4,$H4,$mask26);
+ vn ($H1,$H1,$mask26);
+ vag ($H0,$H0,$ACC4);
+ vag ($H2,$ACC2,$ACC1); # h1 -> h2
+
+ veslg ($ACC4,$ACC4,2); # <<2
+ vesrlg ($ACC2,$H2,26);
+ vn ($H2,$H2,$mask26);
+ vag ($H0,$H0,$ACC4); # h4 -> h0
+ vag ($H3,$H3,$ACC2); # h2 -> h3
+
+ vesrlg ($ACC0,$H0,26);
+ vesrlg ($ACC3,$H3,26);
+ vn ($H0,$H0,$mask26);
+ vn ($H3,$H3,$mask26);
+ vag ($H1,$H1,$ACC0); # h0 -> h1
+ vag ($H4,$H4,$ACC3); # h3 -> h4
+
+&{$z? \&clgfi:\&clfi} ($len,0);
+ je (".Ldone");
+
+ vlm ($T1,$T4,"0x00($inp)"); # load last partial block
+ vgmg ($mask26,6,31);
+ vgmf ($I4,5,5); # padbit<<2
+
+ vperm ($I0,$T3,$T4,$bswaplo);
+ vperm ($I2,$T3,$T4,$bswapmi);
+ vperm ($T3,$T3,$T4,$bswaphi);
+
+ vl ($ACC0,"0x30($len,%r1)"); # borrow $ACC0,1
+ vl ($ACC1,"0x60($len,%r1)");
+
+ verimg ($I1,$I0,$mask26,6); # >>26
+ veslg ($I0,$I0,32);
+ veslg ($I2,$I2,28); # >>4
+ verimg ($I3,$T3,$mask26,18); # >>14
+ verimg ($I4,$T3,$mask26,58); # >>38
+ vn ($I0,$I0,$mask26);
+ vn ($I2,$I2,$mask26);
+ vesrlf ($I4,$I4,2); # >>2
+
+ vgmg ($mask26,38,63);
+ vperm ($T3,$T1,$T2,$bswaplo);
+ vperm ($T4,$T1,$T2,$bswaphi);
+ vperm ($T2,$T1,$T2,$bswapmi);
+
+ verimg ($I0,$T3,$mask26,0);
+ verimg ($I1,$T3,$mask26,38); # >>26
+ verimg ($I2,$T2,$mask26,60); # >>4
+ verimg ($I3,$T4,$mask26,50); # >>14
+ vesrlg ($T4,$T4,40);
+ vo ($I4,$I4,$T4);
+
+ vperm ($H0,$H0,$H0,$ACC0); # move hash to right lane
+ vn ($I0,$I0,$ACC1); # mask redundant lane[s]
+ vperm ($H1,$H1,$H1,$ACC0);
+ vn ($I1,$I1,$ACC1);
+ vperm ($H2,$H2,$H2,$ACC0);
+ vn ($I2,$I2,$ACC1);
+ vperm ($H3,$H3,$H3,$ACC0);
+ vn ($I3,$I3,$ACC1);
+ vperm ($H4,$H4,$H4,$ACC0);
+ vn ($I4,$I4,$ACC1);
+
+ vaf ($I0,$I0,$H0); # accumulate hash
+ vzero ($H0); # wipe hash value
+ vaf ($I1,$I1,$H1);
+ vzero ($H1);
+ vaf ($I2,$I2,$H2);
+ vzero ($H2);
+ vaf ($I3,$I3,$H3);
+ vzero ($H3);
+ vaf ($I4,$I4,$H4);
+ vzero ($H4);
+
+&{$z? \&lghi:\&lhi} ($len,0);
+ j (".Last");
+	# I don't bother to tell apart cases when only one multiplication
+	# pass is sufficient, because I argue that mispredicted branch
+	# penalties are comparable to the overhead of an occasionally
+	# redundant multiplication pass...
+
+LABEL (".Ldone");
+ vstef ($H0,"0($ctx)",3); # store hash base 2^26
+ vstef ($H1,"4($ctx)",3);
+ vstef ($H2,"8($ctx)",3);
+ vstef ($H3,"12($ctx)",3);
+ vstef ($H4,"16($ctx)",3);
+
+if ($z) {
+ ld ("%f8","$stdframe+0*8($sp)");
+ ld ("%f9","$stdframe+1*8($sp)");
+ ld ("%f10","$stdframe+2*8($sp)");
+ ld ("%f11","$stdframe+3*8($sp)");
+ ld ("%f12","$stdframe+4*8($sp)");
+ ld ("%f13","$stdframe+5*8($sp)");
+ ld ("%f14","$stdframe+6*8($sp)");
+ ld ("%f15","$stdframe+7*8($sp)");
+&{$z? \&lmg:\&lm} ("%r10","%r15","$stdframe+8*8+10*$SIZE_T($sp)");
+} else {
+ ld ("%f4","$stdframe+16*$SIZE_T+2*8($sp)");
+ ld ("%f6","$stdframe+16*$SIZE_T+3*8($sp)");
+&{$z? \&lmg:\&lm} ("%r10","%r15","$stdframe+10*$SIZE_T($sp)");
+}
+ br ("%r14");
+SIZE ("__poly1305_blocks_vx",".-__poly1305_blocks_vx");
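
The vectorized loop above consumes 64 bytes, i.e. four blocks, per iteration by pairing the message with the key powers r^1..r^4 held in different lanes (the "r^2:r^4:r^1:r^3" arrangement). A standalone Math::BigInt check, not part of the patch, of the algebraic identity that makes this equivalent to the sequential Horner evaluation (the sample r and message blocks are made up):

#!/usr/bin/env perl
# Standalone check: Horner evaluation of four blocks equals
# accumulating them against r^4, r^3, r^2 and r^1, which is what lets
# independent vector lanes carry different key powers.
use strict;
use warnings;
use Math::BigInt;

my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);
my $r = Math::BigInt->new("0x0ffffffc0ffffffc0ffffffc0fffffff");  # sample r
my $h = Math::BigInt->new(12345);                                 # sample hash
my @m = map { Math::BigInt->new(1)->blsft(128)->badd($_) } 1 .. 4; # blocks+padbit

# sequential (Horner) form: h = ((h + m) * r) mod p, block by block
my $seq = $h->copy;
$seq->badd($_)->bmul($r)->bmod($p) for @m;

# parallel form: (h + m1)*r^4 + m2*r^3 + m3*r^2 + m4*r
my @rp  = map { $r->copy->bpow($_)->bmod($p) } 1 .. 4;            # r^1 .. r^4
my $par = $h->copy->badd($m[0])->bmul($rp[3]);
$par->badd($m[1]->copy->bmul($rp[2]));
$par->badd($m[2]->copy->bmul($rp[1]));
$par->badd($m[3]->copy->bmul($rp[0]));
$par->bmod($p);

print $seq->bcmp($par) == 0 ? "match\n" : "MISMATCH\n";
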
}
+
+################
+# static void poly1305_emit(void *ctx, unsigned char mac[16],
+# const u32 nonce[4])
{
my ($mac,$nonce)=($inp,$len);
-my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9));
-
-$code.=<<___;
-.globl poly1305_emit
-.type poly1305_emit,\@function
-.align 16
-poly1305_emit:
- stm${g} %r6,%r9,`6*$SIZE_T`($sp)
-
- lg $h0,0($ctx)
- lg $h1,8($ctx)
- lg $h2,16($ctx)
-
- lghi %r0,5
- lghi %r1,0
- lgr $d0,$h0
- lgr $d1,$h1
-
- algr $h0,%r0 # compare to modulus
- alcgr $h1,%r1
- alcgr $h2,%r1
-
- srlg $h2,$h2,2 # did it borrow/carry?
- slgr %r1,$h2 # 0-$h2>>2
- lg $h2,0($nonce) # load nonce
- lghi %r0,-1
- lg $ctx,8($nonce)
- xgr %r0,%r1 # ~%r1
-
- ngr $h0,%r1
- ngr $d0,%r0
- ngr $h1,%r1
- ngr $d1,%r0
- ogr $h0,$d0
- rllg $d0,$h2,32 # flip nonce words
- ogr $h1,$d1
- rllg $d1,$ctx,32
-
- algr $h0,$d0 # accumulate nonce
- alcgr $h1,$d1
-
- strvg $h0,0($mac) # write little-endian result
- strvg $h1,8($mac)
-
- lm${g} %r6,%r9,`6*$SIZE_T`($sp)
- br %r14
-.size poly1305_emit,.-poly1305_emit
-
-.string "Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-___
+my ($h0,$h1,$h2,$d0,$d1,$d2)=map("%r$_",(5..10));
+
+GLOBL ("poly1305_emit");
+TYPE ("poly1305_emit","\@function");
+ALIGN (16);
+LABEL ("poly1305_emit");
+LABEL (".Lpoly1305_emit");
+&{$z? \&stmg:\&stm} ("%r6","%r10","6*$SIZE_T($sp)");
+
+ lg ($d0,"0($ctx)");
+ lg ($d1,"8($ctx)");
+ lg ($d2,"16($ctx)");
+
+ llgfr ("%r0",$d0); # base 2^26 -> base 2^64
+ srlg ($h0,$d0,32);
+ llgfr ("%r1",$d1);
+ srlg ($h1,$d1,32);
+ srlg ($h2,$d2,32);
+
+ sllg ("%r0","%r0",26);
+ algr ($h0,"%r0");
+ sllg ("%r0",$h1,52);
+ srlg ($h1,$h1,12);
+ sllg ("%r1","%r1",14);
+ algr ($h0,"%r0");
+ alcgr ($h1,"%r1");
+ sllg ("%r0",$h2,40);
+ srlg ($h2,$h2,24);
+ lghi ("%r1",0);
+ algr ($h1,"%r0");
+ alcgr ($h2,"%r1");
+
+ llgf ("%r0","24($ctx)"); # is_base2_26
+ lcgr ("%r0","%r0");
+
+ xgr ($h0,$d0); # choose between radixes
+ xgr ($h1,$d1);
+ xgr ($h2,$d2);
+ ngr ($h0,"%r0");
+ ngr ($h1,"%r0");
+ ngr ($h2,"%r0");
+ xgr ($h0,$d0);
+ xgr ($h1,$d1);
+ xgr ($h2,$d2);
+
+ lghi ("%r0",5);
+ lgr ($d0,$h0);
+ lgr ($d1,$h1);
+
+ algr ($h0,"%r0"); # compare to modulus
+ alcgr ($h1,"%r1");
+ alcgr ($h2,"%r1");
+
+ srlg ($h2,$h2,2); # did it borrow/carry?
+ slgr ("%r1",$h2); # 0-$h2>>2
+ lg ($d2,"0($nonce)"); # load nonce
+ lg ($ctx,"8($nonce)");
+
+ xgr ($h0,$d0);
+ xgr ($h1,$d1);
+ ngr ($h0,"%r1");
+ ngr ($h1,"%r1");
+ xgr ($h0,$d0);
+ rllg ($d0,$d2,32); # flip nonce words
+ xgr ($h1,$d1);
+ rllg ($d1,$ctx,32);
+
+ algr ($h0,$d0); # accumulate nonce
+ alcgr ($h1,$d1);
+
+ strvg ($h0,"0($mac)"); # write little-endian result
+ strvg ($h1,"8($mac)");
+
+&{$z? \&lmg:\&lm} ("%r6","%r10","6*$SIZE_T($sp)");
+ br ("%r14");
+SIZE ("poly1305_emit",".-poly1305_emit");
}
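
poly1305_emit compares the accumulator against p = 2^130 - 5 by adding 5 and inspecting bit 130 of the sum ("did it borrow/carry?"), then selects the reduced value with a mask rather than a branch before adding the nonce and keeping the low 128 bits. A standalone Math::BigInt sketch of the same logic, not part of the patch, using a plain conditional where the assembly uses the mask:

#!/usr/bin/env perl
# Standalone sketch of the final comparison in poly1305_emit above.
use strict;
use warnings;
use Math::BigInt;

my $p      = Math::BigInt->new(2)->bpow(130)->bsub(5);
my $two128 = Math::BigInt->new(2)->bpow(128);

sub emit_low128 {
    my ($h, $nonce) = @_;                      # $h is the 130-bit accumulator
    my $plus5 = $h->copy->badd(5);
    my $ge_p  = $plus5->copy->brsft(130)->is_zero ? 0 : 1;  # bit 130 set?
    my $res   = $ge_p ? $plus5 : $h->copy;     # h+5 == h-p once bit 130 drops
    return $res->badd($nonce)->bmod($two128);  # fold in nonce, keep 128 bits
}

# sample nonce; h just below and at the modulus behave differently
my $n = Math::BigInt->new("0x0102030405060708090a0b0c0d0e0f10");
print emit_low128($p->copy->bsub(1), $n)->as_hex, "\n";   # h = p-1: kept
print emit_low128($p->copy,          $n)->as_hex, "\n";   # h = p: reduces to 0
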
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\b(srlg\s+)(%r[0-9]+\s*,)\s*([0-9]+)/$1$2$2$3/gm;
+################
+
+ALIGN (16);
+LABEL (".Lconst");
+LONG (0x04050607,0x14151617,0x0c0d0e0f,0x1c1d1e1f); # merge odd
+LONG (0x07060504,0x03020100,0x17161514,0x13121110); # byte swap masks
+LONG (0x0f0e0d0c,0x0b0a0908,0x1f1e1d1c,0x1b1a1918);
+LONG (0x00000000,0x09080706,0x00000000,0x19181716);
+
+LONG (0x00000000,0x00000000,0x00000000,0x0c0d0e0f); # magic tail masks
+LONG (0x0c0d0e0f,0x00000000,0x00000000,0x00000000);
+LONG (0x00000000,0x00000000,0x0c0d0e0f,0x00000000);
+
+LONG (0xffffffff,0x00000000,0xffffffff,0xffffffff);
+LONG (0xffffffff,0x00000000,0xffffffff,0x00000000);
+LONG (0x00000000,0x00000000,0xffffffff,0x00000000);
+
+STRING ("\"Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
-print $code;
-close STDOUT or die "error closing STDOUT: $!";
+PERLASM_END();
diff --git a/crypto/poly1305/asm/poly1305-sparcv9.pl b/crypto/poly1305/asm/poly1305-sparcv9.pl
index 997e0d8344c6..dc592a07acac 100755
--- a/crypto/poly1305/asm/poly1305-sparcv9.pl
+++ b/crypto/poly1305/asm/poly1305-sparcv9.pl
@@ -1,7 +1,7 @@
#! /usr/bin/env perl
-# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved.
#
-# Licensed under the OpenSSL license (the "License"). You may not use
+# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
@@ -41,19 +41,21 @@
# (***) Multi-process benchmark saturates at ~12.5x single-process
# result on 8-core processor, or ~21GBps per 2.85GHz socket.
-my $output = pop;
-open STDOUT,">$output";
+# $output is the last argument if it looks like a file (it has an extension)
+my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+
+open STDOUT,">$output" if $output;
my ($ctx,$inp,$len,$padbit,$shl,$shr) = map("%i$_",(0..5));
my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4) = map("%l$_",(0..7));
my ($h0,$h1,$h2,$h3, $t0,$t1,$t2) = map("%o$_",(0..5,7));
my ($d0,$d1,$d2,$d3) = map("%g$_",(1..4));
-my $output = pop;
-open STDOUT,">$stdout";
-
$code.=<<___;
-#include "sparc_arch.h"
+#ifndef __ASSEMBLER__
+# define __ASSEMBLER__ 1
+#endif
+#include "crypto/sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
diff --git a/crypto/poly1305/asm/poly1305-x86.pl b/crypto/poly1305/asm/poly1305-x86.pl
index 2ae16a230b66..c91d01fb3ba4 100755
--- a/crypto/poly1305/asm/poly1305-x86.pl
+++ b/crypto/poly1305/asm/poly1305-x86.pl
@@ -1,7 +1,7 @@
#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
-# Licensed under the OpenSSL license (the "License"). You may not use
+# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
@@ -47,8 +47,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
-$output=pop;
-open STDOUT,">$output";
+$output=pop and open STDOUT,">$output";
&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");
diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl
index 5f834d8faf2a..4cddca1c514c 100755
--- a/crypto/poly1305/asm/poly1305-x86_64.pl
+++ b/crypto/poly1305/asm/poly1305-x86_64.pl
@@ -1,7 +1,7 @@
#! /usr/bin/env perl
-# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2016-2023 The OpenSSL Project Authors. All Rights Reserved.
#
-# Licensed under the OpenSSL license (the "License"). You may not use
+# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
@@ -63,9 +63,10 @@
# (***) strangely enough performance seems to vary from core to core,
# listed result is best case;
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
@@ -94,7 +95,8 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0
$avx = ($2>=3.0) + ($2>3.0);
}
-open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+ or die "can't call $xlate: $!";
*STDOUT=*OUT;
my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
@@ -193,7 +195,7 @@ $code.=<<___ if ($avx>1);
bt \$`5+32`,%r9 # AVX2?
cmovc %rax,%r10
___
-$code.=<<___ if ($avx>3);
+$code.=<<___ if ($avx>3 && !$win64);
mov \$`(1<<31|1<<21|1<<16)`,%rax
shr \$32,%r9
and %rax,%r9
@@ -2722,7 +2724,7 @@ $code.=<<___;
.cfi_endproc
.size poly1305_blocks_avx512,.-poly1305_blocks_avx512
___
-if ($avx>3) {
+if ($avx>3 && !$win64) {
########################################################################
# VPMADD52 version using 2^44 radix.
#
@@ -2806,6 +2808,7 @@ $code.=<<___;
.align 32
poly1305_blocks_vpmadd52:
.cfi_startproc
+ endbranch
shr \$4,$len
jz .Lno_data_vpmadd52 # too short
@@ -3739,6 +3742,7 @@ $code.=<<___;
.align 32
poly1305_emit_base2_44:
.cfi_startproc
+ endbranch
mov 0($ctx),%r8 # load hash value
mov 8($ctx),%r9
mov 16($ctx),%r10