path: root/sys/dev/if_wg/module/crypto/zinc/chacha20
Diffstat (limited to 'sys/dev/if_wg/module/crypto/zinc/chacha20')
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm-glue.c     98
-rwxr-xr-x  sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm.pl       1227
-rwxr-xr-x  sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm64.pl     1163
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips-glue.c    27
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips.S        424
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64-glue.c 132
-rwxr-xr-x  sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64.pl    4106
-rw-r--r--  sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20.c             238
8 files changed, 0 insertions, 7415 deletions
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm-glue.c b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm-glue.c
deleted file mode 100644
index 41e2e79abb2b..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm-glue.c
+++ /dev/null
@@ -1,98 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#if defined(CONFIG_ZINC_ARCH_ARM)
-#include <asm/system_info.h>
-#include <asm/cputype.h>
-#endif
-
-asmlinkage void chacha20_arm(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-asmlinkage void hchacha20_arm(const u32 state[16], u32 out[8]);
-asmlinkage void chacha20_neon(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-
-static bool chacha20_use_neon __ro_after_init;
-static bool *const chacha20_nobs[] __initconst = { &chacha20_use_neon };
-static void __init chacha20_fpu_init(void)
-{
-#if defined(CONFIG_ZINC_ARCH_ARM64)
- chacha20_use_neon = cpu_have_named_feature(ASIMD);
-#elif defined(CONFIG_ZINC_ARCH_ARM)
- switch (read_cpuid_part()) {
- case ARM_CPU_PART_CORTEX_A7:
- case ARM_CPU_PART_CORTEX_A5:
- /* The Cortex-A7 and Cortex-A5 do not perform well with the NEON
- * implementation but do incredibly with the scalar one and use
- * less power.
- */
- break;
- default:
- chacha20_use_neon = elf_hwcap & HWCAP_NEON;
- }
-#endif
-}
-
-static inline bool chacha20_arch(struct chacha20_ctx *ctx, u8 *dst,
- const u8 *src, size_t len,
- simd_context_t *simd_context)
-{
- /* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(PAGE_SIZE < CHACHA20_BLOCK_SIZE ||
- PAGE_SIZE % CHACHA20_BLOCK_SIZE);
-
- for (;;) {
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && chacha20_use_neon &&
- len >= CHACHA20_BLOCK_SIZE * 3 && simd_use(simd_context)) {
- const size_t bytes = min_t(size_t, len, PAGE_SIZE);
-
- chacha20_neon(dst, src, bytes, ctx->key, ctx->counter);
- ctx->counter[0] += (bytes + 63) / 64;
- len -= bytes;
- if (!len)
- break;
- dst += bytes;
- src += bytes;
- simd_relax(simd_context);
- } else {
- chacha20_arm(dst, src, len, ctx->key, ctx->counter);
- ctx->counter[0] += (len + 63) / 64;
- break;
- }
- }
-
- return true;
-}
-
-static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS],
- const u8 nonce[HCHACHA20_NONCE_SIZE],
- const u8 key[HCHACHA20_KEY_SIZE],
- simd_context_t *simd_context)
-{
- if (IS_ENABLED(CONFIG_ZINC_ARCH_ARM)) {
- u32 x[] = { CHACHA20_CONSTANT_EXPA,
- CHACHA20_CONSTANT_ND_3,
- CHACHA20_CONSTANT_2_BY,
- CHACHA20_CONSTANT_TE_K,
- get_unaligned_le32(key + 0),
- get_unaligned_le32(key + 4),
- get_unaligned_le32(key + 8),
- get_unaligned_le32(key + 12),
- get_unaligned_le32(key + 16),
- get_unaligned_le32(key + 20),
- get_unaligned_le32(key + 24),
- get_unaligned_le32(key + 28),
- get_unaligned_le32(nonce + 0),
- get_unaligned_le32(nonce + 4),
- get_unaligned_le32(nonce + 8),
- get_unaligned_le32(nonce + 12)
- };
- hchacha20_arm(x, derived_key);
- return true;
- }
- return false;
-}
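
For reference, the state that the deleted glue code above assembles for hchacha20_arm() is the standard 16-word ChaCha layout: the four "expand 32-byte k" constant words (the same values as the .Lsigma table in the assembly below), then the eight key words, then the four nonce words, all read little-endian. A minimal portable-C sketch of that layout; the helper names here are illustrative, not taken from the deleted files:

#include <stdint.h>

/* Little-endian 32-bit load, equivalent to get_unaligned_le32(). */
static uint32_t load_le32(const uint8_t *p)
{
	return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
	       ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

/* 16-word HChaCha20 input: sigma constants, 256-bit key, 128-bit nonce. */
static void hchacha20_init_state(uint32_t x[16], const uint8_t key[32],
				 const uint8_t nonce[16])
{
	int i;

	x[0] = 0x61707865;	/* "expa" */
	x[1] = 0x3320646e;	/* "nd 3" */
	x[2] = 0x79622d32;	/* "2-by" */
	x[3] = 0x6b206574;	/* "te k" */
	for (i = 0; i < 8; i++)
		x[4 + i] = load_le32(key + 4 * i);
	for (i = 0; i < 4; i++)
		x[12 + i] = load_le32(nonce + 4 * i);
}
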
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm.pl b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm.pl
deleted file mode 100755
index 6785383ab7bb..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm.pl
+++ /dev/null
@@ -1,1227 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-#
-# This code is taken from the OpenSSL project but the author, Andy Polyakov,
-# has relicensed it under the licenses specified in the SPDX header above.
-# The original headers, including the original license headers, are
-# included below for completeness.
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# December 2014
-#
-# ChaCha20 for ARMv4.
-#
-# September 2018
-#
-# Improve scalar performance per Eric Biggers' suggestion to eliminate
-# separate rotates. This requires b[0..3] and d[0..3] to be maintained
-# pre-rotated, hence odd twists prior inner loop and when accumulating
-# key material. Since amount of instructions is reduced as result, even
-# NEON performance is improved somewhat, most notably by ~9% on low-end
-# Cortex-A5/A7. Full unroll was shown to provide even better scalar
-# performance on Cortex-A5/A7, naturally at the cost of manyfold size
-# increase. We let it be. Oversized code works in benchmarks, but is not
-# necessarily optimal in real life, when it's likely to be out-of-cache
-# upon entry and evict significant part of cache upon completion.
-#
-# Performance in cycles per byte out of large buffer.
-#
-# IALU/gcc-4.4 1xNEON 3xNEON+1xIALU
-#
-# Cortex-A5 14.2(*)/+160% 21.8 12.9(**)
-# Cortex-A8 10.2(*)/+190% 13.9 6.10
-# Cortex-A9 10.8(*)/+150% 14.3 6.50
-# Cortex-A15 11.0/+40% 16.0 4.90
-# Snapdragon S4 13.9(***)/+90% 13.6 4.90
-#
-# (*) most "favourable" result for aligned data on little-endian
-# processor, result for misaligned data is 10-15% lower;
-# (**) pure 4xNEON [with "vertical" layout] was shown to provide ~8%
-# better performance on Cortex-A5/A7, but not on others;
-# (***) it's 17% slower than original, trade-off is considered
-# acceptable, because of improvement on others, specifically
-# +36% on Cortex-A5/A7 and +20% on Cortex-A9;
-
-$flavour = shift;
-if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
-
-if ($flavour && $flavour ne "void") {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open STDOUT,"| \"$^X\" $xlate $flavour $output";
-} else {
- open STDOUT,">$output";
-}
-
-sub AUTOLOAD() # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
- my $arg = pop;
- $arg = "#$arg" if ($arg*1 eq $arg);
- $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
-}
-
-my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
-my @t=map("r$_",(8..11));
-
-sub ROUND {
-my ($a0,$b0,$c0,$d0)=@_;
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my $odd = $d0&1;
-my ($xc,$xc_) = (@t[0..1]);
-my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
-my @ret;
-
- # Consider order in which variables are addressed by their
- # index:
- #
- # a b c d
- #
- # 0 4 8 12 < even round
- # 1 5 9 13
- # 2 6 10 14
- # 3 7 11 15
- # 0 5 10 15 < odd round
- # 1 6 11 12
- # 2 7 8 13
- # 3 4 9 14
- #
- # 'a', 'b' are permanently allocated in registers, @x[0..7],
- # while 'c's and pair of 'd's are maintained in memory. If
- # you observe 'c' column, you'll notice that pair of 'c's is
- # invariant between rounds. This means that we have to reload
- # them once per round, in the middle. This is why you'll see
- # bunch of 'c' stores and loads in the middle, but none in
- # the beginning or end. If you observe 'd' column, you'll
- # notice that 15 and 13 are reused in next pair of rounds.
- # This is why these two are chosen for offloading to memory,
- # to make loads count more.
- push @ret,(
- "&add (@x[$a0],@x[$a0],@x[$b0],'ror#13')",
- "&add (@x[$a1],@x[$a1],@x[$b1],'ror#13')",
- "&eor ($xd,@x[$a0],$xd,'ror#24')",
- "&eor ($xd_,@x[$a1],$xd_,'ror#24')",
-
- "&add ($xc,$xc,$xd,'ror#16')",
- "&add ($xc_,$xc_,$xd_,'ror#16')",
- "&eor (@x[$b0],$xc, @x[$b0],'ror#13')",
- "&eor (@x[$b1],$xc_,@x[$b1],'ror#13')",
-
- "&add (@x[$a0],@x[$a0],@x[$b0],'ror#20')",
- "&add (@x[$a1],@x[$a1],@x[$b1],'ror#20')",
- "&eor ($xd,@x[$a0],$xd,'ror#16')",
- "&eor ($xd_,@x[$a1],$xd_,'ror#16')" );
- push @ret,(
- "&str ($xd,'[sp,#4*(16+$d0)]')" ) if ($odd);
- push @ret,(
- "&add ($xc,$xc,$xd,'ror#24')" );
- push @ret,(
- "&ldr ($xd,'[sp,#4*(16+$d2)]')" ) if ($odd);
- push @ret,(
- "&str ($xd_,'[sp,#4*(16+$d1)]')" ) if (!$odd);
- push @ret,(
- "&add ($xc_,$xc_,$xd_,'ror#24')" );
- push @ret,(
- "&ldr ($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd);
- push @ret,(
- "&str ($xc,'[sp,#4*(16+$c0)]')",
- "&eor (@x[$b0],@x[$b0],$xc,'ror#12')",
- "&str ($xc_,'[sp,#4*(16+$c1)]')",
- "&eor (@x[$b1],@x[$b1],$xc_,'ror#12')" );
-
- $xd=@x[$d2] if (!$odd);
- $xd_=@x[$d3] if ($odd);
- push @ret,(
- "&ldr ($xc,'[sp,#4*(16+$c2)]')",
- "&add (@x[$a2],@x[$a2],@x[$b2],'ror#13')",
- "&ldr ($xc_,'[sp,#4*(16+$c3)]')",
- "&add (@x[$a3],@x[$a3],@x[$b3],'ror#13')",
- "&eor ($xd,@x[$a2],$xd,'ror#24')",
- "&eor ($xd_,@x[$a3],$xd_,'ror#24')",
-
- "&add ($xc,$xc,$xd,'ror#16')",
- "&add ($xc_,$xc_,$xd_,'ror#16')",
- "&eor (@x[$b2],$xc, @x[$b2],'ror#13')",
- "&eor (@x[$b3],$xc_,@x[$b3],'ror#13')",
-
- "&add (@x[$a2],@x[$a2],@x[$b2],'ror#20')",
- "&add (@x[$a3],@x[$a3],@x[$b3],'ror#20')",
- "&eor ($xd,@x[$a2],$xd,'ror#16')",
- "&eor ($xd_,@x[$a3],$xd_,'ror#16')",
-
- "&add ($xc,$xc,$xd,'ror#24')",
- "&add ($xc_,$xc_,$xd_,'ror#24')",
- "&eor (@x[$b2],@x[$b2],$xc,'ror#12')",
- "&eor (@x[$b3],@x[$b3],$xc_,'ror#12')" );
-
- @ret;
-}
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
-# define ChaCha20_ctr32 chacha20_arm_cryptogams
-# define ChaCha20_neon chacha20_neon
-#endif
-
-.text
-#if defined(__thumb2__) || defined(__clang__)
-.syntax unified
-# define ldrhsb ldrbhs
-#endif
-#if defined(__thumb2__)
-.thumb
-#else
-.code 32
-#endif
-
-.align 5
-.Lsigma:
-.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
-.Lone:
-.long 1,0,0,0
-.Lrot8:
-.long 0x02010003,0x06050407
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-.LChaCha20_ctr32
-#else
-.word -1
-#endif
-
-.globl ChaCha20_ctr32
-.type ChaCha20_ctr32,%function
-.align 5
-ChaCha20_ctr32:
-.LChaCha20_ctr32:
- ldr r12,[sp,#0] @ pull pointer to counter and nonce
- stmdb sp!,{r0-r2,r4-r11,lr}
-#if __ARM_ARCH__<7 && !defined(__thumb2__)
- sub r14,pc,#16 @ ChaCha20_ctr32
-#else
- adr r14,.LChaCha20_ctr32
-#endif
- cmp r2,#0 @ len==0?
-#ifdef __thumb2__
- itt eq
-#endif
- addeq sp,sp,#4*3
- beq .Lno_data
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- cmp r2,#192 @ test len
- bls .Lshort
- ldr r4,[r14,#-24]
- ldr r4,[r14,r4]
-# ifdef __APPLE__
- ldr r4,[r4]
-# endif
- tst r4,#ARMV7_NEON
- bne .LChaCha20_neon
-.Lshort:
-#endif
- ldmia r12,{r4-r7} @ load counter and nonce
- sub sp,sp,#4*(16) @ off-load area
- sub r14,r14,#64 @ .Lsigma
- stmdb sp!,{r4-r7} @ copy counter and nonce
- ldmia r3,{r4-r11} @ load key
- ldmia r14,{r0-r3} @ load sigma
- stmdb sp!,{r4-r11} @ copy key
- stmdb sp!,{r0-r3} @ copy sigma
- str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
- str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
- b .Loop_outer_enter
-
-.align 4
-.Loop_outer:
- ldmia sp,{r0-r9} @ load key material
- str @t[3],[sp,#4*(32+2)] @ save len
- str r12, [sp,#4*(32+1)] @ save inp
- str r14, [sp,#4*(32+0)] @ save out
-.Loop_outer_enter:
- ldr @t[3], [sp,#4*(15)]
- mov @x[4],@x[4],ror#19 @ twist b[0..3]
- ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
- mov @x[5],@x[5],ror#19
- ldr @t[2], [sp,#4*(13)]
- mov @x[6],@x[6],ror#19
- ldr @x[14],[sp,#4*(14)]
- mov @x[7],@x[7],ror#19
- mov @t[3],@t[3],ror#8 @ twist d[0..3]
- mov @x[12],@x[12],ror#8
- mov @t[2],@t[2],ror#8
- mov @x[14],@x[14],ror#8
- str @t[3], [sp,#4*(16+15)]
- mov @t[3],#10
- b .Loop
-
-.align 4
-.Loop:
- subs @t[3],@t[3],#1
-___
- foreach (&ROUND(0, 4, 8,12)) { eval; }
- foreach (&ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- bne .Loop
-
- ldr @t[3],[sp,#4*(32+2)] @ load len
-
- str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
- str @t[1], [sp,#4*(16+9)]
- str @x[12],[sp,#4*(16+12)]
- str @t[2], [sp,#4*(16+13)]
- str @x[14],[sp,#4*(16+14)]
-
- @ at this point we have first half of 512-bit result in
- @ @x[0-7] and second half at sp+4*(16+8)
-
- cmp @t[3],#64 @ done yet?
-#ifdef __thumb2__
- itete lo
-#endif
- addlo r12,sp,#4*(0) @ shortcut or ...
- ldrhs r12,[sp,#4*(32+1)] @ ... load inp
- addlo r14,sp,#4*(0) @ shortcut or ...
- ldrhs r14,[sp,#4*(32+0)] @ ... load out
-
- ldr @t[0],[sp,#4*(0)] @ load key material
- ldr @t[1],[sp,#4*(1)]
-
-#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
-# if __ARM_ARCH__<7
- orr @t[2],r12,r14
- tst @t[2],#3 @ are input and output aligned?
- ldr @t[2],[sp,#4*(2)]
- bne .Lunaligned
- cmp @t[3],#64 @ restore flags
-# else
- ldr @t[2],[sp,#4*(2)]
-# endif
- ldr @t[3],[sp,#4*(3)]
-
- add @x[0],@x[0],@t[0] @ accumulate key material
- add @x[1],@x[1],@t[1]
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[0],[r12],#16 @ load input
- ldrhs @t[1],[r12,#-12]
-
- add @x[2],@x[2],@t[2]
- add @x[3],@x[3],@t[3]
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[2],[r12,#-8]
- ldrhs @t[3],[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
- rev @x[0],@x[0]
- rev @x[1],@x[1]
- rev @x[2],@x[2]
- rev @x[3],@x[3]
-# endif
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[0],@x[0],@t[0] @ xor with input
- eorhs @x[1],@x[1],@t[1]
- add @t[0],sp,#4*(4)
- str @x[0],[r14],#16 @ store output
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[2],@x[2],@t[2]
- eorhs @x[3],@x[3],@t[3]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
- str @x[1],[r14,#-12]
- str @x[2],[r14,#-8]
- str @x[3],[r14,#-4]
-
- add @x[4],@t[0],@x[4],ror#13 @ accumulate key material
- add @x[5],@t[1],@x[5],ror#13
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[0],[r12],#16 @ load input
- ldrhs @t[1],[r12,#-12]
- add @x[6],@t[2],@x[6],ror#13
- add @x[7],@t[3],@x[7],ror#13
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[2],[r12,#-8]
- ldrhs @t[3],[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
- rev @x[4],@x[4]
- rev @x[5],@x[5]
- rev @x[6],@x[6]
- rev @x[7],@x[7]
-# endif
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[4],@x[4],@t[0]
- eorhs @x[5],@x[5],@t[1]
- add @t[0],sp,#4*(8)
- str @x[4],[r14],#16 @ store output
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[6],@x[6],@t[2]
- eorhs @x[7],@x[7],@t[3]
- str @x[5],[r14,#-12]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
- str @x[6],[r14,#-8]
- add @x[0],sp,#4*(16+8)
- str @x[7],[r14,#-4]
-
- ldmia @x[0],{@x[0]-@x[7]} @ load second half
-
- add @x[0],@x[0],@t[0] @ accumulate key material
- add @x[1],@x[1],@t[1]
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[0],[r12],#16 @ load input
- ldrhs @t[1],[r12,#-12]
-# ifdef __thumb2__
- itt hi
-# endif
- strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
- strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
- add @x[2],@x[2],@t[2]
- add @x[3],@x[3],@t[3]
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[2],[r12,#-8]
- ldrhs @t[3],[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
- rev @x[0],@x[0]
- rev @x[1],@x[1]
- rev @x[2],@x[2]
- rev @x[3],@x[3]
-# endif
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[0],@x[0],@t[0]
- eorhs @x[1],@x[1],@t[1]
- add @t[0],sp,#4*(12)
- str @x[0],[r14],#16 @ store output
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[2],@x[2],@t[2]
- eorhs @x[3],@x[3],@t[3]
- str @x[1],[r14,#-12]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
- str @x[2],[r14,#-8]
- str @x[3],[r14,#-4]
-
- add @x[4],@t[0],@x[4],ror#24 @ accumulate key material
- add @x[5],@t[1],@x[5],ror#24
-# ifdef __thumb2__
- itt hi
-# endif
- addhi @t[0],@t[0],#1 @ next counter value
- strhi @t[0],[sp,#4*(12)] @ save next counter value
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[0],[r12],#16 @ load input
- ldrhs @t[1],[r12,#-12]
- add @x[6],@t[2],@x[6],ror#24
- add @x[7],@t[3],@x[7],ror#24
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[2],[r12,#-8]
- ldrhs @t[3],[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
- rev @x[4],@x[4]
- rev @x[5],@x[5]
- rev @x[6],@x[6]
- rev @x[7],@x[7]
-# endif
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[4],@x[4],@t[0]
- eorhs @x[5],@x[5],@t[1]
-# ifdef __thumb2__
- it ne
-# endif
- ldrne @t[0],[sp,#4*(32+2)] @ re-load len
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[6],@x[6],@t[2]
- eorhs @x[7],@x[7],@t[3]
- str @x[4],[r14],#16 @ store output
- str @x[5],[r14,#-12]
-# ifdef __thumb2__
- it hs
-# endif
- subhs @t[3],@t[0],#64 @ len-=64
- str @x[6],[r14,#-8]
- str @x[7],[r14,#-4]
- bhi .Loop_outer
-
- beq .Ldone
-# if __ARM_ARCH__<7
- b .Ltail
-
-.align 4
-.Lunaligned: @ unaligned endian-neutral path
- cmp @t[3],#64 @ restore flags
-# endif
-#endif
-#if __ARM_ARCH__<7
- ldr @t[3],[sp,#4*(3)]
-___
-for ($i=0;$i<16;$i+=4) {
-my $j=$i&0x7;
-my $twist="";
-if ($i==4) { $twist = ",ror#13"; }
-elsif ($i==12) { $twist = ",ror#24"; }
-
-$code.=<<___ if ($i==4);
- add @x[0],sp,#4*(16+8)
-___
-$code.=<<___ if ($i==8);
- ldmia @x[0],{@x[0]-@x[7]} @ load second half
-# ifdef __thumb2__
- itt hi
-# endif
- strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]"
- strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]"
-___
-$code.=<<___;
- add @x[$j+0],@t[0],@x[$j+0]$twist @ accumulate key material
-___
-$code.=<<___ if ($i==12);
-# ifdef __thumb2__
- itt hi
-# endif
- addhi @t[0],@t[0],#1 @ next counter value
- strhi @t[0],[sp,#4*(12)] @ save next counter value
-___
-$code.=<<___;
- add @x[$j+1],@t[1],@x[$j+1]$twist
- add @x[$j+2],@t[2],@x[$j+2]$twist
-# ifdef __thumb2__
- itete lo
-# endif
- eorlo @t[0],@t[0],@t[0] @ zero or ...
- ldrhsb @t[0],[r12],#16 @ ... load input
- eorlo @t[1],@t[1],@t[1]
- ldrhsb @t[1],[r12,#-12]
-
- add @x[$j+3],@t[3],@x[$j+3]$twist
-# ifdef __thumb2__
- itete lo
-# endif
- eorlo @t[2],@t[2],@t[2]
- ldrhsb @t[2],[r12,#-8]
- eorlo @t[3],@t[3],@t[3]
- ldrhsb @t[3],[r12,#-4]
-
- eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero)
- eor @x[$j+1],@t[1],@x[$j+1]
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb @t[0],[r12,#-15] @ load more input
- ldrhsb @t[1],[r12,#-11]
- eor @x[$j+2],@t[2],@x[$j+2]
- strb @x[$j+0],[r14],#16 @ store output
- eor @x[$j+3],@t[3],@x[$j+3]
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb @t[2],[r12,#-7]
- ldrhsb @t[3],[r12,#-3]
- strb @x[$j+1],[r14,#-12]
- eor @x[$j+0],@t[0],@x[$j+0],lsr#8
- strb @x[$j+2],[r14,#-8]
- eor @x[$j+1],@t[1],@x[$j+1],lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb @t[0],[r12,#-14] @ load more input
- ldrhsb @t[1],[r12,#-10]
- strb @x[$j+3],[r14,#-4]
- eor @x[$j+2],@t[2],@x[$j+2],lsr#8
- strb @x[$j+0],[r14,#-15]
- eor @x[$j+3],@t[3],@x[$j+3],lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb @t[2],[r12,#-6]
- ldrhsb @t[3],[r12,#-2]
- strb @x[$j+1],[r14,#-11]
- eor @x[$j+0],@t[0],@x[$j+0],lsr#8
- strb @x[$j+2],[r14,#-7]
- eor @x[$j+1],@t[1],@x[$j+1],lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb @t[0],[r12,#-13] @ load more input
- ldrhsb @t[1],[r12,#-9]
- strb @x[$j+3],[r14,#-3]
- eor @x[$j+2],@t[2],@x[$j+2],lsr#8
- strb @x[$j+0],[r14,#-14]
- eor @x[$j+3],@t[3],@x[$j+3],lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb @t[2],[r12,#-5]
- ldrhsb @t[3],[r12,#-1]
- strb @x[$j+1],[r14,#-10]
- strb @x[$j+2],[r14,#-6]
- eor @x[$j+0],@t[0],@x[$j+0],lsr#8
- strb @x[$j+3],[r14,#-2]
- eor @x[$j+1],@t[1],@x[$j+1],lsr#8
- strb @x[$j+0],[r14,#-13]
- eor @x[$j+2],@t[2],@x[$j+2],lsr#8
- strb @x[$j+1],[r14,#-9]
- eor @x[$j+3],@t[3],@x[$j+3],lsr#8
- strb @x[$j+2],[r14,#-5]
- strb @x[$j+3],[r14,#-1]
-___
-$code.=<<___ if ($i<12);
- add @t[0],sp,#4*(4+$i)
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
-___
-}
-$code.=<<___;
-# ifdef __thumb2__
- it ne
-# endif
- ldrne @t[0],[sp,#4*(32+2)] @ re-load len
-# ifdef __thumb2__
- it hs
-# endif
- subhs @t[3],@t[0],#64 @ len-=64
- bhi .Loop_outer
-
- beq .Ldone
-#endif
-
-.Ltail:
- ldr r12,[sp,#4*(32+1)] @ load inp
- add @t[1],sp,#4*(0)
- ldr r14,[sp,#4*(32+0)] @ load out
-
-.Loop_tail:
- ldrb @t[2],[@t[1]],#1 @ read buffer on stack
- ldrb @t[3],[r12],#1 @ read input
- subs @t[0],@t[0],#1
- eor @t[3],@t[3],@t[2]
- strb @t[3],[r14],#1 @ store output
- bne .Loop_tail
-
-.Ldone:
- add sp,sp,#4*(32+3)
-.Lno_data:
-#if __ARM_ARCH__>=5
- ldmia sp!,{r4-r11,pc}
-#else
- ldmia sp!,{r4-r12,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- .long 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
-.size ChaCha20_ctr32,.-ChaCha20_ctr32
-___
-
-{{{
-my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
- map("q$_",(0..15));
-
-# This can replace vshr-by-24+vsli-by-8. It gives ~3% improvement on
-# Cortex-A5/A7, but hurts Cortex-A9 by 5% and Snapdragon S4 by 14%!
-sub vperm()
-{ my ($dst,$src,$tbl) = @_;
- $code .= " vtbl.8 $dst#lo,{$src#lo},$tbl#lo\n";
- $code .= " vtbl.8 $dst#hi,{$src#hi},$tbl#lo\n";
-}
-
-sub NEONROUND {
-my $odd = pop;
-my ($a,$b,$c,$d,$t)=@_;
-
- (
- "&vadd_i32 ($a,$a,$b)",
- "&veor ($d,$d,$a)",
- "&vrev32_16 ($d,$d)", # vrot ($d,16)
-
- "&vadd_i32 ($c,$c,$d)",
- "&veor ($t,$b,$c)",
- "&vshr_u32 ($b,$t,20)",
- "&vsli_32 ($b,$t,12)",
-
- "&vadd_i32 ($a,$a,$b)",
- "&veor ($t,$d,$a)",
- "&vshr_u32 ($d,$t,24)",
- "&vsli_32 ($d,$t,8)",
- #"&vperm ($d,$t,$t3)",
-
- "&vadd_i32 ($c,$c,$d)",
- "&veor ($t,$b,$c)",
- "&vshr_u32 ($b,$t,25)",
- "&vsli_32 ($b,$t,7)",
-
- "&vext_8 ($a,$a,$a,$odd?4:12)",
- "&vext_8 ($d,$d,$d,8)",
- "&vext_8 ($c,$c,$c,$odd?12:4)"
- );
-}
-
-$code.=<<___;
-#if (defined(__KERNEL__) && defined(CONFIG_KERNEL_MODE_NEON)) || (!defined(__KERNEL__) && __ARM_MAX_ARCH__>=7)
-.arch armv7-a
-.fpu neon
-
-# ifdef __KERNEL__
-.globl ChaCha20_neon
-@ For optimal performance it's appropriate for caller to enforce
-@ minimum input length, 193 bytes is suggested.
-# endif
-.type ChaCha20_neon,%function
-.align 5
-ChaCha20_neon:
- ldr r12,[sp,#0] @ pull pointer to counter and nonce
- stmdb sp!,{r0-r2,r4-r11,lr}
-.LChaCha20_neon:
- adr r14,.Lsigma
- vstmdb sp!,{d8-d15} @ ABI spec says so
- stmdb sp!,{r0-r3}
-
- vld1.32 {$b0-$c0},[r3] @ load key
- ldmia r3,{r4-r11} @ load key
-
- sub sp,sp,#4*(16+16)
- vld1.32 {$d0},[r12] @ load counter and nonce
- add r12,sp,#4*8
- ldmia r14,{r0-r3} @ load sigma
- vld1.32 {$a0},[r14]! @ load sigma
- vld1.32 {$t0},[r14]! @ one
- @ vld1.32 {$t3#lo},[r14] @ rot8
- vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce
- vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key
-
- str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
- str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
- vshl.i32 $t1#lo,$t0#lo,#1 @ two
- vstr $t0#lo,[sp,#4*(16+0)]
- vshl.i32 $t2#lo,$t0#lo,#2 @ four
- vstr $t1#lo,[sp,#4*(16+2)]
- vmov $a1,$a0
- vstr $t2#lo,[sp,#4*(16+4)]
- vmov $a2,$a0
- @ vstr $t3#lo,[sp,#4*(16+6)]
- vmov $b1,$b0
- vmov $b2,$b0
- b .Loop_neon_enter
-
-.align 4
-.Loop_neon_outer:
- ldmia sp,{r0-r9} @ load key material
- cmp @t[3],#64*2 @ if len<=64*2
- bls .Lbreak_neon @ switch to integer-only
- @ vldr $t3#lo,[sp,#4*(16+6)] @ rot8
- vmov $a1,$a0
- str @t[3],[sp,#4*(32+2)] @ save len
- vmov $a2,$a0
- str r12, [sp,#4*(32+1)] @ save inp
- vmov $b1,$b0
- str r14, [sp,#4*(32+0)] @ save out
- vmov $b2,$b0
-.Loop_neon_enter:
- ldr @t[3], [sp,#4*(15)]
- mov @x[4],@x[4],ror#19 @ twist b[0..3]
- vadd.i32 $d1,$d0,$t0 @ counter+1
- ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
- mov @x[5],@x[5],ror#19
- vmov $c1,$c0
- ldr @t[2], [sp,#4*(13)]
- mov @x[6],@x[6],ror#19
- vmov $c2,$c0
- ldr @x[14],[sp,#4*(14)]
- mov @x[7],@x[7],ror#19
- vadd.i32 $d2,$d1,$t0 @ counter+2
- add @x[12],@x[12],#3 @ counter+3
- mov @t[3],@t[3],ror#8 @ twist d[0..3]
- mov @x[12],@x[12],ror#8
- mov @t[2],@t[2],ror#8
- mov @x[14],@x[14],ror#8
- str @t[3], [sp,#4*(16+15)]
- mov @t[3],#10
- b .Loop_neon
-
-.align 4
-.Loop_neon:
- subs @t[3],@t[3],#1
-___
- my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
- my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
- my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
- my @thread3=&ROUND(0,4,8,12);
-
- foreach (@thread0) {
- eval; eval(shift(@thread3));
- eval(shift(@thread1)); eval(shift(@thread3));
- eval(shift(@thread2)); eval(shift(@thread3));
- }
-
- @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
- @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
- @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
- @thread3=&ROUND(0,5,10,15);
-
- foreach (@thread0) {
- eval; eval(shift(@thread3));
- eval(shift(@thread1)); eval(shift(@thread3));
- eval(shift(@thread2)); eval(shift(@thread3));
- }
-$code.=<<___;
- bne .Loop_neon
-
- add @t[3],sp,#32
- vld1.32 {$t0-$t1},[sp] @ load key material
- vld1.32 {$t2-$t3},[@t[3]]
-
- ldr @t[3],[sp,#4*(32+2)] @ load len
-
- str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
- str @t[1], [sp,#4*(16+9)]
- str @x[12],[sp,#4*(16+12)]
- str @t[2], [sp,#4*(16+13)]
- str @x[14],[sp,#4*(16+14)]
-
- @ at this point we have first half of 512-bit result in
- @ @x[0-7] and second half at sp+4*(16+8)
-
- ldr r12,[sp,#4*(32+1)] @ load inp
- ldr r14,[sp,#4*(32+0)] @ load out
-
- vadd.i32 $a0,$a0,$t0 @ accumulate key material
- vadd.i32 $a1,$a1,$t0
- vadd.i32 $a2,$a2,$t0
- vldr $t0#lo,[sp,#4*(16+0)] @ one
-
- vadd.i32 $b0,$b0,$t1
- vadd.i32 $b1,$b1,$t1
- vadd.i32 $b2,$b2,$t1
- vldr $t1#lo,[sp,#4*(16+2)] @ two
-
- vadd.i32 $c0,$c0,$t2
- vadd.i32 $c1,$c1,$t2
- vadd.i32 $c2,$c2,$t2
- vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1
- vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2
-
- vadd.i32 $d0,$d0,$t3
- vadd.i32 $d1,$d1,$t3
- vadd.i32 $d2,$d2,$t3
-
- cmp @t[3],#64*4
- blo .Ltail_neon
-
- vld1.8 {$t0-$t1},[r12]! @ load input
- mov @t[3],sp
- vld1.8 {$t2-$t3},[r12]!
- veor $a0,$a0,$t0 @ xor with input
- veor $b0,$b0,$t1
- vld1.8 {$t0-$t1},[r12]!
- veor $c0,$c0,$t2
- veor $d0,$d0,$t3
- vld1.8 {$t2-$t3},[r12]!
-
- veor $a1,$a1,$t0
- vst1.8 {$a0-$b0},[r14]! @ store output
- veor $b1,$b1,$t1
- vld1.8 {$t0-$t1},[r12]!
- veor $c1,$c1,$t2
- vst1.8 {$c0-$d0},[r14]!
- veor $d1,$d1,$t3
- vld1.8 {$t2-$t3},[r12]!
-
- veor $a2,$a2,$t0
- vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration
- veor $t0#hi,$t0#hi,$t0#hi
- vldr $t0#lo,[sp,#4*(16+4)] @ four
- veor $b2,$b2,$t1
- vld1.32 {$c0-$d0},[@t[3]]
- veor $c2,$c2,$t2
- vst1.8 {$a1-$b1},[r14]!
- veor $d2,$d2,$t3
- vst1.8 {$c1-$d1},[r14]!
-
- vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value
- vldr $t0#lo,[sp,#4*(16+0)] @ one
-
- ldmia sp,{@t[0]-@t[3]} @ load key material
- add @x[0],@x[0],@t[0] @ accumulate key material
- ldr @t[0],[r12],#16 @ load input
- vst1.8 {$a2-$b2},[r14]!
- add @x[1],@x[1],@t[1]
- ldr @t[1],[r12,#-12]
- vst1.8 {$c2-$d2},[r14]!
- add @x[2],@x[2],@t[2]
- ldr @t[2],[r12,#-8]
- add @x[3],@x[3],@t[3]
- ldr @t[3],[r12,#-4]
-# ifdef __ARMEB__
- rev @x[0],@x[0]
- rev @x[1],@x[1]
- rev @x[2],@x[2]
- rev @x[3],@x[3]
-# endif
- eor @x[0],@x[0],@t[0] @ xor with input
- add @t[0],sp,#4*(4)
- eor @x[1],@x[1],@t[1]
- str @x[0],[r14],#16 @ store output
- eor @x[2],@x[2],@t[2]
- str @x[1],[r14,#-12]
- eor @x[3],@x[3],@t[3]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
- str @x[2],[r14,#-8]
- str @x[3],[r14,#-4]
-
- add @x[4],@t[0],@x[4],ror#13 @ accumulate key material
- ldr @t[0],[r12],#16 @ load input
- add @x[5],@t[1],@x[5],ror#13
- ldr @t[1],[r12,#-12]
- add @x[6],@t[2],@x[6],ror#13
- ldr @t[2],[r12,#-8]
- add @x[7],@t[3],@x[7],ror#13
- ldr @t[3],[r12,#-4]
-# ifdef __ARMEB__
- rev @x[4],@x[4]
- rev @x[5],@x[5]
- rev @x[6],@x[6]
- rev @x[7],@x[7]
-# endif
- eor @x[4],@x[4],@t[0]
- add @t[0],sp,#4*(8)
- eor @x[5],@x[5],@t[1]
- str @x[4],[r14],#16 @ store output
- eor @x[6],@x[6],@t[2]
- str @x[5],[r14,#-12]
- eor @x[7],@x[7],@t[3]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
- str @x[6],[r14,#-8]
- add @x[0],sp,#4*(16+8)
- str @x[7],[r14,#-4]
-
- ldmia @x[0],{@x[0]-@x[7]} @ load second half
-
- add @x[0],@x[0],@t[0] @ accumulate key material
- ldr @t[0],[r12],#16 @ load input
- add @x[1],@x[1],@t[1]
- ldr @t[1],[r12,#-12]
-# ifdef __thumb2__
- it hi
-# endif
- strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
- add @x[2],@x[2],@t[2]
- ldr @t[2],[r12,#-8]
-# ifdef __thumb2__
- it hi
-# endif
- strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
- add @x[3],@x[3],@t[3]
- ldr @t[3],[r12,#-4]
-# ifdef __ARMEB__
- rev @x[0],@x[0]
- rev @x[1],@x[1]
- rev @x[2],@x[2]
- rev @x[3],@x[3]
-# endif
- eor @x[0],@x[0],@t[0]
- add @t[0],sp,#4*(12)
- eor @x[1],@x[1],@t[1]
- str @x[0],[r14],#16 @ store output
- eor @x[2],@x[2],@t[2]
- str @x[1],[r14,#-12]
- eor @x[3],@x[3],@t[3]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
- str @x[2],[r14,#-8]
- str @x[3],[r14,#-4]
-
- add @x[4],@t[0],@x[4],ror#24 @ accumulate key material
- add @t[0],@t[0],#4 @ next counter value
- add @x[5],@t[1],@x[5],ror#24
- str @t[0],[sp,#4*(12)] @ save next counter value
- ldr @t[0],[r12],#16 @ load input
- add @x[6],@t[2],@x[6],ror#24
- add @x[4],@x[4],#3 @ counter+3
- ldr @t[1],[r12,#-12]
- add @x[7],@t[3],@x[7],ror#24
- ldr @t[2],[r12,#-8]
- ldr @t[3],[r12,#-4]
-# ifdef __ARMEB__
- rev @x[4],@x[4]
- rev @x[5],@x[5]
- rev @x[6],@x[6]
- rev @x[7],@x[7]
-# endif
- eor @x[4],@x[4],@t[0]
-# ifdef __thumb2__
- it hi
-# endif
- ldrhi @t[0],[sp,#4*(32+2)] @ re-load len
- eor @x[5],@x[5],@t[1]
- eor @x[6],@x[6],@t[2]
- str @x[4],[r14],#16 @ store output
- eor @x[7],@x[7],@t[3]
- str @x[5],[r14,#-12]
- sub @t[3],@t[0],#64*4 @ len-=64*4
- str @x[6],[r14,#-8]
- str @x[7],[r14,#-4]
- bhi .Loop_neon_outer
-
- b .Ldone_neon
-
-.align 4
-.Lbreak_neon:
- @ harmonize NEON and integer-only stack frames: load data
- @ from NEON frame, but save to integer-only one; distance
- @ between the two is 4*(32+4+16-32)=4*(20).
-
- str @t[3], [sp,#4*(20+32+2)] @ save len
- add @t[3],sp,#4*(32+4)
- str r12, [sp,#4*(20+32+1)] @ save inp
- str r14, [sp,#4*(20+32+0)] @ save out
-
- ldr @x[12],[sp,#4*(16+10)]
- ldr @x[14],[sp,#4*(16+11)]
- vldmia @t[3],{d8-d15} @ fulfill ABI requirement
- str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]"
- str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]"
-
- ldr @t[3], [sp,#4*(15)]
- mov @x[4],@x[4],ror#19 @ twist b[0..3]
- ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
- mov @x[5],@x[5],ror#19
- ldr @t[2], [sp,#4*(13)]
- mov @x[6],@x[6],ror#19
- ldr @x[14],[sp,#4*(14)]
- mov @x[7],@x[7],ror#19
- mov @t[3],@t[3],ror#8 @ twist d[0..3]
- mov @x[12],@x[12],ror#8
- mov @t[2],@t[2],ror#8
- mov @x[14],@x[14],ror#8
- str @t[3], [sp,#4*(20+16+15)]
- add @t[3],sp,#4*(20)
- vst1.32 {$a0-$b0},[@t[3]]! @ copy key
- add sp,sp,#4*(20) @ switch frame
- vst1.32 {$c0-$d0},[@t[3]]
- mov @t[3],#10
- b .Loop @ go integer-only
-
-.align 4
-.Ltail_neon:
- cmp @t[3],#64*3
- bhs .L192_or_more_neon
- cmp @t[3],#64*2
- bhs .L128_or_more_neon
- cmp @t[3],#64*1
- bhs .L64_or_more_neon
-
- add @t[0],sp,#4*(8)
- vst1.8 {$a0-$b0},[sp]
- add @t[2],sp,#4*(0)
- vst1.8 {$c0-$d0},[@t[0]]
- b .Loop_tail_neon
-
-.align 4
-.L64_or_more_neon:
- vld1.8 {$t0-$t1},[r12]!
- vld1.8 {$t2-$t3},[r12]!
- veor $a0,$a0,$t0
- veor $b0,$b0,$t1
- veor $c0,$c0,$t2
- veor $d0,$d0,$t3
- vst1.8 {$a0-$b0},[r14]!
- vst1.8 {$c0-$d0},[r14]!
-
- beq .Ldone_neon
-
- add @t[0],sp,#4*(8)
- vst1.8 {$a1-$b1},[sp]
- add @t[2],sp,#4*(0)
- vst1.8 {$c1-$d1},[@t[0]]
- sub @t[3],@t[3],#64*1 @ len-=64*1
- b .Loop_tail_neon
-
-.align 4
-.L128_or_more_neon:
- vld1.8 {$t0-$t1},[r12]!
- vld1.8 {$t2-$t3},[r12]!
- veor $a0,$a0,$t0
- veor $b0,$b0,$t1
- vld1.8 {$t0-$t1},[r12]!
- veor $c0,$c0,$t2
- veor $d0,$d0,$t3
- vld1.8 {$t2-$t3},[r12]!
-
- veor $a1,$a1,$t0
- veor $b1,$b1,$t1
- vst1.8 {$a0-$b0},[r14]!
- veor $c1,$c1,$t2
- vst1.8 {$c0-$d0},[r14]!
- veor $d1,$d1,$t3
- vst1.8 {$a1-$b1},[r14]!
- vst1.8 {$c1-$d1},[r14]!
-
- beq .Ldone_neon
-
- add @t[0],sp,#4*(8)
- vst1.8 {$a2-$b2},[sp]
- add @t[2],sp,#4*(0)
- vst1.8 {$c2-$d2},[@t[0]]
- sub @t[3],@t[3],#64*2 @ len-=64*2
- b .Loop_tail_neon
-
-.align 4
-.L192_or_more_neon:
- vld1.8 {$t0-$t1},[r12]!
- vld1.8 {$t2-$t3},[r12]!
- veor $a0,$a0,$t0
- veor $b0,$b0,$t1
- vld1.8 {$t0-$t1},[r12]!
- veor $c0,$c0,$t2
- veor $d0,$d0,$t3
- vld1.8 {$t2-$t3},[r12]!
-
- veor $a1,$a1,$t0
- veor $b1,$b1,$t1
- vld1.8 {$t0-$t1},[r12]!
- veor $c1,$c1,$t2
- vst1.8 {$a0-$b0},[r14]!
- veor $d1,$d1,$t3
- vld1.8 {$t2-$t3},[r12]!
-
- veor $a2,$a2,$t0
- vst1.8 {$c0-$d0},[r14]!
- veor $b2,$b2,$t1
- vst1.8 {$a1-$b1},[r14]!
- veor $c2,$c2,$t2
- vst1.8 {$c1-$d1},[r14]!
- veor $d2,$d2,$t3
- vst1.8 {$a2-$b2},[r14]!
- vst1.8 {$c2-$d2},[r14]!
-
- beq .Ldone_neon
-
- ldmia sp,{@t[0]-@t[3]} @ load key material
- add @x[0],@x[0],@t[0] @ accumulate key material
- add @t[0],sp,#4*(4)
- add @x[1],@x[1],@t[1]
- add @x[2],@x[2],@t[2]
- add @x[3],@x[3],@t[3]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
-
- add @x[4],@t[0],@x[4],ror#13 @ accumulate key material
- add @t[0],sp,#4*(8)
- add @x[5],@t[1],@x[5],ror#13
- add @x[6],@t[2],@x[6],ror#13
- add @x[7],@t[3],@x[7],ror#13
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
-# ifdef __ARMEB__
- rev @x[0],@x[0]
- rev @x[1],@x[1]
- rev @x[2],@x[2]
- rev @x[3],@x[3]
- rev @x[4],@x[4]
- rev @x[5],@x[5]
- rev @x[6],@x[6]
- rev @x[7],@x[7]
-# endif
- stmia sp,{@x[0]-@x[7]}
- add @x[0],sp,#4*(16+8)
-
- ldmia @x[0],{@x[0]-@x[7]} @ load second half
-
- add @x[0],@x[0],@t[0] @ accumulate key material
- add @t[0],sp,#4*(12)
- add @x[1],@x[1],@t[1]
- add @x[2],@x[2],@t[2]
- add @x[3],@x[3],@t[3]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
-
- add @x[4],@t[0],@x[4],ror#24 @ accumulate key material
- add @t[0],sp,#4*(8)
- add @x[5],@t[1],@x[5],ror#24
- add @x[4],@x[4],#3 @ counter+3
- add @x[6],@t[2],@x[6],ror#24
- add @x[7],@t[3],@x[7],ror#24
- ldr @t[3],[sp,#4*(32+2)] @ re-load len
-# ifdef __ARMEB__
- rev @x[0],@x[0]
- rev @x[1],@x[1]
- rev @x[2],@x[2]
- rev @x[3],@x[3]
- rev @x[4],@x[4]
- rev @x[5],@x[5]
- rev @x[6],@x[6]
- rev @x[7],@x[7]
-# endif
- stmia @t[0],{@x[0]-@x[7]}
- add @t[2],sp,#4*(0)
- sub @t[3],@t[3],#64*3 @ len-=64*3
-
-.Loop_tail_neon:
- ldrb @t[0],[@t[2]],#1 @ read buffer on stack
- ldrb @t[1],[r12],#1 @ read input
- subs @t[3],@t[3],#1
- eor @t[0],@t[0],@t[1]
- strb @t[0],[r14],#1 @ store output
- bne .Loop_tail_neon
-
-.Ldone_neon:
- add sp,sp,#4*(32+4)
- vldmia sp,{d8-d15}
- add sp,sp,#4*(16+3)
- ldmia sp!,{r4-r11,pc}
-.size ChaCha20_neon,.-ChaCha20_neon
-# ifndef __KERNEL__
-.comm OPENSSL_armcap_P,4,4
-# endif
-#endif
-___
-}}}
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/@/ and !/^$/);
- print;
-}
-close SELF;
-
-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/geo;
-
- s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
-
- print $_,"\n";
-}
-close STDOUT;
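
The ROUND generator in the file above and the one in the arm64 file below both emit the standard ChaCha20 quarter-round; the scalar ARM path simply keeps b[0..3] and d[0..3] pre-rotated so the rotates fold into the shifted-operand form of add/eor instead of costing separate instructions, with each left-rotate by n appearing as a ror by 32-n. For comparison, a plain-C reference quarter-round (a sketch, not taken from the deleted sources):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

/* One ChaCha20 quarter-round over the 16-word state. */
static void chacha20_qr(uint32_t x[16], int a, int b, int c, int d)
{
	x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32(x[d], 16);
	x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32(x[b], 12);
	x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32(x[d], 8);
	x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32(x[b], 7);
}
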
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm64.pl b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm64.pl
deleted file mode 100755
index ac14a9924165..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm64.pl
+++ /dev/null
@@ -1,1163 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-#
-# This code is taken from the OpenSSL project but the author, Andy Polyakov,
-# has relicensed it under the licenses specified in the SPDX header above.
-# The original headers, including the original license headers, are
-# included below for completeness.
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# June 2015
-#
-# ChaCha20 for ARMv8.
-#
-# Performance in cycles per byte out of large buffer.
-#
-# IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU(*)
-#
-# Apple A7 5.50/+49% 3.33 1.70
-# Cortex-A53 8.40/+80% 4.72 4.72(**)
-# Cortex-A57 8.06/+43% 4.90 4.43(***)
-# Denver 4.50/+82% 2.63 2.67(**)
-# X-Gene 9.50/+46% 8.82 8.89(**)
-# Mongoose 8.00/+44% 3.64 3.25(***)
-# Kryo 8.17/+50% 4.83 4.65(***)
-#
-# (*) since no non-Apple processor exhibits significantly better
-# performance, the code path is #ifdef __APPLE__-ed;
-# (**) it's expected that doubling interleave factor doesn't help
-# all processors, only those with higher NEON latency and
-# higher instruction issue rate;
-# (***) expected improvement was actually higher;
-
-$flavour=shift;
-if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
-
-if ($flavour && $flavour ne "void") {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open STDOUT,"| \"$^X\" $xlate $flavour $output";
-} else {
- open STDOUT,">$output";
-}
-
-sub AUTOLOAD() # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
- my $arg = pop;
- $arg = "#$arg" if ($arg*1 eq $arg);
- $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
-}
-
-my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));
-
-my @x=map("x$_",(5..17,19..21));
-my @d=map("x$_",(22..28,30));
-
-sub ROUND {
-my ($a0,$b0,$c0,$d0)=@_;
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-
- (
- "&add_32 (@x[$a0],@x[$a0],@x[$b0])",
- "&add_32 (@x[$a1],@x[$a1],@x[$b1])",
- "&add_32 (@x[$a2],@x[$a2],@x[$b2])",
- "&add_32 (@x[$a3],@x[$a3],@x[$b3])",
- "&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
- "&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
- "&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
- "&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
- "&ror_32 (@x[$d0],@x[$d0],16)",
- "&ror_32 (@x[$d1],@x[$d1],16)",
- "&ror_32 (@x[$d2],@x[$d2],16)",
- "&ror_32 (@x[$d3],@x[$d3],16)",
-
- "&add_32 (@x[$c0],@x[$c0],@x[$d0])",
- "&add_32 (@x[$c1],@x[$c1],@x[$d1])",
- "&add_32 (@x[$c2],@x[$c2],@x[$d2])",
- "&add_32 (@x[$c3],@x[$c3],@x[$d3])",
- "&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
- "&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
- "&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
- "&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
- "&ror_32 (@x[$b0],@x[$b0],20)",
- "&ror_32 (@x[$b1],@x[$b1],20)",
- "&ror_32 (@x[$b2],@x[$b2],20)",
- "&ror_32 (@x[$b3],@x[$b3],20)",
-
- "&add_32 (@x[$a0],@x[$a0],@x[$b0])",
- "&add_32 (@x[$a1],@x[$a1],@x[$b1])",
- "&add_32 (@x[$a2],@x[$a2],@x[$b2])",
- "&add_32 (@x[$a3],@x[$a3],@x[$b3])",
- "&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
- "&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
- "&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
- "&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
- "&ror_32 (@x[$d0],@x[$d0],24)",
- "&ror_32 (@x[$d1],@x[$d1],24)",
- "&ror_32 (@x[$d2],@x[$d2],24)",
- "&ror_32 (@x[$d3],@x[$d3],24)",
-
- "&add_32 (@x[$c0],@x[$c0],@x[$d0])",
- "&add_32 (@x[$c1],@x[$c1],@x[$d1])",
- "&add_32 (@x[$c2],@x[$c2],@x[$d2])",
- "&add_32 (@x[$c3],@x[$c3],@x[$d3])",
- "&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
- "&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
- "&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
- "&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
- "&ror_32 (@x[$b0],@x[$b0],25)",
- "&ror_32 (@x[$b1],@x[$b1],25)",
- "&ror_32 (@x[$b2],@x[$b2],25)",
- "&ror_32 (@x[$b3],@x[$b3],25)"
- );
-}
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-.extern OPENSSL_armcap_P
-#else
-# define ChaCha20_ctr32 chacha20_arm
-# define ChaCha20_neon chacha20_neon
-#endif
-
-.text
-
-.align 5
-.Lsigma:
-.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
-.Lone:
-.long 1,0,0,0
-#ifndef __KERNEL__
-.LOPENSSL_armcap_P:
-# ifdef __ILP32__
-.long OPENSSL_armcap_P-.
-# else
-.quad OPENSSL_armcap_P-.
-# endif
-#endif
-
-.globl ChaCha20_ctr32
-.type ChaCha20_ctr32,%function
-.align 5
-ChaCha20_ctr32:
- cbz $len,.Labort
-#ifndef __KERNEL__
- adr @x[0],.LOPENSSL_armcap_P
- cmp $len,#192
- b.lo .Lshort
-# ifdef __ILP32__
- ldrsw @x[1],[@x[0]]
-# else
- ldr @x[1],[@x[0]]
-# endif
- ldr w17,[@x[1],@x[0]]
- tst w17,#ARMV7_NEON
- b.ne ChaCha20_neon
-
-.Lshort:
-#endif
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
-
- adr @x[0],.Lsigma
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#64
-
- ldp @d[0],@d[1],[@x[0]] // load sigma
- ldp @d[2],@d[3],[$key] // load key
- ldp @d[4],@d[5],[$key,#16]
- ldp @d[6],@d[7],[$ctr] // load counter
-#ifdef __AARCH64EB__
- ror @d[2],@d[2],#32
- ror @d[3],@d[3],#32
- ror @d[4],@d[4],#32
- ror @d[5],@d[5],#32
- ror @d[6],@d[6],#32
- ror @d[7],@d[7],#32
-#endif
-
-.Loop_outer:
- mov.32 @x[0],@d[0] // unpack key block
- lsr @x[1],@d[0],#32
- mov.32 @x[2],@d[1]
- lsr @x[3],@d[1],#32
- mov.32 @x[4],@d[2]
- lsr @x[5],@d[2],#32
- mov.32 @x[6],@d[3]
- lsr @x[7],@d[3],#32
- mov.32 @x[8],@d[4]
- lsr @x[9],@d[4],#32
- mov.32 @x[10],@d[5]
- lsr @x[11],@d[5],#32
- mov.32 @x[12],@d[6]
- lsr @x[13],@d[6],#32
- mov.32 @x[14],@d[7]
- lsr @x[15],@d[7],#32
-
- mov $ctr,#10
- subs $len,$len,#64
-.Loop:
- sub $ctr,$ctr,#1
-___
- foreach (&ROUND(0, 4, 8,12)) { eval; }
- foreach (&ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- cbnz $ctr,.Loop
-
- add.32 @x[0],@x[0],@d[0] // accumulate key block
- add @x[1],@x[1],@d[0],lsr#32
- add.32 @x[2],@x[2],@d[1]
- add @x[3],@x[3],@d[1],lsr#32
- add.32 @x[4],@x[4],@d[2]
- add @x[5],@x[5],@d[2],lsr#32
- add.32 @x[6],@x[6],@d[3]
- add @x[7],@x[7],@d[3],lsr#32
- add.32 @x[8],@x[8],@d[4]
- add @x[9],@x[9],@d[4],lsr#32
- add.32 @x[10],@x[10],@d[5]
- add @x[11],@x[11],@d[5],lsr#32
- add.32 @x[12],@x[12],@d[6]
- add @x[13],@x[13],@d[6],lsr#32
- add.32 @x[14],@x[14],@d[7]
- add @x[15],@x[15],@d[7],lsr#32
-
- b.lo .Ltail
-
- add @x[0],@x[0],@x[1],lsl#32 // pack
- add @x[2],@x[2],@x[3],lsl#32
- ldp @x[1],@x[3],[$inp,#0] // load input
- add @x[4],@x[4],@x[5],lsl#32
- add @x[6],@x[6],@x[7],lsl#32
- ldp @x[5],@x[7],[$inp,#16]
- add @x[8],@x[8],@x[9],lsl#32
- add @x[10],@x[10],@x[11],lsl#32
- ldp @x[9],@x[11],[$inp,#32]
- add @x[12],@x[12],@x[13],lsl#32
- add @x[14],@x[14],@x[15],lsl#32
- ldp @x[13],@x[15],[$inp,#48]
- add $inp,$inp,#64
-#ifdef __AARCH64EB__
- rev @x[0],@x[0]
- rev @x[2],@x[2]
- rev @x[4],@x[4]
- rev @x[6],@x[6]
- rev @x[8],@x[8]
- rev @x[10],@x[10]
- rev @x[12],@x[12]
- rev @x[14],@x[14]
-#endif
- eor @x[0],@x[0],@x[1]
- eor @x[2],@x[2],@x[3]
- eor @x[4],@x[4],@x[5]
- eor @x[6],@x[6],@x[7]
- eor @x[8],@x[8],@x[9]
- eor @x[10],@x[10],@x[11]
- eor @x[12],@x[12],@x[13]
- eor @x[14],@x[14],@x[15]
-
- stp @x[0],@x[2],[$out,#0] // store output
- add @d[6],@d[6],#1 // increment counter
- stp @x[4],@x[6],[$out,#16]
- stp @x[8],@x[10],[$out,#32]
- stp @x[12],@x[14],[$out,#48]
- add $out,$out,#64
-
- b.hi .Loop_outer
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
-.Labort:
- ret
-
-.align 4
-.Ltail:
- add $len,$len,#64
-.Less_than_64:
- sub $out,$out,#1
- add $inp,$inp,$len
- add $out,$out,$len
- add $ctr,sp,$len
- neg $len,$len
-
- add @x[0],@x[0],@x[1],lsl#32 // pack
- add @x[2],@x[2],@x[3],lsl#32
- add @x[4],@x[4],@x[5],lsl#32
- add @x[6],@x[6],@x[7],lsl#32
- add @x[8],@x[8],@x[9],lsl#32
- add @x[10],@x[10],@x[11],lsl#32
- add @x[12],@x[12],@x[13],lsl#32
- add @x[14],@x[14],@x[15],lsl#32
-#ifdef __AARCH64EB__
- rev @x[0],@x[0]
- rev @x[2],@x[2]
- rev @x[4],@x[4]
- rev @x[6],@x[6]
- rev @x[8],@x[8]
- rev @x[10],@x[10]
- rev @x[12],@x[12]
- rev @x[14],@x[14]
-#endif
- stp @x[0],@x[2],[sp,#0]
- stp @x[4],@x[6],[sp,#16]
- stp @x[8],@x[10],[sp,#32]
- stp @x[12],@x[14],[sp,#48]
-
-.Loop_tail:
- ldrb w10,[$inp,$len]
- ldrb w11,[$ctr,$len]
- add $len,$len,#1
- eor w10,w10,w11
- strb w10,[$out,$len]
- cbnz $len,.Loop_tail
-
- stp xzr,xzr,[sp,#0]
- stp xzr,xzr,[sp,#16]
- stp xzr,xzr,[sp,#32]
- stp xzr,xzr,[sp,#48]
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- ret
-.size ChaCha20_ctr32,.-ChaCha20_ctr32
-___
-
-{{{
-my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
- map("v$_.4s",(0..7,16..23));
-my (@K)=map("v$_.4s",(24..30));
-my $ONE="v31.4s";
-
-sub NEONROUND {
-my $odd = pop;
-my ($a,$b,$c,$d,$t)=@_;
-
- (
- "&add ('$a','$a','$b')",
- "&eor ('$d','$d','$a')",
- "&rev32_16 ('$d','$d')", # vrot ($d,16)
-
- "&add ('$c','$c','$d')",
- "&eor ('$t','$b','$c')",
- "&ushr ('$b','$t',20)",
- "&sli ('$b','$t',12)",
-
- "&add ('$a','$a','$b')",
- "&eor ('$t','$d','$a')",
- "&ushr ('$d','$t',24)",
- "&sli ('$d','$t',8)",
-
- "&add ('$c','$c','$d')",
- "&eor ('$t','$b','$c')",
- "&ushr ('$b','$t',25)",
- "&sli ('$b','$t',7)",
-
- "&ext ('$a','$a','$a',$odd?4:12)",
- "&ext ('$d','$d','$d',8)",
- "&ext ('$c','$c','$c',$odd?12:4)"
- );
-}
-
-$code.=<<___;
-#if !defined(__KERNEL__) || defined(CONFIG_KERNEL_MODE_NEON)
-#ifdef __KERNEL__
-.globl ChaCha20_neon
-.type ChaCha20_neon,%function
-#endif
-.type ChaCha20_neon,%function
-.align 5
-ChaCha20_neon:
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
-
- adr @x[0],.Lsigma
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
-#ifdef __APPLE__
- cmp $len,#512
- b.hs .L512_or_more_neon
-#endif
-
- sub sp,sp,#64
-
- ldp @d[0],@d[1],[@x[0]] // load sigma
- ld1 {@K[0]},[@x[0]],#16
- ldp @d[2],@d[3],[$key] // load key
- ldp @d[4],@d[5],[$key,#16]
- ld1 {@K[1],@K[2]},[$key]
- ldp @d[6],@d[7],[$ctr] // load counter
- ld1 {@K[3]},[$ctr]
- ld1 {$ONE},[@x[0]]
-#ifdef __AARCH64EB__
- rev64 @K[0],@K[0]
- ror @d[2],@d[2],#32
- ror @d[3],@d[3],#32
- ror @d[4],@d[4],#32
- ror @d[5],@d[5],#32
- ror @d[6],@d[6],#32
- ror @d[7],@d[7],#32
-#endif
- add @K[3],@K[3],$ONE // += 1
- add @K[4],@K[3],$ONE
- add @K[5],@K[4],$ONE
- shl $ONE,$ONE,#2 // 1 -> 4
-
-.Loop_outer_neon:
- mov.32 @x[0],@d[0] // unpack key block
- lsr @x[1],@d[0],#32
- mov $A0,@K[0]
- mov.32 @x[2],@d[1]
- lsr @x[3],@d[1],#32
- mov $A1,@K[0]
- mov.32 @x[4],@d[2]
- lsr @x[5],@d[2],#32
- mov $A2,@K[0]
- mov.32 @x[6],@d[3]
- mov $B0,@K[1]
- lsr @x[7],@d[3],#32
- mov $B1,@K[1]
- mov.32 @x[8],@d[4]
- mov $B2,@K[1]
- lsr @x[9],@d[4],#32
- mov $D0,@K[3]
- mov.32 @x[10],@d[5]
- mov $D1,@K[4]
- lsr @x[11],@d[5],#32
- mov $D2,@K[5]
- mov.32 @x[12],@d[6]
- mov $C0,@K[2]
- lsr @x[13],@d[6],#32
- mov $C1,@K[2]
- mov.32 @x[14],@d[7]
- mov $C2,@K[2]
- lsr @x[15],@d[7],#32
-
- mov $ctr,#10
- subs $len,$len,#256
-.Loop_neon:
- sub $ctr,$ctr,#1
-___
- my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
- my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
- my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
- my @thread3=&ROUND(0,4,8,12);
-
- foreach (@thread0) {
- eval; eval(shift(@thread3));
- eval(shift(@thread1)); eval(shift(@thread3));
- eval(shift(@thread2)); eval(shift(@thread3));
- }
-
- @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
- @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
- @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
- @thread3=&ROUND(0,5,10,15);
-
- foreach (@thread0) {
- eval; eval(shift(@thread3));
- eval(shift(@thread1)); eval(shift(@thread3));
- eval(shift(@thread2)); eval(shift(@thread3));
- }
-$code.=<<___;
- cbnz $ctr,.Loop_neon
-
- add.32 @x[0],@x[0],@d[0] // accumulate key block
- add $A0,$A0,@K[0]
- add @x[1],@x[1],@d[0],lsr#32
- add $A1,$A1,@K[0]
- add.32 @x[2],@x[2],@d[1]
- add $A2,$A2,@K[0]
- add @x[3],@x[3],@d[1],lsr#32
- add $C0,$C0,@K[2]
- add.32 @x[4],@x[4],@d[2]
- add $C1,$C1,@K[2]
- add @x[5],@x[5],@d[2],lsr#32
- add $C2,$C2,@K[2]
- add.32 @x[6],@x[6],@d[3]
- add $D0,$D0,@K[3]
- add @x[7],@x[7],@d[3],lsr#32
- add.32 @x[8],@x[8],@d[4]
- add $D1,$D1,@K[4]
- add @x[9],@x[9],@d[4],lsr#32
- add.32 @x[10],@x[10],@d[5]
- add $D2,$D2,@K[5]
- add @x[11],@x[11],@d[5],lsr#32
- add.32 @x[12],@x[12],@d[6]
- add $B0,$B0,@K[1]
- add @x[13],@x[13],@d[6],lsr#32
- add.32 @x[14],@x[14],@d[7]
- add $B1,$B1,@K[1]
- add @x[15],@x[15],@d[7],lsr#32
- add $B2,$B2,@K[1]
-
- b.lo .Ltail_neon
-
- add @x[0],@x[0],@x[1],lsl#32 // pack
- add @x[2],@x[2],@x[3],lsl#32
- ldp @x[1],@x[3],[$inp,#0] // load input
- add @x[4],@x[4],@x[5],lsl#32
- add @x[6],@x[6],@x[7],lsl#32
- ldp @x[5],@x[7],[$inp,#16]
- add @x[8],@x[8],@x[9],lsl#32
- add @x[10],@x[10],@x[11],lsl#32
- ldp @x[9],@x[11],[$inp,#32]
- add @x[12],@x[12],@x[13],lsl#32
- add @x[14],@x[14],@x[15],lsl#32
- ldp @x[13],@x[15],[$inp,#48]
- add $inp,$inp,#64
-#ifdef __AARCH64EB__
- rev @x[0],@x[0]
- rev @x[2],@x[2]
- rev @x[4],@x[4]
- rev @x[6],@x[6]
- rev @x[8],@x[8]
- rev @x[10],@x[10]
- rev @x[12],@x[12]
- rev @x[14],@x[14]
-#endif
- ld1.8 {$T0-$T3},[$inp],#64
- eor @x[0],@x[0],@x[1]
- eor @x[2],@x[2],@x[3]
- eor @x[4],@x[4],@x[5]
- eor @x[6],@x[6],@x[7]
- eor @x[8],@x[8],@x[9]
- eor $A0,$A0,$T0
- eor @x[10],@x[10],@x[11]
- eor $B0,$B0,$T1
- eor @x[12],@x[12],@x[13]
- eor $C0,$C0,$T2
- eor @x[14],@x[14],@x[15]
- eor $D0,$D0,$T3
- ld1.8 {$T0-$T3},[$inp],#64
-
- stp @x[0],@x[2],[$out,#0] // store output
- add @d[6],@d[6],#4 // increment counter
- stp @x[4],@x[6],[$out,#16]
- add @K[3],@K[3],$ONE // += 4
- stp @x[8],@x[10],[$out,#32]
- add @K[4],@K[4],$ONE
- stp @x[12],@x[14],[$out,#48]
- add @K[5],@K[5],$ONE
- add $out,$out,#64
-
- st1.8 {$A0-$D0},[$out],#64
- ld1.8 {$A0-$D0},[$inp],#64
-
- eor $A1,$A1,$T0
- eor $B1,$B1,$T1
- eor $C1,$C1,$T2
- eor $D1,$D1,$T3
- st1.8 {$A1-$D1},[$out],#64
-
- eor $A2,$A2,$A0
- eor $B2,$B2,$B0
- eor $C2,$C2,$C0
- eor $D2,$D2,$D0
- st1.8 {$A2-$D2},[$out],#64
-
- b.hi .Loop_outer_neon
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- ret
-
-.Ltail_neon:
- add $len,$len,#256
- cmp $len,#64
- b.lo .Less_than_64
-
- add @x[0],@x[0],@x[1],lsl#32 // pack
- add @x[2],@x[2],@x[3],lsl#32
- ldp @x[1],@x[3],[$inp,#0] // load input
- add @x[4],@x[4],@x[5],lsl#32
- add @x[6],@x[6],@x[7],lsl#32
- ldp @x[5],@x[7],[$inp,#16]
- add @x[8],@x[8],@x[9],lsl#32
- add @x[10],@x[10],@x[11],lsl#32
- ldp @x[9],@x[11],[$inp,#32]
- add @x[12],@x[12],@x[13],lsl#32
- add @x[14],@x[14],@x[15],lsl#32
- ldp @x[13],@x[15],[$inp,#48]
- add $inp,$inp,#64
-#ifdef __AARCH64EB__
- rev @x[0],@x[0]
- rev @x[2],@x[2]
- rev @x[4],@x[4]
- rev @x[6],@x[6]
- rev @x[8],@x[8]
- rev @x[10],@x[10]
- rev @x[12],@x[12]
- rev @x[14],@x[14]
-#endif
- eor @x[0],@x[0],@x[1]
- eor @x[2],@x[2],@x[3]
- eor @x[4],@x[4],@x[5]
- eor @x[6],@x[6],@x[7]
- eor @x[8],@x[8],@x[9]
- eor @x[10],@x[10],@x[11]
- eor @x[12],@x[12],@x[13]
- eor @x[14],@x[14],@x[15]
-
- stp @x[0],@x[2],[$out,#0] // store output
- add @d[6],@d[6],#4 // increment counter
- stp @x[4],@x[6],[$out,#16]
- stp @x[8],@x[10],[$out,#32]
- stp @x[12],@x[14],[$out,#48]
- add $out,$out,#64
- b.eq .Ldone_neon
- sub $len,$len,#64
- cmp $len,#64
- b.lo .Less_than_128
-
- ld1.8 {$T0-$T3},[$inp],#64
- eor $A0,$A0,$T0
- eor $B0,$B0,$T1
- eor $C0,$C0,$T2
- eor $D0,$D0,$T3
- st1.8 {$A0-$D0},[$out],#64
- b.eq .Ldone_neon
- sub $len,$len,#64
- cmp $len,#64
- b.lo .Less_than_192
-
- ld1.8 {$T0-$T3},[$inp],#64
- eor $A1,$A1,$T0
- eor $B1,$B1,$T1
- eor $C1,$C1,$T2
- eor $D1,$D1,$T3
- st1.8 {$A1-$D1},[$out],#64
- b.eq .Ldone_neon
- sub $len,$len,#64
-
- st1.8 {$A2-$D2},[sp]
- b .Last_neon
-
-.Less_than_128:
- st1.8 {$A0-$D0},[sp]
- b .Last_neon
-.Less_than_192:
- st1.8 {$A1-$D1},[sp]
- b .Last_neon
-
-.align 4
-.Last_neon:
- sub $out,$out,#1
- add $inp,$inp,$len
- add $out,$out,$len
- add $ctr,sp,$len
- neg $len,$len
-
-.Loop_tail_neon:
- ldrb w10,[$inp,$len]
- ldrb w11,[$ctr,$len]
- add $len,$len,#1
- eor w10,w10,w11
- strb w10,[$out,$len]
- cbnz $len,.Loop_tail_neon
-
- stp xzr,xzr,[sp,#0]
- stp xzr,xzr,[sp,#16]
- stp xzr,xzr,[sp,#32]
- stp xzr,xzr,[sp,#48]
-
-.Ldone_neon:
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- ret
-.size ChaCha20_neon,.-ChaCha20_neon
-___
-{
-my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
-my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
- $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));
-
-$code.=<<___;
-#ifdef __APPLE__
-.type ChaCha20_512_neon,%function
-.align 5
-ChaCha20_512_neon:
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
-
- adr @x[0],.Lsigma
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
-
-.L512_or_more_neon:
- sub sp,sp,#128+64
-
- ldp @d[0],@d[1],[@x[0]] // load sigma
- ld1 {@K[0]},[@x[0]],#16
- ldp @d[2],@d[3],[$key] // load key
- ldp @d[4],@d[5],[$key,#16]
- ld1 {@K[1],@K[2]},[$key]
- ldp @d[6],@d[7],[$ctr] // load counter
- ld1 {@K[3]},[$ctr]
- ld1 {$ONE},[@x[0]]
-# ifdef __AARCH64EB__
- rev64 @K[0],@K[0]
- ror @d[2],@d[2],#32
- ror @d[3],@d[3],#32
- ror @d[4],@d[4],#32
- ror @d[5],@d[5],#32
- ror @d[6],@d[6],#32
- ror @d[7],@d[7],#32
-# endif
- add @K[3],@K[3],$ONE // += 1
- stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part
- add @K[3],@K[3],$ONE // not typo
- str @K[2],[sp,#32]
- add @K[4],@K[3],$ONE
- add @K[5],@K[4],$ONE
- add @K[6],@K[5],$ONE
- shl $ONE,$ONE,#2 // 1 -> 4
-
- stp d8,d9,[sp,#128+0] // meet ABI requirements
- stp d10,d11,[sp,#128+16]
- stp d12,d13,[sp,#128+32]
- stp d14,d15,[sp,#128+48]
-
- sub $len,$len,#512 // not typo
-
-.Loop_outer_512_neon:
- mov $A0,@K[0]
- mov $A1,@K[0]
- mov $A2,@K[0]
- mov $A3,@K[0]
- mov $A4,@K[0]
- mov $A5,@K[0]
- mov $B0,@K[1]
- mov.32 @x[0],@d[0] // unpack key block
- mov $B1,@K[1]
- lsr @x[1],@d[0],#32
- mov $B2,@K[1]
- mov.32 @x[2],@d[1]
- mov $B3,@K[1]
- lsr @x[3],@d[1],#32
- mov $B4,@K[1]
- mov.32 @x[4],@d[2]
- mov $B5,@K[1]
- lsr @x[5],@d[2],#32
- mov $D0,@K[3]
- mov.32 @x[6],@d[3]
- mov $D1,@K[4]
- lsr @x[7],@d[3],#32
- mov $D2,@K[5]
- mov.32 @x[8],@d[4]
- mov $D3,@K[6]
- lsr @x[9],@d[4],#32
- mov $C0,@K[2]
- mov.32 @x[10],@d[5]
- mov $C1,@K[2]
- lsr @x[11],@d[5],#32
- add $D4,$D0,$ONE // +4
- mov.32 @x[12],@d[6]
- add $D5,$D1,$ONE // +4
- lsr @x[13],@d[6],#32
- mov $C2,@K[2]
- mov.32 @x[14],@d[7]
- mov $C3,@K[2]
- lsr @x[15],@d[7],#32
- mov $C4,@K[2]
- stp @K[3],@K[4],[sp,#48] // off-load key block, variable part
- mov $C5,@K[2]
- str @K[5],[sp,#80]
-
- mov $ctr,#5
- subs $len,$len,#512
-.Loop_upper_neon:
- sub $ctr,$ctr,#1
-___
- my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
- my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
- my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
- my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
- my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
- my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
- my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
- my $diff = ($#thread0+1)*6 - $#thread67 - 1;
- my $i = 0;
-
- foreach (@thread0) {
- eval; eval(shift(@thread67));
- eval(shift(@thread1)); eval(shift(@thread67));
- eval(shift(@thread2)); eval(shift(@thread67));
- eval(shift(@thread3)); eval(shift(@thread67));
- eval(shift(@thread4)); eval(shift(@thread67));
- eval(shift(@thread5)); eval(shift(@thread67));
- }
-
- @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
- @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
- @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
- @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
- @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
- @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
- @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
-
- foreach (@thread0) {
- eval; eval(shift(@thread67));
- eval(shift(@thread1)); eval(shift(@thread67));
- eval(shift(@thread2)); eval(shift(@thread67));
- eval(shift(@thread3)); eval(shift(@thread67));
- eval(shift(@thread4)); eval(shift(@thread67));
- eval(shift(@thread5)); eval(shift(@thread67));
- }
-$code.=<<___;
- cbnz $ctr,.Loop_upper_neon
-
- add.32 @x[0],@x[0],@d[0] // accumulate key block
- add @x[1],@x[1],@d[0],lsr#32
- add.32 @x[2],@x[2],@d[1]
- add @x[3],@x[3],@d[1],lsr#32
- add.32 @x[4],@x[4],@d[2]
- add @x[5],@x[5],@d[2],lsr#32
- add.32 @x[6],@x[6],@d[3]
- add @x[7],@x[7],@d[3],lsr#32
- add.32 @x[8],@x[8],@d[4]
- add @x[9],@x[9],@d[4],lsr#32
- add.32 @x[10],@x[10],@d[5]
- add @x[11],@x[11],@d[5],lsr#32
- add.32 @x[12],@x[12],@d[6]
- add @x[13],@x[13],@d[6],lsr#32
- add.32 @x[14],@x[14],@d[7]
- add @x[15],@x[15],@d[7],lsr#32
-
- add @x[0],@x[0],@x[1],lsl#32 // pack
- add @x[2],@x[2],@x[3],lsl#32
- ldp @x[1],@x[3],[$inp,#0] // load input
- add @x[4],@x[4],@x[5],lsl#32
- add @x[6],@x[6],@x[7],lsl#32
- ldp @x[5],@x[7],[$inp,#16]
- add @x[8],@x[8],@x[9],lsl#32
- add @x[10],@x[10],@x[11],lsl#32
- ldp @x[9],@x[11],[$inp,#32]
- add @x[12],@x[12],@x[13],lsl#32
- add @x[14],@x[14],@x[15],lsl#32
- ldp @x[13],@x[15],[$inp,#48]
- add $inp,$inp,#64
-# ifdef __AARCH64EB__
- rev @x[0],@x[0]
- rev @x[2],@x[2]
- rev @x[4],@x[4]
- rev @x[6],@x[6]
- rev @x[8],@x[8]
- rev @x[10],@x[10]
- rev @x[12],@x[12]
- rev @x[14],@x[14]
-# endif
- eor @x[0],@x[0],@x[1]
- eor @x[2],@x[2],@x[3]
- eor @x[4],@x[4],@x[5]
- eor @x[6],@x[6],@x[7]
- eor @x[8],@x[8],@x[9]
- eor @x[10],@x[10],@x[11]
- eor @x[12],@x[12],@x[13]
- eor @x[14],@x[14],@x[15]
-
- stp @x[0],@x[2],[$out,#0] // store output
- add @d[6],@d[6],#1 // increment counter
- mov.32 @x[0],@d[0] // unpack key block
- lsr @x[1],@d[0],#32
- stp @x[4],@x[6],[$out,#16]
- mov.32 @x[2],@d[1]
- lsr @x[3],@d[1],#32
- stp @x[8],@x[10],[$out,#32]
- mov.32 @x[4],@d[2]
- lsr @x[5],@d[2],#32
- stp @x[12],@x[14],[$out,#48]
- add $out,$out,#64
- mov.32 @x[6],@d[3]
- lsr @x[7],@d[3],#32
- mov.32 @x[8],@d[4]
- lsr @x[9],@d[4],#32
- mov.32 @x[10],@d[5]
- lsr @x[11],@d[5],#32
- mov.32 @x[12],@d[6]
- lsr @x[13],@d[6],#32
- mov.32 @x[14],@d[7]
- lsr @x[15],@d[7],#32
-
- mov $ctr,#5
-.Loop_lower_neon:
- sub $ctr,$ctr,#1
-___
- @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
- @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
- @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
- @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
- @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
- @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
- @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
-
- foreach (@thread0) {
- eval; eval(shift(@thread67));
- eval(shift(@thread1)); eval(shift(@thread67));
- eval(shift(@thread2)); eval(shift(@thread67));
- eval(shift(@thread3)); eval(shift(@thread67));
- eval(shift(@thread4)); eval(shift(@thread67));
- eval(shift(@thread5)); eval(shift(@thread67));
- }
-
- @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
- @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
- @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
- @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
- @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
- @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
- @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
-
- foreach (@thread0) {
- eval; eval(shift(@thread67));
- eval(shift(@thread1)); eval(shift(@thread67));
- eval(shift(@thread2)); eval(shift(@thread67));
- eval(shift(@thread3)); eval(shift(@thread67));
- eval(shift(@thread4)); eval(shift(@thread67));
- eval(shift(@thread5)); eval(shift(@thread67));
- }
-$code.=<<___;
- cbnz $ctr,.Loop_lower_neon
-
- add.32 @x[0],@x[0],@d[0] // accumulate key block
- ldp @K[0],@K[1],[sp,#0]
- add @x[1],@x[1],@d[0],lsr#32
- ldp @K[2],@K[3],[sp,#32]
- add.32 @x[2],@x[2],@d[1]
- ldp @K[4],@K[5],[sp,#64]
- add @x[3],@x[3],@d[1],lsr#32
- add $A0,$A0,@K[0]
- add.32 @x[4],@x[4],@d[2]
- add $A1,$A1,@K[0]
- add @x[5],@x[5],@d[2],lsr#32
- add $A2,$A2,@K[0]
- add.32 @x[6],@x[6],@d[3]
- add $A3,$A3,@K[0]
- add @x[7],@x[7],@d[3],lsr#32
- add $A4,$A4,@K[0]
- add.32 @x[8],@x[8],@d[4]
- add $A5,$A5,@K[0]
- add @x[9],@x[9],@d[4],lsr#32
- add $C0,$C0,@K[2]
- add.32 @x[10],@x[10],@d[5]
- add $C1,$C1,@K[2]
- add @x[11],@x[11],@d[5],lsr#32
- add $C2,$C2,@K[2]
- add.32 @x[12],@x[12],@d[6]
- add $C3,$C3,@K[2]
- add @x[13],@x[13],@d[6],lsr#32
- add $C4,$C4,@K[2]
- add.32 @x[14],@x[14],@d[7]
- add $C5,$C5,@K[2]
- add @x[15],@x[15],@d[7],lsr#32
- add $D4,$D4,$ONE // +4
- add @x[0],@x[0],@x[1],lsl#32 // pack
- add $D5,$D5,$ONE // +4
- add @x[2],@x[2],@x[3],lsl#32
- add $D0,$D0,@K[3]
- ldp @x[1],@x[3],[$inp,#0] // load input
- add $D1,$D1,@K[4]
- add @x[4],@x[4],@x[5],lsl#32
- add $D2,$D2,@K[5]
- add @x[6],@x[6],@x[7],lsl#32
- add $D3,$D3,@K[6]
- ldp @x[5],@x[7],[$inp,#16]
- add $D4,$D4,@K[3]
- add @x[8],@x[8],@x[9],lsl#32
- add $D5,$D5,@K[4]
- add @x[10],@x[10],@x[11],lsl#32
- add $B0,$B0,@K[1]
- ldp @x[9],@x[11],[$inp,#32]
- add $B1,$B1,@K[1]
- add @x[12],@x[12],@x[13],lsl#32
- add $B2,$B2,@K[1]
- add @x[14],@x[14],@x[15],lsl#32
- add $B3,$B3,@K[1]
- ldp @x[13],@x[15],[$inp,#48]
- add $B4,$B4,@K[1]
- add $inp,$inp,#64
- add $B5,$B5,@K[1]
-
-# ifdef __AARCH64EB__
- rev @x[0],@x[0]
- rev @x[2],@x[2]
- rev @x[4],@x[4]
- rev @x[6],@x[6]
- rev @x[8],@x[8]
- rev @x[10],@x[10]
- rev @x[12],@x[12]
- rev @x[14],@x[14]
-# endif
- ld1.8 {$T0-$T3},[$inp],#64
- eor @x[0],@x[0],@x[1]
- eor @x[2],@x[2],@x[3]
- eor @x[4],@x[4],@x[5]
- eor @x[6],@x[6],@x[7]
- eor @x[8],@x[8],@x[9]
- eor $A0,$A0,$T0
- eor @x[10],@x[10],@x[11]
- eor $B0,$B0,$T1
- eor @x[12],@x[12],@x[13]
- eor $C0,$C0,$T2
- eor @x[14],@x[14],@x[15]
- eor $D0,$D0,$T3
- ld1.8 {$T0-$T3},[$inp],#64
-
- stp @x[0],@x[2],[$out,#0] // store output
- add @d[6],@d[6],#7 // increment counter
- stp @x[4],@x[6],[$out,#16]
- stp @x[8],@x[10],[$out,#32]
- stp @x[12],@x[14],[$out,#48]
- add $out,$out,#64
- st1.8 {$A0-$D0},[$out],#64
-
- ld1.8 {$A0-$D0},[$inp],#64
- eor $A1,$A1,$T0
- eor $B1,$B1,$T1
- eor $C1,$C1,$T2
- eor $D1,$D1,$T3
- st1.8 {$A1-$D1},[$out],#64
-
- ld1.8 {$A1-$D1},[$inp],#64
- eor $A2,$A2,$A0
- ldp @K[0],@K[1],[sp,#0]
- eor $B2,$B2,$B0
- ldp @K[2],@K[3],[sp,#32]
- eor $C2,$C2,$C0
- eor $D2,$D2,$D0
- st1.8 {$A2-$D2},[$out],#64
-
- ld1.8 {$A2-$D2},[$inp],#64
- eor $A3,$A3,$A1
- eor $B3,$B3,$B1
- eor $C3,$C3,$C1
- eor $D3,$D3,$D1
- st1.8 {$A3-$D3},[$out],#64
-
- ld1.8 {$A3-$D3},[$inp],#64
- eor $A4,$A4,$A2
- eor $B4,$B4,$B2
- eor $C4,$C4,$C2
- eor $D4,$D4,$D2
- st1.8 {$A4-$D4},[$out],#64
-
- shl $A0,$ONE,#1 // 4 -> 8
- eor $A5,$A5,$A3
- eor $B5,$B5,$B3
- eor $C5,$C5,$C3
- eor $D5,$D5,$D3
- st1.8 {$A5-$D5},[$out],#64
-
- add @K[3],@K[3],$A0 // += 8
- add @K[4],@K[4],$A0
- add @K[5],@K[5],$A0
- add @K[6],@K[6],$A0
-
- b.hs .Loop_outer_512_neon
-
- adds $len,$len,#512
- ushr $A0,$ONE,#2 // 4 -> 1
-
- ldp d8,d9,[sp,#128+0] // meet ABI requirements
- ldp d10,d11,[sp,#128+16]
- ldp d12,d13,[sp,#128+32]
- ldp d14,d15,[sp,#128+48]
-
- stp @K[0],$ONE,[sp,#0] // wipe off-load area
- stp @K[0],$ONE,[sp,#32]
- stp @K[0],$ONE,[sp,#64]
-
- b.eq .Ldone_512_neon
-
- cmp $len,#192
- sub @K[3],@K[3],$A0 // -= 1
- sub @K[4],@K[4],$A0
- sub @K[5],@K[5],$A0
- add sp,sp,#128
- b.hs .Loop_outer_neon
-
- eor @K[1],@K[1],@K[1]
- eor @K[2],@K[2],@K[2]
- eor @K[3],@K[3],@K[3]
- eor @K[4],@K[4],@K[4]
- eor @K[5],@K[5],@K[5]
- eor @K[6],@K[6],@K[6]
- b .Loop_outer
-
-.Ldone_512_neon:
- ldp x19,x20,[x29,#16]
- add sp,sp,#128+64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- ret
-.size ChaCha20_512_neon,.-ChaCha20_512_neon
-#endif
-#endif
-___
-}
-}}}
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/\/\// and !/^$/);
- print;
-}
-close SELF;
-
-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/geo;
-
- (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or
- (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or
- (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or
- (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or
- (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));
-
- print $_,"\n";
-}
-close STDOUT; # flush
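
The ARM64 perlasm above interleaves scalar and NEON lanes, but every path applies the same ChaCha20 quarter-round to a 16-word state. As a point of reference, here is a minimal portable C sketch of that primitive; the helper names (rotl32, qround, chacha20_double_round) are illustrative and do not appear in the deleted sources.

#include <stdint.h>

/* Reference sketch of the quarter-round that the generated ROUND and
 * NEONROUND code applies; names are illustrative only. */
static inline uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

static inline void qround(uint32_t x[16], int a, int b, int c, int d)
{
	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
}

static void chacha20_double_round(uint32_t x[16])
{
	/* column round */
	qround(x, 0, 4,  8, 12);
	qround(x, 1, 5,  9, 13);
	qround(x, 2, 6, 10, 14);
	qround(x, 3, 7, 11, 15);
	/* diagonal round */
	qround(x, 0, 5, 10, 15);
	qround(x, 1, 6, 11, 12);
	qround(x, 2, 7,  8, 13);
	qround(x, 3, 4,  9, 14);
}

Ten invocations of chacha20_double_round() correspond to the twenty rounds the assembly unrolls.
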
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips-glue.c b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips-glue.c
deleted file mode 100644
index 96ce01e2c133..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips-glue.c
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-asmlinkage void chacha20_mips(u32 state[16], u8 *out, const u8 *in,
- const size_t len);
-static bool *const chacha20_nobs[] __initconst = { };
-static void __init chacha20_fpu_init(void)
-{
-}
-
-static inline bool chacha20_arch(struct chacha20_ctx *ctx, u8 *dst,
- const u8 *src, size_t len,
- simd_context_t *simd_context)
-{
- chacha20_mips(ctx->state, dst, src, len);
- return true;
-}
-
-static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS],
- const u8 nonce[HCHACHA20_NONCE_SIZE],
- const u8 key[HCHACHA20_KEY_SIZE],
- simd_context_t *simd_context)
-{
- return false;
-}
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips.S b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips.S
deleted file mode 100644
index a81e02db95e7..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips.S
+++ /dev/null
@@ -1,424 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#define MASK_U32 0x3c
-#define CHACHA20_BLOCK_SIZE 64
-#define STACK_SIZE 32
-
-#define X0 $t0
-#define X1 $t1
-#define X2 $t2
-#define X3 $t3
-#define X4 $t4
-#define X5 $t5
-#define X6 $t6
-#define X7 $t7
-#define X8 $t8
-#define X9 $t9
-#define X10 $v1
-#define X11 $s6
-#define X12 $s5
-#define X13 $s4
-#define X14 $s3
-#define X15 $s2
-/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
-#define T0 $s1
-#define T1 $s0
-#define T(n) T ## n
-#define X(n) X ## n
-
-/* Input arguments */
-#define STATE $a0
-#define OUT $a1
-#define IN $a2
-#define BYTES $a3
-
-/* Output argument */
-/* NONCE[0] is kept in a register and not in memory.
- * We don't want to touch the original value in memory.
- * It must be incremented on every loop iteration.
- */
-#define NONCE_0 $v0
-
-/* SAVED_X and SAVED_CA are set in the jump table.
- * Use regs which are overwritten on exit so we don't leak clear data.
- * They are used for handling the last bytes, which are not a multiple of 4.
- */
-#define SAVED_X X15
-#define SAVED_CA $s7
-
-#define IS_UNALIGNED $s7
-
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-#define MSB 0
-#define LSB 3
-#define ROTx rotl
-#define ROTR(n) rotr n, 24
-#define CPU_TO_LE32(n) \
- wsbh n; \
- rotr n, 16;
-#else
-#define MSB 3
-#define LSB 0
-#define ROTx rotr
-#define CPU_TO_LE32(n)
-#define ROTR(n)
-#endif
-
-#define FOR_EACH_WORD(x) \
- x( 0); \
- x( 1); \
- x( 2); \
- x( 3); \
- x( 4); \
- x( 5); \
- x( 6); \
- x( 7); \
- x( 8); \
- x( 9); \
- x(10); \
- x(11); \
- x(12); \
- x(13); \
- x(14); \
- x(15);
-
-#define FOR_EACH_WORD_REV(x) \
- x(15); \
- x(14); \
- x(13); \
- x(12); \
- x(11); \
- x(10); \
- x( 9); \
- x( 8); \
- x( 7); \
- x( 6); \
- x( 5); \
- x( 4); \
- x( 3); \
- x( 2); \
- x( 1); \
- x( 0);
-
-#define PLUS_ONE_0 1
-#define PLUS_ONE_1 2
-#define PLUS_ONE_2 3
-#define PLUS_ONE_3 4
-#define PLUS_ONE_4 5
-#define PLUS_ONE_5 6
-#define PLUS_ONE_6 7
-#define PLUS_ONE_7 8
-#define PLUS_ONE_8 9
-#define PLUS_ONE_9 10
-#define PLUS_ONE_10 11
-#define PLUS_ONE_11 12
-#define PLUS_ONE_12 13
-#define PLUS_ONE_13 14
-#define PLUS_ONE_14 15
-#define PLUS_ONE_15 16
-#define PLUS_ONE(x) PLUS_ONE_ ## x
-#define _CONCAT3(a,b,c) a ## b ## c
-#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
-
-#define STORE_UNALIGNED(x) \
-CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
- .if (x != 12); \
- lw T0, (x*4)(STATE); \
- .endif; \
- lwl T1, (x*4)+MSB ## (IN); \
- lwr T1, (x*4)+LSB ## (IN); \
- .if (x == 12); \
- addu X ## x, NONCE_0; \
- .else; \
- addu X ## x, T0; \
- .endif; \
- CPU_TO_LE32(X ## x); \
- xor X ## x, T1; \
- swl X ## x, (x*4)+MSB ## (OUT); \
- swr X ## x, (x*4)+LSB ## (OUT);
-
-#define STORE_ALIGNED(x) \
-CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
- .if (x != 12); \
- lw T0, (x*4)(STATE); \
- .endif; \
- lw T1, (x*4) ## (IN); \
- .if (x == 12); \
- addu X ## x, NONCE_0; \
- .else; \
- addu X ## x, T0; \
- .endif; \
- CPU_TO_LE32(X ## x); \
- xor X ## x, T1; \
- sw X ## x, (x*4) ## (OUT);
-
-/* Jump table macro.
- * Used for setup and for handling the last bytes, which are not a multiple of 4.
- * X15 is free to store Xn.
- * Every jump table entry must be equal in size.
- */
-#define JMPTBL_ALIGNED(x) \
-.Lchacha20_mips_jmptbl_aligned_ ## x: ; \
- .set noreorder; \
- b .Lchacha20_mips_xor_aligned_ ## x ## _b; \
- .if (x == 12); \
- addu SAVED_X, X ## x, NONCE_0; \
- .else; \
- addu SAVED_X, X ## x, SAVED_CA; \
- .endif; \
- .set reorder
-
-#define JMPTBL_UNALIGNED(x) \
-.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \
- .set noreorder; \
- b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \
- .if (x == 12); \
- addu SAVED_X, X ## x, NONCE_0; \
- .else; \
- addu SAVED_X, X ## x, SAVED_CA; \
- .endif; \
- .set reorder
-
-#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
- addu X(A), X(K); \
- addu X(B), X(L); \
- addu X(C), X(M); \
- addu X(D), X(N); \
- xor X(V), X(A); \
- xor X(W), X(B); \
- xor X(Y), X(C); \
- xor X(Z), X(D); \
- rotl X(V), S; \
- rotl X(W), S; \
- rotl X(Y), S; \
- rotl X(Z), S;
-
-.text
-.set reorder
-.set noat
-.globl chacha20_mips
-.ent chacha20_mips
-chacha20_mips:
- .frame $sp, STACK_SIZE, $ra
-
- addiu $sp, -STACK_SIZE
-
- /* Return bytes = 0. */
- beqz BYTES, .Lchacha20_mips_end
-
- lw NONCE_0, 48(STATE)
-
- /* Save s0-s7 */
- sw $s0, 0($sp)
- sw $s1, 4($sp)
- sw $s2, 8($sp)
- sw $s3, 12($sp)
- sw $s4, 16($sp)
- sw $s5, 20($sp)
- sw $s6, 24($sp)
- sw $s7, 28($sp)
-
-	/* Test whether IN or OUT is unaligned.
- * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
- */
- or IS_UNALIGNED, IN, OUT
- andi IS_UNALIGNED, 0x3
-
- /* Set number of rounds */
- li $at, 20
-
- b .Lchacha20_rounds_start
-
-.align 4
-.Loop_chacha20_rounds:
- addiu IN, CHACHA20_BLOCK_SIZE
- addiu OUT, CHACHA20_BLOCK_SIZE
- addiu NONCE_0, 1
-
-.Lchacha20_rounds_start:
- lw X0, 0(STATE)
- lw X1, 4(STATE)
- lw X2, 8(STATE)
- lw X3, 12(STATE)
-
- lw X4, 16(STATE)
- lw X5, 20(STATE)
- lw X6, 24(STATE)
- lw X7, 28(STATE)
- lw X8, 32(STATE)
- lw X9, 36(STATE)
- lw X10, 40(STATE)
- lw X11, 44(STATE)
-
- move X12, NONCE_0
- lw X13, 52(STATE)
- lw X14, 56(STATE)
- lw X15, 60(STATE)
-
-.Loop_chacha20_xor_rounds:
- addiu $at, -2
- AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
- AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
- AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
- AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
- AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
- AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
- AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
- AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
- bnez $at, .Loop_chacha20_xor_rounds
-
- addiu BYTES, -(CHACHA20_BLOCK_SIZE)
-
- /* Is data src/dst unaligned? Jump */
- bnez IS_UNALIGNED, .Loop_chacha20_unaligned
-
-	/* Set number of rounds here to fill the delay slot. */
- li $at, 20
-
- /* BYTES < 0, it has no full block. */
- bltz BYTES, .Lchacha20_mips_no_full_block_aligned
-
- FOR_EACH_WORD_REV(STORE_ALIGNED)
-
- /* BYTES > 0? Loop again. */
- bgtz BYTES, .Loop_chacha20_rounds
-
- /* Place this here to fill delay slot */
- addiu NONCE_0, 1
-
- /* BYTES < 0? Handle last bytes */
- bltz BYTES, .Lchacha20_mips_xor_bytes
-
-.Lchacha20_mips_xor_done:
- /* Restore used registers */
- lw $s0, 0($sp)
- lw $s1, 4($sp)
- lw $s2, 8($sp)
- lw $s3, 12($sp)
- lw $s4, 16($sp)
- lw $s5, 20($sp)
- lw $s6, 24($sp)
- lw $s7, 28($sp)
-
- /* Write NONCE_0 back to right location in state */
- sw NONCE_0, 48(STATE)
-
-.Lchacha20_mips_end:
- addiu $sp, STACK_SIZE
- jr $ra
-
-.Lchacha20_mips_no_full_block_aligned:
- /* Restore the offset on BYTES */
- addiu BYTES, CHACHA20_BLOCK_SIZE
-
- /* Get number of full WORDS */
- andi $at, BYTES, MASK_U32
-
- /* Load upper half of jump table addr */
- lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
-
- /* Calculate lower half jump table offset */
- ins T0, $at, 1, 6
-
- /* Add offset to STATE */
- addu T1, STATE, $at
-
- /* Add lower half jump table addr */
- addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
-
- /* Read value from STATE */
- lw SAVED_CA, 0(T1)
-
-	/* Store the remaining byte counter as a negative value */
- subu BYTES, $at, BYTES
-
- jr T0
-
- /* Jump table */
- FOR_EACH_WORD(JMPTBL_ALIGNED)
-
-
-.Loop_chacha20_unaligned:
-	/* Set number of rounds here to fill the delay slot. */
- li $at, 20
-
-	/* BYTES < 0, it has no full block. */
- bltz BYTES, .Lchacha20_mips_no_full_block_unaligned
-
- FOR_EACH_WORD_REV(STORE_UNALIGNED)
-
- /* BYTES > 0? Loop again. */
- bgtz BYTES, .Loop_chacha20_rounds
-
- /* Write NONCE_0 back to right location in state */
- sw NONCE_0, 48(STATE)
-
- .set noreorder
- /* Fall through to byte handling */
- bgez BYTES, .Lchacha20_mips_xor_done
-.Lchacha20_mips_xor_unaligned_0_b:
-.Lchacha20_mips_xor_aligned_0_b:
- /* Place this here to fill delay slot */
- addiu NONCE_0, 1
- .set reorder
-
-.Lchacha20_mips_xor_bytes:
- addu IN, $at
- addu OUT, $at
- /* First byte */
- lbu T1, 0(IN)
- addiu $at, BYTES, 1
- CPU_TO_LE32(SAVED_X)
- ROTR(SAVED_X)
- xor T1, SAVED_X
- sb T1, 0(OUT)
- beqz $at, .Lchacha20_mips_xor_done
- /* Second byte */
- lbu T1, 1(IN)
- addiu $at, BYTES, 2
- ROTx SAVED_X, 8
- xor T1, SAVED_X
- sb T1, 1(OUT)
- beqz $at, .Lchacha20_mips_xor_done
- /* Third byte */
- lbu T1, 2(IN)
- ROTx SAVED_X, 8
- xor T1, SAVED_X
- sb T1, 2(OUT)
- b .Lchacha20_mips_xor_done
-
-.Lchacha20_mips_no_full_block_unaligned:
- /* Restore the offset on BYTES */
- addiu BYTES, CHACHA20_BLOCK_SIZE
-
- /* Get number of full WORDS */
- andi $at, BYTES, MASK_U32
-
- /* Load upper half of jump table addr */
- lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
-
- /* Calculate lower half jump table offset */
- ins T0, $at, 1, 6
-
- /* Add offset to STATE */
- addu T1, STATE, $at
-
- /* Add lower half jump table addr */
- addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
-
- /* Read value from STATE */
- lw SAVED_CA, 0(T1)
-
-	/* Store the remaining byte counter as a negative value */
- subu BYTES, $at, BYTES
-
- jr T0
-
- /* Jump table */
- FOR_EACH_WORD(JMPTBL_UNALIGNED)
-.end chacha20_mips
-.set at
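
The MIPS routine above finishes a short block by branching into a jump table indexed by the number of remaining full 32-bit words, then patching up the final one to three bytes by hand. A rough portable C equivalent of that tail handling (minus the jump table and delay-slot scheduling) is sketched below; the function and parameter names are hypothetical.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative tail handling: XOR whole 32-bit words first (the part the
 * jump table covers), then the last one to three bytes. */
static void xor_tail(uint8_t *out, const uint8_t *in,
		     const uint8_t *keystream, size_t len)
{
	size_t i, words = len & ~(size_t)3;	/* like BYTES & MASK_U32 */

	for (i = 0; i < words; i += 4) {
		uint32_t k, d;

		memcpy(&k, keystream + i, 4);
		memcpy(&d, in + i, 4);
		d ^= k;
		memcpy(out + i, &d, 4);
	}
	for (; i < len; i++)			/* at most three bytes left */
		out[i] = in[i] ^ keystream[i];
}
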
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64-glue.c b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64-glue.c
deleted file mode 100644
index 1bccec70845c..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64-glue.c
+++ /dev/null
@@ -1,132 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-#ifdef __linux__
-#include <asm/fpu/api.h>
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-#include <asm/intel-family.h>
-#else
-#include <sys/simd-x86_64.h>
-#endif
-
-asmlinkage void hchacha20_ssse3(u32 *derived_key, const u8 *nonce,
- const u8 *key);
-asmlinkage void chacha20_ssse3(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-asmlinkage void chacha20_avx2(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-asmlinkage void chacha20_avx512(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-asmlinkage void chacha20_avx512vl(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-
-static bool chacha20_use_ssse3 __ro_after_init;
-static bool chacha20_use_avx2 __ro_after_init;
-static bool chacha20_use_avx512 __ro_after_init;
-static bool chacha20_use_avx512vl __ro_after_init;
-static bool *const chacha20_nobs[] __initconst = {
- &chacha20_use_ssse3, &chacha20_use_avx2, &chacha20_use_avx512,
- &chacha20_use_avx512vl };
-
-static void __init chacha20_fpu_init(void)
-{
-#ifdef __linux__
- chacha20_use_ssse3 = boot_cpu_has(X86_FEATURE_SSSE3);
- chacha20_use_avx2 =
- boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
-#ifndef COMPAT_CANNOT_USE_AVX512
- chacha20_use_avx512 =
- boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_AVX512F) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
- XFEATURE_MASK_AVX512, NULL) &&
- /* Skylake downclocks unacceptably much when using zmm. */
- boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X;
- chacha20_use_avx512vl =
- boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_AVX512F) &&
- boot_cpu_has(X86_FEATURE_AVX512VL) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
- XFEATURE_MASK_AVX512, NULL);
-#endif
-#else
- chacha20_use_ssse3 = !!(cpu_feature2 & CPUID2_SSSE3);
- chacha20_use_avx2 = !!(cpu_feature2 & CPUID2_AVX) &&
- !!(cpu_stdext_feature & CPUID_STDEXT_AVX2) &&
- __ymm_enabled();
- chacha20_use_avx512 = chacha20_use_avx2 &&
- !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) &&
- __zmm_enabled();
- chacha20_use_avx512vl = chacha20_use_avx512 &&
- !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) &&
- !!(cpu_stdext_feature & CPUID_STDEXT_AVX512VL);
-#endif
- if (bootverbose)
- printf("ssse3: %d avx2: %d avx512: %d avx512vl: %d\n",
- chacha20_use_ssse3,
- chacha20_use_avx2,
- chacha20_use_avx512,
- chacha20_use_avx512vl);
-}
-
-static inline bool chacha20_arch(struct chacha20_ctx *ctx, u8 *dst,
- const u8 *src, size_t len,
- simd_context_t *simd_context)
-{
- /* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(PAGE_SIZE < CHACHA20_BLOCK_SIZE ||
- PAGE_SIZE % CHACHA20_BLOCK_SIZE);
-
- if (!chacha20_use_ssse3) {
- return false;
- }
- if (len <= CHACHA20_BLOCK_SIZE) {
- return false;
- }
- if (!simd_use(simd_context)) {
- return false;
- }
- for (;;) {
- const size_t bytes = min_t(size_t, len, PAGE_SIZE);
-
- if (chacha20_use_avx512 &&
- len >= CHACHA20_BLOCK_SIZE * 8)
- chacha20_avx512(dst, src, bytes, ctx->key, ctx->counter);
- else if (chacha20_use_avx512vl &&
- len >= CHACHA20_BLOCK_SIZE * 4)
- chacha20_avx512vl(dst, src, bytes, ctx->key, ctx->counter);
- else if (chacha20_use_avx2 &&
- len >= CHACHA20_BLOCK_SIZE * 4)
- chacha20_avx2(dst, src, bytes, ctx->key, ctx->counter);
- else
- chacha20_ssse3(dst, src, bytes, ctx->key, ctx->counter);
- ctx->counter[0] += (bytes + 63) / 64;
- len -= bytes;
- if (!len)
- break;
- dst += bytes;
- src += bytes;
- simd_relax(simd_context);
- }
-
- return true;
-}
-
-static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS],
- const u8 nonce[HCHACHA20_NONCE_SIZE],
- const u8 key[HCHACHA20_KEY_SIZE],
- simd_context_t *simd_context)
-{
- if (IS_ENABLED(CONFIG_AS_SSSE3) && chacha20_use_ssse3 &&
- simd_use(simd_context)) {
- hchacha20_ssse3(derived_key, nonce, key);
- return true;
- }
- return false;
-}
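
chacha20_arch() above reports whether an accelerated path actually ran, so a caller is expected to fall back to a portable implementation when it returns false. A minimal sketch of that calling pattern, assuming the surrounding zinc types and a hypothetical chacha20_generic() fallback, could look like this:

/* Hedged sketch only: chacha20_generic() is an assumed name, not a
 * symbol from the deleted file. */
static void chacha20_dispatch(struct chacha20_ctx *ctx, u8 *dst,
			      const u8 *src, size_t len,
			      simd_context_t *simd_context)
{
	if (!chacha20_arch(ctx, dst, src, len, simd_context))
		chacha20_generic(ctx, dst, src, len);
}
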
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64.pl b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64.pl
deleted file mode 100755
index 29906a66b8b7..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64.pl
+++ /dev/null
@@ -1,4106 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-#
-# Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
-# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
-# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
-#
-# This code is taken from the OpenSSL project but the author, Andy Polyakov,
-# has relicensed it under the licenses specified in the SPDX header above.
-# The original headers, including the original license headers, are
-# included below for completeness.
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# November 2014
-#
-# ChaCha20 for x86_64.
-#
-# December 2016
-#
-# Add AVX512F code path.
-#
-# December 2017
-#
-# Add AVX512VL code path.
-#
-# Performance in cycles per byte out of large buffer.
-#
-# IALU/gcc 4.8(i) 1x/2xSSSE3(ii) 4xSSSE3 NxAVX(v)
-#
-# P4 9.48/+99% - -
-# Core2 7.83/+55% 7.90/5.76 4.35
-# Westmere 7.19/+50% 5.60/4.50 3.00
-# Sandy Bridge 8.31/+42% 5.45/4.00 2.72
-# Ivy Bridge 6.71/+46% 5.40/? 2.41
-# Haswell 5.92/+43% 5.20/3.45 2.42 1.23
-# Skylake[-X] 5.87/+39% 4.70/3.22 2.31 1.19[0.80(vi)]
-# Silvermont 12.0/+33% 7.75/6.90 7.03(iii)
-# Knights L 11.7/- ? 9.60(iii) 0.80
-# Goldmont 10.6/+17% 5.10/3.52 3.28
-# Sledgehammer 7.28/+52% - -
-# Bulldozer 9.66/+28% 9.85/5.35(iv) 3.06(iv)
-# Ryzen 5.96/+50% 5.19/3.00 2.40 2.09
-# VIA Nano 10.5/+46% 6.72/6.88 6.05
-#
-# (i) compared to older gcc 3.x one can observe >2x improvement on
-# most platforms;
-# (ii) 2xSSSE3 is code path optimized specifically for 128 bytes used
-# by chacha20_poly1305_tls_cipher, results are EVP-free;
-# (iii) this is not optimal result for Atom because of MSROM
-# limitations, SSE2 can do better, but gain is considered too
-# low to justify the [maintenance] effort;
-# (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20
-# and 4.85 for 128-byte inputs;
-# (v) 8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable;
-# (vi) even though Skylake-X can execute AVX512F code and deliver 0.57
-# cpb in single thread, the corresponding capability is suppressed;
-
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
-$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-$kernel=0; $kernel=1 if (!$flavour && !$output);
-
-if (!$kernel) {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
- die "can't locate x86_64-xlate.pl";
-
- open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
- *STDOUT=*OUT;
-
- if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
- =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
- $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
- }
-
- if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
- `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
- $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
- $avx += 1 if ($1==2.11 && $2>=8);
- }
-
- if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
- `ml64 2>&1` =~ /Version ([0-9]+)\./) {
- $avx = ($1>=10) + ($1>=11);
- }
-
- if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
- $avx = ($2>=3.0) + ($2>3.0);
- }
-} else {
- $avx = 4; # The kernel uses ifdefs for this.
-}
-
-# input parameter block
-($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
-
-$code.=<<___ if $kernel;
-#include <linux/linkage.h>
-___
-
-sub declare_variable() {
- my ($name, $size, $type, $payload) = @_;
- if($kernel) {
- $code.=".section .rodata.cst$size.L$name, \"aM\", \@progbits, $size\n";
- $code.=".align $size\n";
- $code.=".L$name:\n";
- $code.=".$type $payload\n";
- } else {
- $code.=".L$name:\n";
- $code.=".$type $payload\n";
- }
-}
-
-sub declare_function() {
- my ($name, $align, $nargs) = @_;
- if($kernel) {
- $code .= ".align $align\n";
- $code .= "SYM_FUNC_START($name)\n";
- $code .= ".L$name:\n";
- } else {
- $code .= ".globl $name\n";
- $code .= ".type $name,\@function,$nargs\n";
- $code .= ".align $align\n";
- $code .= "$name:\n";
- }
-}
-
-sub end_function() {
- my ($name) = @_;
- if($kernel) {
- $code .= "SYM_FUNC_END($name)\n";
- } else {
- $code .= ".size $name,.-$name\n";
- }
-}
-
-if(!$kernel) {
- $code .= ".text\n";
-}
-&declare_variable('zero', 16, 'long', '0,0,0,0');
-&declare_variable('one', 16, 'long', '1,0,0,0');
-&declare_variable('inc', 16, 'long', '0,1,2,3');
-&declare_variable('four', 16, 'long', '4,4,4,4');
-&declare_variable('incy', 32, 'long', '0,2,4,6,1,3,5,7');
-&declare_variable('eight', 32, 'long', '8,8,8,8,8,8,8,8');
-&declare_variable('rot16', 16, 'byte', '0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd');
-&declare_variable('rot24', 16, 'byte', '0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe');
-&declare_variable('twoy', 32, 'long', '2,0,0,0, 2,0,0,0');
-&declare_variable('zeroz', 64, 'long', '0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0');
-&declare_variable('fourz', 64, 'long', '4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0');
-&declare_variable('incz', 64, 'long', '0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15');
-&declare_variable('sixteen', 64, 'long', '16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16');
-&declare_variable('sigma', 16, 'ascii', '"expand 32-byte k"');
-
-$code.=<<___ if !$kernel;
-.asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
-___
-$code.=".text\n";
-
-sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
- my $arg = pop;
- $arg = "\$$arg" if ($arg*1 eq $arg);
- $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
-}
-
-@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
- "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
-@t=("%esi","%edi");
-
-sub ROUND { # critical path is 24 cycles per round
-my ($a0,$b0,$c0,$d0)=@_;
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my ($xc,$xc_)=map("\"$_\"",@t);
-my @x=map("\"$_\"",@x);
-
- # Consider order in which variables are addressed by their
- # index:
- #
- # a b c d
- #
- # 0 4 8 12 < even round
- # 1 5 9 13
- # 2 6 10 14
- # 3 7 11 15
- # 0 5 10 15 < odd round
- # 1 6 11 12
- # 2 7 8 13
- # 3 4 9 14
- #
- # 'a', 'b' and 'd's are permanently allocated in registers,
- # @x[0..7,12..15], while 'c's are maintained in memory. If
-	# you observe the 'c' column, you'll notice that each pair of 'c's
-	# is invariant between rounds. This means that we have to reload
-	# them once per round, in the middle. This is why you'll see a
-	# bunch of 'c' stores and loads in the middle, but none at the
-	# beginning or end.
-
- # Normally instructions would be interleaved to favour in-order
- # execution. Generally out-of-order cores manage it gracefully,
- # but not this time for some reason. As in-order execution
-	# cores are a dying breed and old Atom is the only one around,
- # instructions are left uninterleaved. Besides, Atom is better
- # off executing 1xSSSE3 code anyway...
-
- (
- "&add (@x[$a0],@x[$b0])", # Q1
- "&xor (@x[$d0],@x[$a0])",
- "&rol (@x[$d0],16)",
- "&add (@x[$a1],@x[$b1])", # Q2
- "&xor (@x[$d1],@x[$a1])",
- "&rol (@x[$d1],16)",
-
- "&add ($xc,@x[$d0])",
- "&xor (@x[$b0],$xc)",
- "&rol (@x[$b0],12)",
- "&add ($xc_,@x[$d1])",
- "&xor (@x[$b1],$xc_)",
- "&rol (@x[$b1],12)",
-
- "&add (@x[$a0],@x[$b0])",
- "&xor (@x[$d0],@x[$a0])",
- "&rol (@x[$d0],8)",
- "&add (@x[$a1],@x[$b1])",
- "&xor (@x[$d1],@x[$a1])",
- "&rol (@x[$d1],8)",
-
- "&add ($xc,@x[$d0])",
- "&xor (@x[$b0],$xc)",
- "&rol (@x[$b0],7)",
- "&add ($xc_,@x[$d1])",
- "&xor (@x[$b1],$xc_)",
- "&rol (@x[$b1],7)",
-
- "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
- "&mov (\"4*$c1(%rsp)\",$xc_)",
- "&mov ($xc,\"4*$c2(%rsp)\")",
- "&mov ($xc_,\"4*$c3(%rsp)\")",
-
- "&add (@x[$a2],@x[$b2])", # Q3
- "&xor (@x[$d2],@x[$a2])",
- "&rol (@x[$d2],16)",
- "&add (@x[$a3],@x[$b3])", # Q4
- "&xor (@x[$d3],@x[$a3])",
- "&rol (@x[$d3],16)",
-
- "&add ($xc,@x[$d2])",
- "&xor (@x[$b2],$xc)",
- "&rol (@x[$b2],12)",
- "&add ($xc_,@x[$d3])",
- "&xor (@x[$b3],$xc_)",
- "&rol (@x[$b3],12)",
-
- "&add (@x[$a2],@x[$b2])",
- "&xor (@x[$d2],@x[$a2])",
- "&rol (@x[$d2],8)",
- "&add (@x[$a3],@x[$b3])",
- "&xor (@x[$d3],@x[$a3])",
- "&rol (@x[$d3],8)",
-
- "&add ($xc,@x[$d2])",
- "&xor (@x[$b2],$xc)",
- "&rol (@x[$b2],7)",
- "&add ($xc_,@x[$d3])",
- "&xor (@x[$b3],$xc_)",
- "&rol (@x[$b3],7)"
- );
-}
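
The ordering comment inside ROUND() above can be made concrete: each successive (a,b,c,d) quadruple is derived by keeping the row base (index & ~3) and advancing the position within the row ((index + 1) & 3), which is exactly what the perl map() expressions do. A small stand-alone C sketch of that index arithmetic (hypothetical names) reproduces the table from the comment:

#include <stdio.h>

/* Mirror of the perl expression ($_ & ~3) + (($_ + 1) & 3). */
static int next_lane(int i)
{
	return (i & ~3) + ((i + 1) & 3);
}

int main(void)
{
	int q[4] = { 0, 4, 8, 12 };	/* even round; use {0,5,10,15} for odd */
	int row, col;

	for (row = 0; row < 4; row++) {
		printf("%2d %2d %2d %2d\n", q[0], q[1], q[2], q[3]);
		for (col = 0; col < 4; col++)
			q[col] = next_lane(q[col]);
	}
	return 0;
}
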
-
-########################################################################
-# Generic code path that handles all lengths on pre-SSSE3 processors.
-if(!$kernel) {
-&declare_function("chacha20_ctr32", 64, 5);
-$code.=<<___;
-.cfi_startproc
- cmp \$0,$len
- je .Lno_data
- mov OPENSSL_ia32cap_P+4(%rip),%r9
-___
-$code.=<<___ if ($avx>2);
- bt \$48,%r9 # check for AVX512F
- jc .Lchacha20_avx512
- test %r9,%r9 # check for AVX512VL
- js .Lchacha20_avx512vl
-___
-$code.=<<___;
- test \$`1<<(41-32)`,%r9d
- jnz .Lchacha20_ssse3
-___
-$code.=<<___;
- push %rbx
-.cfi_push %rbx
- push %rbp
-.cfi_push %rbp
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
- sub \$64+24,%rsp
-.cfi_adjust_cfa_offset 64+24
-.Lctr32_body:
-
- #movdqa .Lsigma(%rip),%xmm0
- movdqu ($key),%xmm1
- movdqu 16($key),%xmm2
- movdqu ($counter),%xmm3
- movdqa .Lone(%rip),%xmm4
-
- #movdqa %xmm0,4*0(%rsp) # key[0]
- movdqa %xmm1,4*4(%rsp) # key[1]
- movdqa %xmm2,4*8(%rsp) # key[2]
- movdqa %xmm3,4*12(%rsp) # key[3]
- mov $len,%rbp # reassign $len
- jmp .Loop_outer
-
-.align 32
-.Loop_outer:
- mov \$0x61707865,@x[0] # 'expa'
- mov \$0x3320646e,@x[1] # 'nd 3'
- mov \$0x79622d32,@x[2] # '2-by'
- mov \$0x6b206574,@x[3] # 'te k'
- mov 4*4(%rsp),@x[4]
- mov 4*5(%rsp),@x[5]
- mov 4*6(%rsp),@x[6]
- mov 4*7(%rsp),@x[7]
- movd %xmm3,@x[12]
- mov 4*13(%rsp),@x[13]
- mov 4*14(%rsp),@x[14]
- mov 4*15(%rsp),@x[15]
-
- mov %rbp,64+0(%rsp) # save len
- mov \$10,%ebp
- mov $inp,64+8(%rsp) # save inp
- movq %xmm2,%rsi # "@x[8]"
- mov $out,64+16(%rsp) # save out
- mov %rsi,%rdi
- shr \$32,%rdi # "@x[9]"
- jmp .Loop
-
-.align 32
-.Loop:
-___
- foreach (&ROUND (0, 4, 8,12)) { eval; }
- foreach (&ROUND (0, 5,10,15)) { eval; }
- &dec ("%ebp");
- &jnz (".Loop");
-
-$code.=<<___;
- mov @t[1],4*9(%rsp) # modulo-scheduled
- mov @t[0],4*8(%rsp)
- mov 64(%rsp),%rbp # load len
- movdqa %xmm2,%xmm1
- mov 64+8(%rsp),$inp # load inp
- paddd %xmm4,%xmm3 # increment counter
- mov 64+16(%rsp),$out # load out
-
- add \$0x61707865,@x[0] # 'expa'
- add \$0x3320646e,@x[1] # 'nd 3'
- add \$0x79622d32,@x[2] # '2-by'
- add \$0x6b206574,@x[3] # 'te k'
- add 4*4(%rsp),@x[4]
- add 4*5(%rsp),@x[5]
- add 4*6(%rsp),@x[6]
- add 4*7(%rsp),@x[7]
- add 4*12(%rsp),@x[12]
- add 4*13(%rsp),@x[13]
- add 4*14(%rsp),@x[14]
- add 4*15(%rsp),@x[15]
- paddd 4*8(%rsp),%xmm1
-
- cmp \$64,%rbp
- jb .Ltail
-
- xor 4*0($inp),@x[0] # xor with input
- xor 4*1($inp),@x[1]
- xor 4*2($inp),@x[2]
- xor 4*3($inp),@x[3]
- xor 4*4($inp),@x[4]
- xor 4*5($inp),@x[5]
- xor 4*6($inp),@x[6]
- xor 4*7($inp),@x[7]
- movdqu 4*8($inp),%xmm0
- xor 4*12($inp),@x[12]
- xor 4*13($inp),@x[13]
- xor 4*14($inp),@x[14]
- xor 4*15($inp),@x[15]
- lea 4*16($inp),$inp # inp+=64
- pxor %xmm1,%xmm0
-
- movdqa %xmm2,4*8(%rsp)
- movd %xmm3,4*12(%rsp)
-
- mov @x[0],4*0($out) # write output
- mov @x[1],4*1($out)
- mov @x[2],4*2($out)
- mov @x[3],4*3($out)
- mov @x[4],4*4($out)
- mov @x[5],4*5($out)
- mov @x[6],4*6($out)
- mov @x[7],4*7($out)
- movdqu %xmm0,4*8($out)
- mov @x[12],4*12($out)
- mov @x[13],4*13($out)
- mov @x[14],4*14($out)
- mov @x[15],4*15($out)
- lea 4*16($out),$out # out+=64
-
- sub \$64,%rbp
- jnz .Loop_outer
-
- jmp .Ldone
-
-.align 16
-.Ltail:
- mov @x[0],4*0(%rsp)
- mov @x[1],4*1(%rsp)
- xor %rbx,%rbx
- mov @x[2],4*2(%rsp)
- mov @x[3],4*3(%rsp)
- mov @x[4],4*4(%rsp)
- mov @x[5],4*5(%rsp)
- mov @x[6],4*6(%rsp)
- mov @x[7],4*7(%rsp)
- movdqa %xmm1,4*8(%rsp)
- mov @x[12],4*12(%rsp)
- mov @x[13],4*13(%rsp)
- mov @x[14],4*14(%rsp)
- mov @x[15],4*15(%rsp)
-
-.Loop_tail:
- movzb ($inp,%rbx),%eax
- movzb (%rsp,%rbx),%edx
- lea 1(%rbx),%rbx
- xor %edx,%eax
- mov %al,-1($out,%rbx)
- dec %rbp
- jnz .Loop_tail
-
-.Ldone:
- add \$64+24,%rsp
-.cfi_adjust_cfa_offset -64-24
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbp
-.cfi_restore %rbp
- pop %rbx
-.cfi_restore %rbx
-.Lno_data:
- ret
-.cfi_endproc
-___
-&end_function("chacha20_ctr32");
-}
-
-########################################################################
-# SSSE3 code path that handles shorter lengths
-{
-my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
-
-sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
- &paddd ($a,$b);
- &pxor ($d,$a);
- &pshufb ($d,$rot16);
-
- &paddd ($c,$d);
- &pxor ($b,$c);
- &movdqa ($t,$b);
- &psrld ($b,20);
- &pslld ($t,12);
- &por ($b,$t);
-
- &paddd ($a,$b);
- &pxor ($d,$a);
- &pshufb ($d,$rot24);
-
- &paddd ($c,$d);
- &pxor ($b,$c);
- &movdqa ($t,$b);
- &psrld ($b,25);
- &pslld ($t,7);
- &por ($b,$t);
-}
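
SSSE3ROUND above uses two different rotation techniques: the byte-aligned amounts (16 and 8) go through pshufb with the .Lrot16/.Lrot24 byte-permutation constants, while 12 and 7 fall back to a shift/shift/or sequence. An intrinsic-level C sketch of both techniques, illustrative only (compile with -mssse3):

#include <tmmintrin.h>	/* SSSE3: _mm_shuffle_epi8 */

/* Rotate each 32-bit lane left by 16 via a byte shuffle (cf. .Lrot16). */
static inline __m128i rotl32x4_16(__m128i v)
{
	const __m128i rot16 = _mm_setr_epi8(2, 3, 0, 1, 6, 7, 4, 5,
					    10, 11, 8, 9, 14, 15, 12, 13);
	return _mm_shuffle_epi8(v, rot16);
}

/* Rotate each 32-bit lane left by 12 via shifts, since pshufb cannot
 * express a rotation that is not a whole number of bytes. */
static inline __m128i rotl32x4_12(__m128i v)
{
	return _mm_or_si128(_mm_slli_epi32(v, 12), _mm_srli_epi32(v, 20));
}
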
-
-my $xframe = $win64 ? 32+8 : 8;
-
-if($kernel) {
- $code .= "#ifdef CONFIG_AS_SSSE3\n";
-}
-
-if($kernel) {
-&declare_function("hchacha20_ssse3", 32, 5);
-$code.=<<___;
- movdqa .Lsigma(%rip),$a
- movdqu ($len),$b
- movdqu 16($len),$c
- movdqu ($inp),$d
-	# This code is only used when targeting the kernel.
-	# If targeting win64, xmm{6,7} preservation needs to be added.
- movdqa .Lrot16(%rip),$rot16
- movdqa .Lrot24(%rip),$rot24
- mov \$10,$counter # reuse $counter
- jmp 1f
-.align 32
-1:
-___
- &SSSE3ROUND();
- &pshufd ($a,$a,0b10010011);
- &pshufd ($d,$d,0b01001110);
- &pshufd ($c,$c,0b00111001);
- &nop ();
-
- &SSSE3ROUND();
- &pshufd ($a,$a,0b00111001);
- &pshufd ($d,$d,0b01001110);
- &pshufd ($c,$c,0b10010011);
-
- &dec ($counter);
- &jnz ("1b");
-
-$code.=<<___;
- movdqu $a, ($out)
- movdqu $d, 16($out)
- ret
-___
-&end_function("hchacha20_ssse3");
-}
-
-&declare_function("chacha20_ssse3", 32, 5);
-$code.=<<___;
-.cfi_startproc
- lea 8(%rsp),%r10 # frame pointer
-.cfi_def_cfa_register %r10
-___
-$code.=<<___ if ($avx && !$kernel);
- test \$`1<<(43-32)`,%r10d
- jnz .Lchacha20_4xop # XOP is fastest even if we use 1/4
-___
-$code.=<<___;
- cmp \$128,$len # we might throw away some data,
- je .Lchacha20_128
- ja .Lchacha20_4x # but overall it won't be slower
-
-.Ldo_ssse3_after_all:
- sub \$64+$xframe,%rsp
- and \$-16,%rsp
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,-0x30(%r10)
- movaps %xmm7,-0x20(%r10)
-.Lssse3_body:
-___
-$code.=<<___;
- movdqa .Lsigma(%rip),$a
- movdqu ($key),$b
- movdqu 16($key),$c
- movdqu ($counter),$d
- movdqa .Lrot16(%rip),$rot16
- movdqa .Lrot24(%rip),$rot24
-
- movdqa $a,0x00(%rsp)
- movdqa $b,0x10(%rsp)
- movdqa $c,0x20(%rsp)
- movdqa $d,0x30(%rsp)
- mov \$10,$counter # reuse $counter
- jmp .Loop_ssse3
-
-.align 32
-.Loop_outer_ssse3:
- movdqa .Lone(%rip),$d
- movdqa 0x00(%rsp),$a
- movdqa 0x10(%rsp),$b
- movdqa 0x20(%rsp),$c
- paddd 0x30(%rsp),$d
- mov \$10,$counter
- movdqa $d,0x30(%rsp)
- jmp .Loop_ssse3
-
-.align 32
-.Loop_ssse3:
-___
- &SSSE3ROUND();
- &pshufd ($a,$a,0b10010011);
- &pshufd ($d,$d,0b01001110);
- &pshufd ($c,$c,0b00111001);
- &nop ();
-
- &SSSE3ROUND();
- &pshufd ($a,$a,0b00111001);
- &pshufd ($d,$d,0b01001110);
- &pshufd ($c,$c,0b10010011);
-
- &dec ($counter);
- &jnz (".Loop_ssse3");
-
-$code.=<<___;
- paddd 0x00(%rsp),$a
- paddd 0x10(%rsp),$b
- paddd 0x20(%rsp),$c
- paddd 0x30(%rsp),$d
-
- cmp \$64,$len
- jb .Ltail_ssse3
-
- movdqu 0x00($inp),$t
- movdqu 0x10($inp),$t1
- pxor $t,$a # xor with input
- movdqu 0x20($inp),$t
- pxor $t1,$b
- movdqu 0x30($inp),$t1
- lea 0x40($inp),$inp # inp+=64
- pxor $t,$c
- pxor $t1,$d
-
- movdqu $a,0x00($out) # write output
- movdqu $b,0x10($out)
- movdqu $c,0x20($out)
- movdqu $d,0x30($out)
- lea 0x40($out),$out # out+=64
-
- sub \$64,$len
- jnz .Loop_outer_ssse3
-
- jmp .Ldone_ssse3
-
-.align 16
-.Ltail_ssse3:
- movdqa $a,0x00(%rsp)
- movdqa $b,0x10(%rsp)
- movdqa $c,0x20(%rsp)
- movdqa $d,0x30(%rsp)
- xor $counter,$counter
-
-.Loop_tail_ssse3:
- movzb ($inp,$counter),%eax
- movzb (%rsp,$counter),%ecx
- lea 1($counter),$counter
- xor %ecx,%eax
- mov %al,-1($out,$counter)
- dec $len
- jnz .Loop_tail_ssse3
-
-.Ldone_ssse3:
-___
-$code.=<<___ if ($win64);
- movaps -0x30(%r10),%xmm6
- movaps -0x20(%r10),%xmm7
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.Lssse3_epilogue:
- ret
-.cfi_endproc
-___
-}
-&end_function("chacha20_ssse3");
-
-########################################################################
-# SSSE3 code path that handles 128-byte inputs
-{
-my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
-my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));
-
-sub SSSE3ROUND_2x {
- &paddd ($a,$b);
- &pxor ($d,$a);
- &paddd ($a1,$b1);
- &pxor ($d1,$a1);
- &pshufb ($d,$rot16);
- &pshufb($d1,$rot16);
-
- &paddd ($c,$d);
- &paddd ($c1,$d1);
- &pxor ($b,$c);
- &pxor ($b1,$c1);
- &movdqa ($t,$b);
- &psrld ($b,20);
- &movdqa($t1,$b1);
- &pslld ($t,12);
- &psrld ($b1,20);
- &por ($b,$t);
- &pslld ($t1,12);
- &por ($b1,$t1);
-
- &paddd ($a,$b);
- &pxor ($d,$a);
- &paddd ($a1,$b1);
- &pxor ($d1,$a1);
- &pshufb ($d,$rot24);
- &pshufb($d1,$rot24);
-
- &paddd ($c,$d);
- &paddd ($c1,$d1);
- &pxor ($b,$c);
- &pxor ($b1,$c1);
- &movdqa ($t,$b);
- &psrld ($b,25);
- &movdqa($t1,$b1);
- &pslld ($t,7);
- &psrld ($b1,25);
- &por ($b,$t);
- &pslld ($t1,7);
- &por ($b1,$t1);
-}
-
-my $xframe = $win64 ? 0x68 : 8;
-
-$code.=<<___;
-.type chacha20_128,\@function,5
-.align 32
-chacha20_128:
-.cfi_startproc
-.Lchacha20_128:
- lea 8(%rsp),%r10 # frame pointer
-.cfi_def_cfa_register %r10
- sub \$64+$xframe,%rsp
- and \$-16,%rsp
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,-0x70(%r10)
- movaps %xmm7,-0x60(%r10)
- movaps %xmm8,-0x50(%r10)
- movaps %xmm9,-0x40(%r10)
- movaps %xmm10,-0x30(%r10)
- movaps %xmm11,-0x20(%r10)
-.L128_body:
-___
-$code.=<<___;
- movdqa .Lsigma(%rip),$a
- movdqu ($key),$b
- movdqu 16($key),$c
- movdqu ($counter),$d
- movdqa .Lone(%rip),$d1
- movdqa .Lrot16(%rip),$rot16
- movdqa .Lrot24(%rip),$rot24
-
- movdqa $a,$a1
- movdqa $a,0x00(%rsp)
- movdqa $b,$b1
- movdqa $b,0x10(%rsp)
- movdqa $c,$c1
- movdqa $c,0x20(%rsp)
- paddd $d,$d1
- movdqa $d,0x30(%rsp)
- mov \$10,$counter # reuse $counter
- jmp .Loop_128
-
-.align 32
-.Loop_128:
-___
- &SSSE3ROUND_2x();
- &pshufd ($a,$a,0b10010011);
- &pshufd ($d,$d,0b01001110);
- &pshufd ($c,$c,0b00111001);
- &pshufd ($a1,$a1,0b10010011);
- &pshufd ($d1,$d1,0b01001110);
- &pshufd ($c1,$c1,0b00111001);
-
- &SSSE3ROUND_2x();
- &pshufd ($a,$a,0b00111001);
- &pshufd ($d,$d,0b01001110);
- &pshufd ($c,$c,0b10010011);
- &pshufd ($a1,$a1,0b00111001);
- &pshufd ($d1,$d1,0b01001110);
- &pshufd ($c1,$c1,0b10010011);
-
- &dec ($counter);
- &jnz (".Loop_128");
-
-$code.=<<___;
- paddd 0x00(%rsp),$a
- paddd 0x10(%rsp),$b
- paddd 0x20(%rsp),$c
- paddd 0x30(%rsp),$d
- paddd .Lone(%rip),$d1
- paddd 0x00(%rsp),$a1
- paddd 0x10(%rsp),$b1
- paddd 0x20(%rsp),$c1
- paddd 0x30(%rsp),$d1
-
- movdqu 0x00($inp),$t
- movdqu 0x10($inp),$t1
- pxor $t,$a # xor with input
- movdqu 0x20($inp),$t
- pxor $t1,$b
- movdqu 0x30($inp),$t1
- pxor $t,$c
- movdqu 0x40($inp),$t
- pxor $t1,$d
- movdqu 0x50($inp),$t1
- pxor $t,$a1
- movdqu 0x60($inp),$t
- pxor $t1,$b1
- movdqu 0x70($inp),$t1
- pxor $t,$c1
- pxor $t1,$d1
-
- movdqu $a,0x00($out) # write output
- movdqu $b,0x10($out)
- movdqu $c,0x20($out)
- movdqu $d,0x30($out)
- movdqu $a1,0x40($out)
- movdqu $b1,0x50($out)
- movdqu $c1,0x60($out)
- movdqu $d1,0x70($out)
-___
-$code.=<<___ if ($win64);
- movaps -0x70(%r10),%xmm6
- movaps -0x60(%r10),%xmm7
- movaps -0x50(%r10),%xmm8
- movaps -0x40(%r10),%xmm9
- movaps -0x30(%r10),%xmm10
- movaps -0x20(%r10),%xmm11
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.L128_epilogue:
- ret
-.cfi_endproc
-.size chacha20_128,.-chacha20_128
-___
-}
-
-########################################################################
-# SSSE3 code path that handles longer messages.
-{
-# assign variables to favor Atom front-end
-my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
- $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
-my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
-
-sub SSSE3_lane_ROUND {
-my ($a0,$b0,$c0,$d0)=@_;
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
-my @x=map("\"$_\"",@xx);
-
- # Consider order in which variables are addressed by their
- # index:
- #
- # a b c d
- #
- # 0 4 8 12 < even round
- # 1 5 9 13
- # 2 6 10 14
- # 3 7 11 15
- # 0 5 10 15 < odd round
- # 1 6 11 12
- # 2 7 8 13
- # 3 4 9 14
- #
- # 'a', 'b' and 'd's are permanently allocated in registers,
- # @x[0..7,12..15], while 'c's are maintained in memory. If
-	# you observe the 'c' column, you'll notice that each pair of 'c's
-	# is invariant between rounds. This means that we have to reload
-	# them once per round, in the middle. This is why you'll see a
-	# bunch of 'c' stores and loads in the middle, but none at the
-	# beginning or end.
-
- (
- "&paddd (@x[$a0],@x[$b0])", # Q1
- "&paddd (@x[$a1],@x[$b1])", # Q2
- "&pxor (@x[$d0],@x[$a0])",
- "&pxor (@x[$d1],@x[$a1])",
- "&pshufb (@x[$d0],$t1)",
- "&pshufb (@x[$d1],$t1)",
-
- "&paddd ($xc,@x[$d0])",
- "&paddd ($xc_,@x[$d1])",
- "&pxor (@x[$b0],$xc)",
- "&pxor (@x[$b1],$xc_)",
- "&movdqa ($t0,@x[$b0])",
- "&pslld (@x[$b0],12)",
- "&psrld ($t0,20)",
- "&movdqa ($t1,@x[$b1])",
- "&pslld (@x[$b1],12)",
- "&por (@x[$b0],$t0)",
- "&psrld ($t1,20)",
- "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
- "&por (@x[$b1],$t1)",
-
- "&paddd (@x[$a0],@x[$b0])",
- "&paddd (@x[$a1],@x[$b1])",
- "&pxor (@x[$d0],@x[$a0])",
- "&pxor (@x[$d1],@x[$a1])",
- "&pshufb (@x[$d0],$t0)",
- "&pshufb (@x[$d1],$t0)",
-
- "&paddd ($xc,@x[$d0])",
- "&paddd ($xc_,@x[$d1])",
- "&pxor (@x[$b0],$xc)",
- "&pxor (@x[$b1],$xc_)",
- "&movdqa ($t1,@x[$b0])",
- "&pslld (@x[$b0],7)",
- "&psrld ($t1,25)",
- "&movdqa ($t0,@x[$b1])",
- "&pslld (@x[$b1],7)",
- "&por (@x[$b0],$t1)",
- "&psrld ($t0,25)",
- "&movdqa ($t1,'(%r9)')", # .Lrot16(%rip)
- "&por (@x[$b1],$t0)",
-
- "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
- "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)",
- "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")",
- "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")",
-
- "&paddd (@x[$a2],@x[$b2])", # Q3
- "&paddd (@x[$a3],@x[$b3])", # Q4
- "&pxor (@x[$d2],@x[$a2])",
- "&pxor (@x[$d3],@x[$a3])",
- "&pshufb (@x[$d2],$t1)",
- "&pshufb (@x[$d3],$t1)",
-
- "&paddd ($xc,@x[$d2])",
- "&paddd ($xc_,@x[$d3])",
- "&pxor (@x[$b2],$xc)",
- "&pxor (@x[$b3],$xc_)",
- "&movdqa ($t0,@x[$b2])",
- "&pslld (@x[$b2],12)",
- "&psrld ($t0,20)",
- "&movdqa ($t1,@x[$b3])",
- "&pslld (@x[$b3],12)",
- "&por (@x[$b2],$t0)",
- "&psrld ($t1,20)",
- "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
- "&por (@x[$b3],$t1)",
-
- "&paddd (@x[$a2],@x[$b2])",
- "&paddd (@x[$a3],@x[$b3])",
- "&pxor (@x[$d2],@x[$a2])",
- "&pxor (@x[$d3],@x[$a3])",
- "&pshufb (@x[$d2],$t0)",
- "&pshufb (@x[$d3],$t0)",
-
- "&paddd ($xc,@x[$d2])",
- "&paddd ($xc_,@x[$d3])",
- "&pxor (@x[$b2],$xc)",
- "&pxor (@x[$b3],$xc_)",
- "&movdqa ($t1,@x[$b2])",
- "&pslld (@x[$b2],7)",
- "&psrld ($t1,25)",
- "&movdqa ($t0,@x[$b3])",
- "&pslld (@x[$b3],7)",
- "&por (@x[$b2],$t1)",
- "&psrld ($t0,25)",
- "&movdqa ($t1,'(%r9)')", # .Lrot16(%rip)
- "&por (@x[$b3],$t0)"
- );
-}
-
-my $xframe = $win64 ? 0xa8 : 8;
-
-$code.=<<___;
-.type chacha20_4x,\@function,5
-.align 32
-chacha20_4x:
-.cfi_startproc
-.Lchacha20_4x:
- lea 8(%rsp),%r10 # frame pointer
-.cfi_def_cfa_register %r10
-___
-$code.=<<___ if (!$kernel);
- mov %r9,%r11
-___
-$code.=<<___ if ($avx>1 && !$kernel);
- shr \$32,%r9 # OPENSSL_ia32cap_P+8
- test \$`1<<5`,%r9 # test AVX2
- jnz .Lchacha20_8x
-___
-$code.=<<___;
- cmp \$192,$len
- ja .Lproceed4x
-___
-$code.=<<___ if (!$kernel);
- and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE
- cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE
- je .Ldo_ssse3_after_all # to detect Atom
-___
-$code.=<<___;
-.Lproceed4x:
- sub \$0x140+$xframe,%rsp
- and \$-16,%rsp
-___
- ################ stack layout
- # +0x00 SIMD equivalent of @x[8-12]
- # ...
- # +0x40 constant copy of key[0-2] smashed by lanes
- # ...
- # +0x100 SIMD counters (with nonce smashed by lanes)
- # ...
- # +0x140
-$code.=<<___ if ($win64);
- movaps %xmm6,-0xb0(%r10)
- movaps %xmm7,-0xa0(%r10)
- movaps %xmm8,-0x90(%r10)
- movaps %xmm9,-0x80(%r10)
- movaps %xmm10,-0x70(%r10)
- movaps %xmm11,-0x60(%r10)
- movaps %xmm12,-0x50(%r10)
- movaps %xmm13,-0x40(%r10)
- movaps %xmm14,-0x30(%r10)
- movaps %xmm15,-0x20(%r10)
-.L4x_body:
-___
-$code.=<<___;
- movdqa .Lsigma(%rip),$xa3 # key[0]
- movdqu ($key),$xb3 # key[1]
- movdqu 16($key),$xt3 # key[2]
- movdqu ($counter),$xd3 # key[3]
- lea 0x100(%rsp),%rcx # size optimization
- lea .Lrot16(%rip),%r9
- lea .Lrot24(%rip),%r11
-
- pshufd \$0x00,$xa3,$xa0 # smash key by lanes...
- pshufd \$0x55,$xa3,$xa1
- movdqa $xa0,0x40(%rsp) # ... and offload
- pshufd \$0xaa,$xa3,$xa2
- movdqa $xa1,0x50(%rsp)
- pshufd \$0xff,$xa3,$xa3
- movdqa $xa2,0x60(%rsp)
- movdqa $xa3,0x70(%rsp)
-
- pshufd \$0x00,$xb3,$xb0
- pshufd \$0x55,$xb3,$xb1
- movdqa $xb0,0x80-0x100(%rcx)
- pshufd \$0xaa,$xb3,$xb2
- movdqa $xb1,0x90-0x100(%rcx)
- pshufd \$0xff,$xb3,$xb3
- movdqa $xb2,0xa0-0x100(%rcx)
- movdqa $xb3,0xb0-0x100(%rcx)
-
- pshufd \$0x00,$xt3,$xt0 # "$xc0"
- pshufd \$0x55,$xt3,$xt1 # "$xc1"
- movdqa $xt0,0xc0-0x100(%rcx)
- pshufd \$0xaa,$xt3,$xt2 # "$xc2"
- movdqa $xt1,0xd0-0x100(%rcx)
- pshufd \$0xff,$xt3,$xt3 # "$xc3"
- movdqa $xt2,0xe0-0x100(%rcx)
- movdqa $xt3,0xf0-0x100(%rcx)
-
- pshufd \$0x00,$xd3,$xd0
- pshufd \$0x55,$xd3,$xd1
- paddd .Linc(%rip),$xd0 # don't save counters yet
- pshufd \$0xaa,$xd3,$xd2
- movdqa $xd1,0x110-0x100(%rcx)
- pshufd \$0xff,$xd3,$xd3
- movdqa $xd2,0x120-0x100(%rcx)
- movdqa $xd3,0x130-0x100(%rcx)
-
- jmp .Loop_enter4x
-
-.align 32
-.Loop_outer4x:
- movdqa 0x40(%rsp),$xa0 # re-load smashed key
- movdqa 0x50(%rsp),$xa1
- movdqa 0x60(%rsp),$xa2
- movdqa 0x70(%rsp),$xa3
- movdqa 0x80-0x100(%rcx),$xb0
- movdqa 0x90-0x100(%rcx),$xb1
- movdqa 0xa0-0x100(%rcx),$xb2
- movdqa 0xb0-0x100(%rcx),$xb3
- movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
- movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
- movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
- movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
- movdqa 0x100-0x100(%rcx),$xd0
- movdqa 0x110-0x100(%rcx),$xd1
- movdqa 0x120-0x100(%rcx),$xd2
- movdqa 0x130-0x100(%rcx),$xd3
- paddd .Lfour(%rip),$xd0 # next SIMD counters
-
-.Loop_enter4x:
- movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]"
- movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]"
- movdqa (%r9),$xt3 # .Lrot16(%rip)
- mov \$10,%eax
- movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
- jmp .Loop4x
-
-.align 32
-.Loop4x:
-___
- foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
- foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- dec %eax
- jnz .Loop4x
-
- paddd 0x40(%rsp),$xa0 # accumulate key material
- paddd 0x50(%rsp),$xa1
- paddd 0x60(%rsp),$xa2
- paddd 0x70(%rsp),$xa3
-
- movdqa $xa0,$xt2 # "de-interlace" data
- punpckldq $xa1,$xa0
- movdqa $xa2,$xt3
- punpckldq $xa3,$xa2
- punpckhdq $xa1,$xt2
- punpckhdq $xa3,$xt3
- movdqa $xa0,$xa1
- punpcklqdq $xa2,$xa0 # "a0"
- movdqa $xt2,$xa3
- punpcklqdq $xt3,$xt2 # "a2"
- punpckhqdq $xa2,$xa1 # "a1"
- punpckhqdq $xt3,$xa3 # "a3"
-___
- ($xa2,$xt2)=($xt2,$xa2);
-$code.=<<___;
- paddd 0x80-0x100(%rcx),$xb0
- paddd 0x90-0x100(%rcx),$xb1
- paddd 0xa0-0x100(%rcx),$xb2
- paddd 0xb0-0x100(%rcx),$xb3
-
- movdqa $xa0,0x00(%rsp) # offload $xaN
- movdqa $xa1,0x10(%rsp)
- movdqa 0x20(%rsp),$xa0 # "xc2"
- movdqa 0x30(%rsp),$xa1 # "xc3"
-
- movdqa $xb0,$xt2
- punpckldq $xb1,$xb0
- movdqa $xb2,$xt3
- punpckldq $xb3,$xb2
- punpckhdq $xb1,$xt2
- punpckhdq $xb3,$xt3
- movdqa $xb0,$xb1
- punpcklqdq $xb2,$xb0 # "b0"
- movdqa $xt2,$xb3
- punpcklqdq $xt3,$xt2 # "b2"
- punpckhqdq $xb2,$xb1 # "b1"
- punpckhqdq $xt3,$xb3 # "b3"
-___
- ($xb2,$xt2)=($xt2,$xb2);
- my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
-$code.=<<___;
- paddd 0xc0-0x100(%rcx),$xc0
- paddd 0xd0-0x100(%rcx),$xc1
- paddd 0xe0-0x100(%rcx),$xc2
- paddd 0xf0-0x100(%rcx),$xc3
-
- movdqa $xa2,0x20(%rsp) # keep offloading $xaN
- movdqa $xa3,0x30(%rsp)
-
- movdqa $xc0,$xt2
- punpckldq $xc1,$xc0
- movdqa $xc2,$xt3
- punpckldq $xc3,$xc2
- punpckhdq $xc1,$xt2
- punpckhdq $xc3,$xt3
- movdqa $xc0,$xc1
- punpcklqdq $xc2,$xc0 # "c0"
- movdqa $xt2,$xc3
- punpcklqdq $xt3,$xt2 # "c2"
- punpckhqdq $xc2,$xc1 # "c1"
- punpckhqdq $xt3,$xc3 # "c3"
-___
- ($xc2,$xt2)=($xt2,$xc2);
- ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary
-$code.=<<___;
- paddd 0x100-0x100(%rcx),$xd0
- paddd 0x110-0x100(%rcx),$xd1
- paddd 0x120-0x100(%rcx),$xd2
- paddd 0x130-0x100(%rcx),$xd3
-
- movdqa $xd0,$xt2
- punpckldq $xd1,$xd0
- movdqa $xd2,$xt3
- punpckldq $xd3,$xd2
- punpckhdq $xd1,$xt2
- punpckhdq $xd3,$xt3
- movdqa $xd0,$xd1
- punpcklqdq $xd2,$xd0 # "d0"
- movdqa $xt2,$xd3
- punpcklqdq $xt3,$xt2 # "d2"
- punpckhqdq $xd2,$xd1 # "d1"
- punpckhqdq $xt3,$xd3 # "d3"
-___
- ($xd2,$xt2)=($xt2,$xd2);
-$code.=<<___;
- cmp \$64*4,$len
- jb .Ltail4x
-
- movdqu 0x00($inp),$xt0 # xor with input
- movdqu 0x10($inp),$xt1
- movdqu 0x20($inp),$xt2
- movdqu 0x30($inp),$xt3
- pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
- pxor $xb0,$xt1
- pxor $xc0,$xt2
- pxor $xd0,$xt3
-
- movdqu $xt0,0x00($out)
- movdqu 0x40($inp),$xt0
- movdqu $xt1,0x10($out)
- movdqu 0x50($inp),$xt1
- movdqu $xt2,0x20($out)
- movdqu 0x60($inp),$xt2
- movdqu $xt3,0x30($out)
- movdqu 0x70($inp),$xt3
- lea 0x80($inp),$inp # size optimization
- pxor 0x10(%rsp),$xt0
- pxor $xb1,$xt1
- pxor $xc1,$xt2
- pxor $xd1,$xt3
-
- movdqu $xt0,0x40($out)
- movdqu 0x00($inp),$xt0
- movdqu $xt1,0x50($out)
- movdqu 0x10($inp),$xt1
- movdqu $xt2,0x60($out)
- movdqu 0x20($inp),$xt2
- movdqu $xt3,0x70($out)
- lea 0x80($out),$out # size optimization
- movdqu 0x30($inp),$xt3
- pxor 0x20(%rsp),$xt0
- pxor $xb2,$xt1
- pxor $xc2,$xt2
- pxor $xd2,$xt3
-
- movdqu $xt0,0x00($out)
- movdqu 0x40($inp),$xt0
- movdqu $xt1,0x10($out)
- movdqu 0x50($inp),$xt1
- movdqu $xt2,0x20($out)
- movdqu 0x60($inp),$xt2
- movdqu $xt3,0x30($out)
- movdqu 0x70($inp),$xt3
- lea 0x80($inp),$inp # inp+=64*4
- pxor 0x30(%rsp),$xt0
- pxor $xb3,$xt1
- pxor $xc3,$xt2
- pxor $xd3,$xt3
- movdqu $xt0,0x40($out)
- movdqu $xt1,0x50($out)
- movdqu $xt2,0x60($out)
- movdqu $xt3,0x70($out)
- lea 0x80($out),$out # out+=64*4
-
- sub \$64*4,$len
- jnz .Loop_outer4x
-
- jmp .Ldone4x
-
-.Ltail4x:
- cmp \$192,$len
- jae .L192_or_more4x
- cmp \$128,$len
- jae .L128_or_more4x
- cmp \$64,$len
- jae .L64_or_more4x
-
- #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
- xor %r9,%r9
- #movdqa $xt0,0x00(%rsp)
- movdqa $xb0,0x10(%rsp)
- movdqa $xc0,0x20(%rsp)
- movdqa $xd0,0x30(%rsp)
- jmp .Loop_tail4x
-
-.align 32
-.L64_or_more4x:
- movdqu 0x00($inp),$xt0 # xor with input
- movdqu 0x10($inp),$xt1
- movdqu 0x20($inp),$xt2
- movdqu 0x30($inp),$xt3
-	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
- pxor $xb0,$xt1
- pxor $xc0,$xt2
- pxor $xd0,$xt3
- movdqu $xt0,0x00($out)
- movdqu $xt1,0x10($out)
- movdqu $xt2,0x20($out)
- movdqu $xt3,0x30($out)
- je .Ldone4x
-
- movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember?
- lea 0x40($inp),$inp # inp+=64*1
- xor %r9,%r9
- movdqa $xt0,0x00(%rsp)
- movdqa $xb1,0x10(%rsp)
- lea 0x40($out),$out # out+=64*1
- movdqa $xc1,0x20(%rsp)
- sub \$64,$len # len-=64*1
- movdqa $xd1,0x30(%rsp)
- jmp .Loop_tail4x
-
-.align 32
-.L128_or_more4x:
- movdqu 0x00($inp),$xt0 # xor with input
- movdqu 0x10($inp),$xt1
- movdqu 0x20($inp),$xt2
- movdqu 0x30($inp),$xt3
- pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
- pxor $xb0,$xt1
- pxor $xc0,$xt2
- pxor $xd0,$xt3
-
- movdqu $xt0,0x00($out)
- movdqu 0x40($inp),$xt0
- movdqu $xt1,0x10($out)
- movdqu 0x50($inp),$xt1
- movdqu $xt2,0x20($out)
- movdqu 0x60($inp),$xt2
- movdqu $xt3,0x30($out)
- movdqu 0x70($inp),$xt3
- pxor 0x10(%rsp),$xt0
- pxor $xb1,$xt1
- pxor $xc1,$xt2
- pxor $xd1,$xt3
- movdqu $xt0,0x40($out)
- movdqu $xt1,0x50($out)
- movdqu $xt2,0x60($out)
- movdqu $xt3,0x70($out)
- je .Ldone4x
-
- movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember?
- lea 0x80($inp),$inp # inp+=64*2
- xor %r9,%r9
- movdqa $xt0,0x00(%rsp)
- movdqa $xb2,0x10(%rsp)
- lea 0x80($out),$out # out+=64*2
- movdqa $xc2,0x20(%rsp)
- sub \$128,$len # len-=64*2
- movdqa $xd2,0x30(%rsp)
- jmp .Loop_tail4x
-
-.align 32
-.L192_or_more4x:
- movdqu 0x00($inp),$xt0 # xor with input
- movdqu 0x10($inp),$xt1
- movdqu 0x20($inp),$xt2
- movdqu 0x30($inp),$xt3
- pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
- pxor $xb0,$xt1
- pxor $xc0,$xt2
- pxor $xd0,$xt3
-
- movdqu $xt0,0x00($out)
- movdqu 0x40($inp),$xt0
- movdqu $xt1,0x10($out)
- movdqu 0x50($inp),$xt1
- movdqu $xt2,0x20($out)
- movdqu 0x60($inp),$xt2
- movdqu $xt3,0x30($out)
- movdqu 0x70($inp),$xt3
- lea 0x80($inp),$inp # size optimization
- pxor 0x10(%rsp),$xt0
- pxor $xb1,$xt1
- pxor $xc1,$xt2
- pxor $xd1,$xt3
-
- movdqu $xt0,0x40($out)
- movdqu 0x00($inp),$xt0
- movdqu $xt1,0x50($out)
- movdqu 0x10($inp),$xt1
- movdqu $xt2,0x60($out)
- movdqu 0x20($inp),$xt2
- movdqu $xt3,0x70($out)
- lea 0x80($out),$out # size optimization
- movdqu 0x30($inp),$xt3
- pxor 0x20(%rsp),$xt0
- pxor $xb2,$xt1
- pxor $xc2,$xt2
- pxor $xd2,$xt3
- movdqu $xt0,0x00($out)
- movdqu $xt1,0x10($out)
- movdqu $xt2,0x20($out)
- movdqu $xt3,0x30($out)
- je .Ldone4x
-
- movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember?
- lea 0x40($inp),$inp # inp+=64*3
- xor %r9,%r9
- movdqa $xt0,0x00(%rsp)
- movdqa $xb3,0x10(%rsp)
- lea 0x40($out),$out # out+=64*3
- movdqa $xc3,0x20(%rsp)
- sub \$192,$len # len-=64*3
- movdqa $xd3,0x30(%rsp)
-
-.Loop_tail4x:
- movzb ($inp,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1($out,%r9)
- dec $len
- jnz .Loop_tail4x
-
-.Ldone4x:
-___
-$code.=<<___ if ($win64);
- movaps -0xb0(%r10),%xmm6
- movaps -0xa0(%r10),%xmm7
- movaps -0x90(%r10),%xmm8
- movaps -0x80(%r10),%xmm9
- movaps -0x70(%r10),%xmm10
- movaps -0x60(%r10),%xmm11
- movaps -0x50(%r10),%xmm12
- movaps -0x40(%r10),%xmm13
- movaps -0x30(%r10),%xmm14
- movaps -0x20(%r10),%xmm15
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.L4x_epilogue:
- ret
-.cfi_endproc
-.size chacha20_4x,.-chacha20_4x
-___
-}
-if($kernel) {
- $code .= "#endif\n";
-}
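
The .Loop_tail4x loop above (like .Loop_tail4xop, .Loop_tail8x and .Loop_tail16x later in the file) finishes the final len%64 bytes by staging the unused keystream block on the stack and xoring it against the input one byte at a time. A minimal sketch of that byte loop in standalone Perl, assuming plain byte strings in place of registers and stack slots (tail_xor is a hypothetical helper name, not part of this file):

use strict; use warnings;

# Model of the .Loop_tail*x byte loop: $keystream holds the unused keystream
# block (staged at the top of the stack in the assembly); the remaining
# $len < 64 input bytes are xored against it one byte at a time, just like
# the movzb/xor/mov sequence indexed by %r9.
sub tail_xor {
    my ($in, $keystream, $len) = @_;
    my $out = '';
    for my $i (0 .. $len - 1) {
        $out .= chr(ord(substr($in, $i, 1)) ^ ord(substr($keystream, $i, 1)));
    }
    return $out;
}
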
-
-########################################################################
-# XOP code path that handles all lengths.
-if ($avx && !$kernel) {
-# Some "anomaly" has been observed that depends on instruction size or
-# alignment. If you look closely at the code below you'll notice that
-# the argument order sometimes varies. The order affects the instruction
-# encoding by making it larger, and such fiddling gives a 5% performance
-# improvement. This was measured on an FX-4100...
-
-my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
- $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
-my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
-
-sub XOP_lane_ROUND {
-my ($a0,$b0,$c0,$d0)=@_;
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my @x=map("\"$_\"",@xx);
-
- (
- "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
- "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
- "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
- "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
- "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
- "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
- "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
- "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
- "&vprotd (@x[$d0],@x[$d0],16)",
- "&vprotd (@x[$d1],@x[$d1],16)",
- "&vprotd (@x[$d2],@x[$d2],16)",
- "&vprotd (@x[$d3],@x[$d3],16)",
-
- "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
- "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
- "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
- "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
- "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
- "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
- "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
- "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
- "&vprotd (@x[$b0],@x[$b0],12)",
- "&vprotd (@x[$b1],@x[$b1],12)",
- "&vprotd (@x[$b2],@x[$b2],12)",
- "&vprotd (@x[$b3],@x[$b3],12)",
-
- "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip
- "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip
- "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
- "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
- "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
- "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
- "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
- "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
- "&vprotd (@x[$d0],@x[$d0],8)",
- "&vprotd (@x[$d1],@x[$d1],8)",
- "&vprotd (@x[$d2],@x[$d2],8)",
- "&vprotd (@x[$d3],@x[$d3],8)",
-
- "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
- "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
- "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
- "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
- "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
- "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
- "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
- "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
- "&vprotd (@x[$b0],@x[$b0],7)",
- "&vprotd (@x[$b1],@x[$b1],7)",
- "&vprotd (@x[$b2],@x[$b2],7)",
- "&vprotd (@x[$b3],@x[$b3],7)"
- );
-}
-
-my $xframe = $win64 ? 0xa8 : 8;
-
-&declare_function("chacha20_xop", 32, 5);
-$code.=<<___;
-.cfi_startproc
-.Lchacha20_4xop:
- lea 8(%rsp),%r10 # frame pointer
-.cfi_def_cfa_register %r10
- sub \$0x140+$xframe,%rsp
- and \$-16,%rsp
-___
- ################ stack layout
- # +0x00 SIMD equivalent of @x[8-12]
- # ...
- # +0x40 constant copy of key[0-2] smashed by lanes
- # ...
- # +0x100 SIMD counters (with nonce smashed by lanes)
- # ...
- # +0x140
-$code.=<<___ if ($win64);
- movaps %xmm6,-0xb0(%r10)
- movaps %xmm7,-0xa0(%r10)
- movaps %xmm8,-0x90(%r10)
- movaps %xmm9,-0x80(%r10)
- movaps %xmm10,-0x70(%r10)
- movaps %xmm11,-0x60(%r10)
- movaps %xmm12,-0x50(%r10)
- movaps %xmm13,-0x40(%r10)
- movaps %xmm14,-0x30(%r10)
- movaps %xmm15,-0x20(%r10)
-.L4xop_body:
-___
-$code.=<<___;
- vzeroupper
-
- vmovdqa .Lsigma(%rip),$xa3 # key[0]
- vmovdqu ($key),$xb3 # key[1]
- vmovdqu 16($key),$xt3 # key[2]
- vmovdqu ($counter),$xd3 # key[3]
- lea 0x100(%rsp),%rcx # size optimization
-
- vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
- vpshufd \$0x55,$xa3,$xa1
- vmovdqa $xa0,0x40(%rsp) # ... and offload
- vpshufd \$0xaa,$xa3,$xa2
- vmovdqa $xa1,0x50(%rsp)
- vpshufd \$0xff,$xa3,$xa3
- vmovdqa $xa2,0x60(%rsp)
- vmovdqa $xa3,0x70(%rsp)
-
- vpshufd \$0x00,$xb3,$xb0
- vpshufd \$0x55,$xb3,$xb1
- vmovdqa $xb0,0x80-0x100(%rcx)
- vpshufd \$0xaa,$xb3,$xb2
- vmovdqa $xb1,0x90-0x100(%rcx)
- vpshufd \$0xff,$xb3,$xb3
- vmovdqa $xb2,0xa0-0x100(%rcx)
- vmovdqa $xb3,0xb0-0x100(%rcx)
-
- vpshufd \$0x00,$xt3,$xt0 # "$xc0"
- vpshufd \$0x55,$xt3,$xt1 # "$xc1"
- vmovdqa $xt0,0xc0-0x100(%rcx)
- vpshufd \$0xaa,$xt3,$xt2 # "$xc2"
- vmovdqa $xt1,0xd0-0x100(%rcx)
- vpshufd \$0xff,$xt3,$xt3 # "$xc3"
- vmovdqa $xt2,0xe0-0x100(%rcx)
- vmovdqa $xt3,0xf0-0x100(%rcx)
-
- vpshufd \$0x00,$xd3,$xd0
- vpshufd \$0x55,$xd3,$xd1
- vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet
- vpshufd \$0xaa,$xd3,$xd2
- vmovdqa $xd1,0x110-0x100(%rcx)
- vpshufd \$0xff,$xd3,$xd3
- vmovdqa $xd2,0x120-0x100(%rcx)
- vmovdqa $xd3,0x130-0x100(%rcx)
-
- jmp .Loop_enter4xop
-
-.align 32
-.Loop_outer4xop:
- vmovdqa 0x40(%rsp),$xa0 # re-load smashed key
- vmovdqa 0x50(%rsp),$xa1
- vmovdqa 0x60(%rsp),$xa2
- vmovdqa 0x70(%rsp),$xa3
- vmovdqa 0x80-0x100(%rcx),$xb0
- vmovdqa 0x90-0x100(%rcx),$xb1
- vmovdqa 0xa0-0x100(%rcx),$xb2
- vmovdqa 0xb0-0x100(%rcx),$xb3
- vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
- vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
- vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
- vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
- vmovdqa 0x100-0x100(%rcx),$xd0
- vmovdqa 0x110-0x100(%rcx),$xd1
- vmovdqa 0x120-0x100(%rcx),$xd2
- vmovdqa 0x130-0x100(%rcx),$xd3
- vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters
-
-.Loop_enter4xop:
- mov \$10,%eax
- vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
- jmp .Loop4xop
-
-.align 32
-.Loop4xop:
-___
- foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
- foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- dec %eax
- jnz .Loop4xop
-
- vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material
- vpaddd 0x50(%rsp),$xa1,$xa1
- vpaddd 0x60(%rsp),$xa2,$xa2
- vpaddd 0x70(%rsp),$xa3,$xa3
-
- vmovdqa $xt2,0x20(%rsp) # offload $xc2,3
- vmovdqa $xt3,0x30(%rsp)
-
- vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
- vpunpckldq $xa3,$xa2,$xt3
- vpunpckhdq $xa1,$xa0,$xa0
- vpunpckhdq $xa3,$xa2,$xa2
- vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
- vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
- vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
-___
- ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
-$code.=<<___;
- vpaddd 0x80-0x100(%rcx),$xb0,$xb0
- vpaddd 0x90-0x100(%rcx),$xb1,$xb1
- vpaddd 0xa0-0x100(%rcx),$xb2,$xb2
- vpaddd 0xb0-0x100(%rcx),$xb3,$xb3
-
- vmovdqa $xa0,0x00(%rsp) # offload $xa0,1
- vmovdqa $xa1,0x10(%rsp)
- vmovdqa 0x20(%rsp),$xa0 # "xc2"
- vmovdqa 0x30(%rsp),$xa1 # "xc3"
-
- vpunpckldq $xb1,$xb0,$xt2
- vpunpckldq $xb3,$xb2,$xt3
- vpunpckhdq $xb1,$xb0,$xb0
- vpunpckhdq $xb3,$xb2,$xb2
- vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
- vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
- vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
-___
- ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
- my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
-$code.=<<___;
- vpaddd 0xc0-0x100(%rcx),$xc0,$xc0
- vpaddd 0xd0-0x100(%rcx),$xc1,$xc1
- vpaddd 0xe0-0x100(%rcx),$xc2,$xc2
- vpaddd 0xf0-0x100(%rcx),$xc3,$xc3
-
- vpunpckldq $xc1,$xc0,$xt2
- vpunpckldq $xc3,$xc2,$xt3
- vpunpckhdq $xc1,$xc0,$xc0
- vpunpckhdq $xc3,$xc2,$xc2
- vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
- vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
- vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
-___
- ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
-$code.=<<___;
- vpaddd 0x100-0x100(%rcx),$xd0,$xd0
- vpaddd 0x110-0x100(%rcx),$xd1,$xd1
- vpaddd 0x120-0x100(%rcx),$xd2,$xd2
- vpaddd 0x130-0x100(%rcx),$xd3,$xd3
-
- vpunpckldq $xd1,$xd0,$xt2
- vpunpckldq $xd3,$xd2,$xt3
- vpunpckhdq $xd1,$xd0,$xd0
- vpunpckhdq $xd3,$xd2,$xd2
- vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
- vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
- vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
-___
- ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
- ($xa0,$xa1)=($xt2,$xt3);
-$code.=<<___;
- vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1
- vmovdqa 0x10(%rsp),$xa1
-
- cmp \$64*4,$len
- jb .Ltail4xop
-
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x10($inp),$xb0,$xb0
- vpxor 0x20($inp),$xc0,$xc0
- vpxor 0x30($inp),$xd0,$xd0
- vpxor 0x40($inp),$xa1,$xa1
- vpxor 0x50($inp),$xb1,$xb1
- vpxor 0x60($inp),$xc1,$xc1
- vpxor 0x70($inp),$xd1,$xd1
- lea 0x80($inp),$inp # size optimization
- vpxor 0x00($inp),$xa2,$xa2
- vpxor 0x10($inp),$xb2,$xb2
- vpxor 0x20($inp),$xc2,$xc2
- vpxor 0x30($inp),$xd2,$xd2
- vpxor 0x40($inp),$xa3,$xa3
- vpxor 0x50($inp),$xb3,$xb3
- vpxor 0x60($inp),$xc3,$xc3
- vpxor 0x70($inp),$xd3,$xd3
- lea 0x80($inp),$inp # inp+=64*4
-
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x10($out)
- vmovdqu $xc0,0x20($out)
- vmovdqu $xd0,0x30($out)
- vmovdqu $xa1,0x40($out)
- vmovdqu $xb1,0x50($out)
- vmovdqu $xc1,0x60($out)
- vmovdqu $xd1,0x70($out)
- lea 0x80($out),$out # size optimization
- vmovdqu $xa2,0x00($out)
- vmovdqu $xb2,0x10($out)
- vmovdqu $xc2,0x20($out)
- vmovdqu $xd2,0x30($out)
- vmovdqu $xa3,0x40($out)
- vmovdqu $xb3,0x50($out)
- vmovdqu $xc3,0x60($out)
- vmovdqu $xd3,0x70($out)
- lea 0x80($out),$out # out+=64*4
-
- sub \$64*4,$len
- jnz .Loop_outer4xop
-
- jmp .Ldone4xop
-
-.align 32
-.Ltail4xop:
- cmp \$192,$len
- jae .L192_or_more4xop
- cmp \$128,$len
- jae .L128_or_more4xop
- cmp \$64,$len
- jae .L64_or_more4xop
-
- xor %r9,%r9
- vmovdqa $xa0,0x00(%rsp)
- vmovdqa $xb0,0x10(%rsp)
- vmovdqa $xc0,0x20(%rsp)
- vmovdqa $xd0,0x30(%rsp)
- jmp .Loop_tail4xop
-
-.align 32
-.L64_or_more4xop:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x10($inp),$xb0,$xb0
- vpxor 0x20($inp),$xc0,$xc0
- vpxor 0x30($inp),$xd0,$xd0
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x10($out)
- vmovdqu $xc0,0x20($out)
- vmovdqu $xd0,0x30($out)
- je .Ldone4xop
-
- lea 0x40($inp),$inp # inp+=64*1
- vmovdqa $xa1,0x00(%rsp)
- xor %r9,%r9
- vmovdqa $xb1,0x10(%rsp)
- lea 0x40($out),$out # out+=64*1
- vmovdqa $xc1,0x20(%rsp)
- sub \$64,$len # len-=64*1
- vmovdqa $xd1,0x30(%rsp)
- jmp .Loop_tail4xop
-
-.align 32
-.L128_or_more4xop:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x10($inp),$xb0,$xb0
- vpxor 0x20($inp),$xc0,$xc0
- vpxor 0x30($inp),$xd0,$xd0
- vpxor 0x40($inp),$xa1,$xa1
- vpxor 0x50($inp),$xb1,$xb1
- vpxor 0x60($inp),$xc1,$xc1
- vpxor 0x70($inp),$xd1,$xd1
-
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x10($out)
- vmovdqu $xc0,0x20($out)
- vmovdqu $xd0,0x30($out)
- vmovdqu $xa1,0x40($out)
- vmovdqu $xb1,0x50($out)
- vmovdqu $xc1,0x60($out)
- vmovdqu $xd1,0x70($out)
- je .Ldone4xop
-
- lea 0x80($inp),$inp # inp+=64*2
- vmovdqa $xa2,0x00(%rsp)
- xor %r9,%r9
- vmovdqa $xb2,0x10(%rsp)
- lea 0x80($out),$out # out+=64*2
- vmovdqa $xc2,0x20(%rsp)
- sub \$128,$len # len-=64*2
- vmovdqa $xd2,0x30(%rsp)
- jmp .Loop_tail4xop
-
-.align 32
-.L192_or_more4xop:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x10($inp),$xb0,$xb0
- vpxor 0x20($inp),$xc0,$xc0
- vpxor 0x30($inp),$xd0,$xd0
- vpxor 0x40($inp),$xa1,$xa1
- vpxor 0x50($inp),$xb1,$xb1
- vpxor 0x60($inp),$xc1,$xc1
- vpxor 0x70($inp),$xd1,$xd1
- lea 0x80($inp),$inp # size optimization
- vpxor 0x00($inp),$xa2,$xa2
- vpxor 0x10($inp),$xb2,$xb2
- vpxor 0x20($inp),$xc2,$xc2
- vpxor 0x30($inp),$xd2,$xd2
-
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x10($out)
- vmovdqu $xc0,0x20($out)
- vmovdqu $xd0,0x30($out)
- vmovdqu $xa1,0x40($out)
- vmovdqu $xb1,0x50($out)
- vmovdqu $xc1,0x60($out)
- vmovdqu $xd1,0x70($out)
- lea 0x80($out),$out # size optimization
- vmovdqu $xa2,0x00($out)
- vmovdqu $xb2,0x10($out)
- vmovdqu $xc2,0x20($out)
- vmovdqu $xd2,0x30($out)
- je .Ldone4xop
-
- lea 0x40($inp),$inp # inp+=64*3
- vmovdqa $xa3,0x00(%rsp)
- xor %r9,%r9
- vmovdqa $xb3,0x10(%rsp)
- lea 0x40($out),$out # out+=64*3
- vmovdqa $xc3,0x20(%rsp)
- sub \$192,$len # len-=64*3
- vmovdqa $xd3,0x30(%rsp)
-
-.Loop_tail4xop:
- movzb ($inp,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1($out,%r9)
- dec $len
- jnz .Loop_tail4xop
-
-.Ldone4xop:
- vzeroupper
-___
-$code.=<<___ if ($win64);
- movaps -0xb0(%r10),%xmm6
- movaps -0xa0(%r10),%xmm7
- movaps -0x90(%r10),%xmm8
- movaps -0x80(%r10),%xmm9
- movaps -0x70(%r10),%xmm10
- movaps -0x60(%r10),%xmm11
- movaps -0x50(%r10),%xmm12
- movaps -0x40(%r10),%xmm13
- movaps -0x30(%r10),%xmm14
- movaps -0x20(%r10),%xmm15
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.L4xop_epilogue:
- ret
-.cfi_endproc
-___
-&end_function("chacha20_xop");
-}
-
-########################################################################
-# AVX2 code path
-if ($avx>1) {
-
-if($kernel) {
- $code .= "#ifdef CONFIG_AS_AVX2\n";
-}
-
-my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
- $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
-my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
-
-sub AVX2_lane_ROUND {
-my ($a0,$b0,$c0,$d0)=@_;
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
-my @x=map("\"$_\"",@xx);
-
- # Consider order in which variables are addressed by their
- # index:
- #
- # a b c d
- #
- # 0 4 8 12 < even round
- # 1 5 9 13
- # 2 6 10 14
- # 3 7 11 15
- # 0 5 10 15 < odd round
- # 1 6 11 12
- # 2 7 8 13
- # 3 4 9 14
- #
- # The 'a', 'b' and 'd' values are permanently allocated in registers
- # @x[0..7,12..15], while the 'c' values are maintained in memory. If
- # you look at the 'c' column, you'll notice that each pair of 'c's is
- # invariant between rounds, so they only have to be reloaded once per
- # round, in the middle. This is why you'll see a bunch of 'c' stores
- # and loads in the middle, but none at the beginning or end. (A
- # standalone sketch of this index pattern follows the sub below.)
-
- (
- "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
- "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
- "&vpshufb (@x[$d0],@x[$d0],$t1)",
- "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
- "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
- "&vpshufb (@x[$d1],@x[$d1],$t1)",
-
- "&vpaddd ($xc,$xc,@x[$d0])",
- "&vpxor (@x[$b0],$xc,@x[$b0])",
- "&vpslld ($t0,@x[$b0],12)",
- "&vpsrld (@x[$b0],@x[$b0],20)",
- "&vpor (@x[$b0],$t0,@x[$b0])",
- "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
- "&vpaddd ($xc_,$xc_,@x[$d1])",
- "&vpxor (@x[$b1],$xc_,@x[$b1])",
- "&vpslld ($t1,@x[$b1],12)",
- "&vpsrld (@x[$b1],@x[$b1],20)",
- "&vpor (@x[$b1],$t1,@x[$b1])",
-
- "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
- "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
- "&vpshufb (@x[$d0],@x[$d0],$t0)",
- "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
- "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
- "&vpshufb (@x[$d1],@x[$d1],$t0)",
-
- "&vpaddd ($xc,$xc,@x[$d0])",
- "&vpxor (@x[$b0],$xc,@x[$b0])",
- "&vpslld ($t1,@x[$b0],7)",
- "&vpsrld (@x[$b0],@x[$b0],25)",
- "&vpor (@x[$b0],$t1,@x[$b0])",
- "&vbroadcasti128($t1,'(%r9)')", # .Lrot16(%rip)
- "&vpaddd ($xc_,$xc_,@x[$d1])",
- "&vpxor (@x[$b1],$xc_,@x[$b1])",
- "&vpslld ($t0,@x[$b1],7)",
- "&vpsrld (@x[$b1],@x[$b1],25)",
- "&vpor (@x[$b1],$t0,@x[$b1])",
-
- "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
- "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)",
- "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")",
- "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")",
-
- "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
- "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
- "&vpshufb (@x[$d2],@x[$d2],$t1)",
- "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
- "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
- "&vpshufb (@x[$d3],@x[$d3],$t1)",
-
- "&vpaddd ($xc,$xc,@x[$d2])",
- "&vpxor (@x[$b2],$xc,@x[$b2])",
- "&vpslld ($t0,@x[$b2],12)",
- "&vpsrld (@x[$b2],@x[$b2],20)",
- "&vpor (@x[$b2],$t0,@x[$b2])",
- "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
- "&vpaddd ($xc_,$xc_,@x[$d3])",
- "&vpxor (@x[$b3],$xc_,@x[$b3])",
- "&vpslld ($t1,@x[$b3],12)",
- "&vpsrld (@x[$b3],@x[$b3],20)",
- "&vpor (@x[$b3],$t1,@x[$b3])",
-
- "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
- "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
- "&vpshufb (@x[$d2],@x[$d2],$t0)",
- "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
- "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
- "&vpshufb (@x[$d3],@x[$d3],$t0)",
-
- "&vpaddd ($xc,$xc,@x[$d2])",
- "&vpxor (@x[$b2],$xc,@x[$b2])",
- "&vpslld ($t1,@x[$b2],7)",
- "&vpsrld (@x[$b2],@x[$b2],25)",
- "&vpor (@x[$b2],$t1,@x[$b2])",
- "&vbroadcasti128($t1,'(%r9)')", # .Lrot16(%rip)
- "&vpaddd ($xc_,$xc_,@x[$d3])",
- "&vpxor (@x[$b3],$xc_,@x[$b3])",
- "&vpslld ($t0,@x[$b3],7)",
- "&vpsrld (@x[$b3],@x[$b3],25)",
- "&vpor (@x[$b3],$t0,@x[$b3])"
- );
-}
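
A minimal sketch in standalone Perl, not part of the generated assembly, of the index derivation that XOP_lane_ROUND and AVX2_lane_ROUND apply with ($_&~3)+(($_+1)&3). It expands the (0,4,8,12) call into the four column quarter-rounds of the even round and the (0,5,10,15) call into the four diagonal quarter-rounds of the odd round; the printed 'c' columns confirm the comment above, since the pair used in the second half of one round is the pair needed in the first half of the next, so the 'c' registers only need one mid-round reload. The helper name lane_indices is used here for illustration only.

use strict; use warnings;

sub lane_indices {
    my @q = ([@_]);                                            # Q1: (a,b,c,d)
    push @q, [ map { ($_ & ~3) + (($_ + 1) & 3) } @{$q[-1]} ] for 1 .. 3;
    return @q;                                                 # Q1..Q4
}

for my $start ([0, 4, 8, 12], [0, 5, 10, 15]) {
    printf "a=%2d b=%2d c=%2d d=%2d\n", @$_ for lane_indices(@$start);
    print "\n";
}
# Even round: c columns are 8,9 then 10,11.  Odd round: 10,11 then 8,9.
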
-
-my $xframe = $win64 ? 0xa8 : 8;
-
-&declare_function("chacha20_avx2", 32, 5);
-$code.=<<___;
-.cfi_startproc
-.Lchacha20_8x:
- lea 8(%rsp),%r10 # frame register
-.cfi_def_cfa_register %r10
- sub \$0x280+$xframe,%rsp
- and \$-32,%rsp
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,-0xb0(%r10)
- movaps %xmm7,-0xa0(%r10)
- movaps %xmm8,-0x90(%r10)
- movaps %xmm9,-0x80(%r10)
- movaps %xmm10,-0x70(%r10)
- movaps %xmm11,-0x60(%r10)
- movaps %xmm12,-0x50(%r10)
- movaps %xmm13,-0x40(%r10)
- movaps %xmm14,-0x30(%r10)
- movaps %xmm15,-0x20(%r10)
-.L8x_body:
-___
-$code.=<<___;
- vzeroupper
-
- ################ stack layout
- # +0x00 SIMD equivalent of @x[8-12]
- # ...
- # +0x80 constant copy of key[0-2] smashed by lanes
- # ...
- # +0x200 SIMD counters (with nonce smashed by lanes)
- # ...
- # +0x280
-
- vbroadcasti128 .Lsigma(%rip),$xa3 # key[0]
- vbroadcasti128 ($key),$xb3 # key[1]
- vbroadcasti128 16($key),$xt3 # key[2]
- vbroadcasti128 ($counter),$xd3 # key[3]
- lea 0x100(%rsp),%rcx # size optimization
- lea 0x200(%rsp),%rax # size optimization
- lea .Lrot16(%rip),%r9
- lea .Lrot24(%rip),%r11
-
- vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
- vpshufd \$0x55,$xa3,$xa1
- vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload
- vpshufd \$0xaa,$xa3,$xa2
- vmovdqa $xa1,0xa0-0x100(%rcx)
- vpshufd \$0xff,$xa3,$xa3
- vmovdqa $xa2,0xc0-0x100(%rcx)
- vmovdqa $xa3,0xe0-0x100(%rcx)
-
- vpshufd \$0x00,$xb3,$xb0
- vpshufd \$0x55,$xb3,$xb1
- vmovdqa $xb0,0x100-0x100(%rcx)
- vpshufd \$0xaa,$xb3,$xb2
- vmovdqa $xb1,0x120-0x100(%rcx)
- vpshufd \$0xff,$xb3,$xb3
- vmovdqa $xb2,0x140-0x100(%rcx)
- vmovdqa $xb3,0x160-0x100(%rcx)
-
- vpshufd \$0x00,$xt3,$xt0 # "xc0"
- vpshufd \$0x55,$xt3,$xt1 # "xc1"
- vmovdqa $xt0,0x180-0x200(%rax)
- vpshufd \$0xaa,$xt3,$xt2 # "xc2"
- vmovdqa $xt1,0x1a0-0x200(%rax)
- vpshufd \$0xff,$xt3,$xt3 # "xc3"
- vmovdqa $xt2,0x1c0-0x200(%rax)
- vmovdqa $xt3,0x1e0-0x200(%rax)
-
- vpshufd \$0x00,$xd3,$xd0
- vpshufd \$0x55,$xd3,$xd1
- vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
- vpshufd \$0xaa,$xd3,$xd2
- vmovdqa $xd1,0x220-0x200(%rax)
- vpshufd \$0xff,$xd3,$xd3
- vmovdqa $xd2,0x240-0x200(%rax)
- vmovdqa $xd3,0x260-0x200(%rax)
-
- jmp .Loop_enter8x
-
-.align 32
-.Loop_outer8x:
- vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key
- vmovdqa 0xa0-0x100(%rcx),$xa1
- vmovdqa 0xc0-0x100(%rcx),$xa2
- vmovdqa 0xe0-0x100(%rcx),$xa3
- vmovdqa 0x100-0x100(%rcx),$xb0
- vmovdqa 0x120-0x100(%rcx),$xb1
- vmovdqa 0x140-0x100(%rcx),$xb2
- vmovdqa 0x160-0x100(%rcx),$xb3
- vmovdqa 0x180-0x200(%rax),$xt0 # "xc0"
- vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1"
- vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2"
- vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3"
- vmovdqa 0x200-0x200(%rax),$xd0
- vmovdqa 0x220-0x200(%rax),$xd1
- vmovdqa 0x240-0x200(%rax),$xd2
- vmovdqa 0x260-0x200(%rax),$xd3
- vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters
-
-.Loop_enter8x:
- vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]"
- vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]"
- vbroadcasti128 (%r9),$xt3
- vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters
- mov \$10,%eax
- jmp .Loop8x
-
-.align 32
-.Loop8x:
-___
- foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
- foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- dec %eax
- jnz .Loop8x
-
- lea 0x200(%rsp),%rax # size optimization
- vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key
- vpaddd 0xa0-0x100(%rcx),$xa1,$xa1
- vpaddd 0xc0-0x100(%rcx),$xa2,$xa2
- vpaddd 0xe0-0x100(%rcx),$xa3,$xa3
-
- vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
- vpunpckldq $xa3,$xa2,$xt3
- vpunpckhdq $xa1,$xa0,$xa0
- vpunpckhdq $xa3,$xa2,$xa2
- vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
- vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
- vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
-___
- ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
-$code.=<<___;
- vpaddd 0x100-0x100(%rcx),$xb0,$xb0
- vpaddd 0x120-0x100(%rcx),$xb1,$xb1
- vpaddd 0x140-0x100(%rcx),$xb2,$xb2
- vpaddd 0x160-0x100(%rcx),$xb3,$xb3
-
- vpunpckldq $xb1,$xb0,$xt2
- vpunpckldq $xb3,$xb2,$xt3
- vpunpckhdq $xb1,$xb0,$xb0
- vpunpckhdq $xb3,$xb2,$xb2
- vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
- vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
- vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
-___
- ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
-$code.=<<___;
- vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further
- vperm2i128 \$0x31,$xb0,$xa0,$xb0
- vperm2i128 \$0x20,$xb1,$xa1,$xa0
- vperm2i128 \$0x31,$xb1,$xa1,$xb1
- vperm2i128 \$0x20,$xb2,$xa2,$xa1
- vperm2i128 \$0x31,$xb2,$xa2,$xb2
- vperm2i128 \$0x20,$xb3,$xa3,$xa2
- vperm2i128 \$0x31,$xb3,$xa3,$xb3
-___
- ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
- my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
-$code.=<<___;
- vmovdqa $xa0,0x00(%rsp) # offload $xaN
- vmovdqa $xa1,0x20(%rsp)
- vmovdqa 0x40(%rsp),$xc2 # $xa0
- vmovdqa 0x60(%rsp),$xc3 # $xa1
-
- vpaddd 0x180-0x200(%rax),$xc0,$xc0
- vpaddd 0x1a0-0x200(%rax),$xc1,$xc1
- vpaddd 0x1c0-0x200(%rax),$xc2,$xc2
- vpaddd 0x1e0-0x200(%rax),$xc3,$xc3
-
- vpunpckldq $xc1,$xc0,$xt2
- vpunpckldq $xc3,$xc2,$xt3
- vpunpckhdq $xc1,$xc0,$xc0
- vpunpckhdq $xc3,$xc2,$xc2
- vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
- vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
- vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
-___
- ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
-$code.=<<___;
- vpaddd 0x200-0x200(%rax),$xd0,$xd0
- vpaddd 0x220-0x200(%rax),$xd1,$xd1
- vpaddd 0x240-0x200(%rax),$xd2,$xd2
- vpaddd 0x260-0x200(%rax),$xd3,$xd3
-
- vpunpckldq $xd1,$xd0,$xt2
- vpunpckldq $xd3,$xd2,$xt3
- vpunpckhdq $xd1,$xd0,$xd0
- vpunpckhdq $xd3,$xd2,$xd2
- vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
- vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
- vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
-___
- ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
-$code.=<<___;
- vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
- vperm2i128 \$0x31,$xd0,$xc0,$xd0
- vperm2i128 \$0x20,$xd1,$xc1,$xc0
- vperm2i128 \$0x31,$xd1,$xc1,$xd1
- vperm2i128 \$0x20,$xd2,$xc2,$xc1
- vperm2i128 \$0x31,$xd2,$xc2,$xd2
- vperm2i128 \$0x20,$xd3,$xc3,$xc2
- vperm2i128 \$0x31,$xd3,$xc3,$xd3
-___
- ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
- ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
- ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
- ($xa0,$xa1)=($xt2,$xt3);
-$code.=<<___;
- vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember?
- vmovdqa 0x20(%rsp),$xa1
-
- cmp \$64*8,$len
- jb .Ltail8x
-
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- lea 0x80($inp),$inp # size optimization
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- lea 0x80($out),$out # size optimization
-
- vpxor 0x00($inp),$xa1,$xa1
- vpxor 0x20($inp),$xb1,$xb1
- vpxor 0x40($inp),$xc1,$xc1
- vpxor 0x60($inp),$xd1,$xd1
- lea 0x80($inp),$inp # size optimization
- vmovdqu $xa1,0x00($out)
- vmovdqu $xb1,0x20($out)
- vmovdqu $xc1,0x40($out)
- vmovdqu $xd1,0x60($out)
- lea 0x80($out),$out # size optimization
-
- vpxor 0x00($inp),$xa2,$xa2
- vpxor 0x20($inp),$xb2,$xb2
- vpxor 0x40($inp),$xc2,$xc2
- vpxor 0x60($inp),$xd2,$xd2
- lea 0x80($inp),$inp # size optimization
- vmovdqu $xa2,0x00($out)
- vmovdqu $xb2,0x20($out)
- vmovdqu $xc2,0x40($out)
- vmovdqu $xd2,0x60($out)
- lea 0x80($out),$out # size optimization
-
- vpxor 0x00($inp),$xa3,$xa3
- vpxor 0x20($inp),$xb3,$xb3
- vpxor 0x40($inp),$xc3,$xc3
- vpxor 0x60($inp),$xd3,$xd3
- lea 0x80($inp),$inp # size optimization
- vmovdqu $xa3,0x00($out)
- vmovdqu $xb3,0x20($out)
- vmovdqu $xc3,0x40($out)
- vmovdqu $xd3,0x60($out)
- lea 0x80($out),$out # size optimization
-
- sub \$64*8,$len
- jnz .Loop_outer8x
-
- jmp .Ldone8x
-
-.Ltail8x:
- cmp \$448,$len
- jae .L448_or_more8x
- cmp \$384,$len
- jae .L384_or_more8x
- cmp \$320,$len
- jae .L320_or_more8x
- cmp \$256,$len
- jae .L256_or_more8x
- cmp \$192,$len
- jae .L192_or_more8x
- cmp \$128,$len
- jae .L128_or_more8x
- cmp \$64,$len
- jae .L64_or_more8x
-
- xor %r9,%r9
- vmovdqa $xa0,0x00(%rsp)
- vmovdqa $xb0,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L64_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- je .Ldone8x
-
- lea 0x40($inp),$inp # inp+=64*1
- xor %r9,%r9
- vmovdqa $xc0,0x00(%rsp)
- lea 0x40($out),$out # out+=64*1
- sub \$64,$len # len-=64*1
- vmovdqa $xd0,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L128_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- je .Ldone8x
-
- lea 0x80($inp),$inp # inp+=64*2
- xor %r9,%r9
- vmovdqa $xa1,0x00(%rsp)
- lea 0x80($out),$out # out+=64*2
- sub \$128,$len # len-=64*2
- vmovdqa $xb1,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L192_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- vpxor 0x80($inp),$xa1,$xa1
- vpxor 0xa0($inp),$xb1,$xb1
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- vmovdqu $xa1,0x80($out)
- vmovdqu $xb1,0xa0($out)
- je .Ldone8x
-
- lea 0xc0($inp),$inp # inp+=64*3
- xor %r9,%r9
- vmovdqa $xc1,0x00(%rsp)
- lea 0xc0($out),$out # out+=64*3
- sub \$192,$len # len-=64*3
- vmovdqa $xd1,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L256_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- vpxor 0x80($inp),$xa1,$xa1
- vpxor 0xa0($inp),$xb1,$xb1
- vpxor 0xc0($inp),$xc1,$xc1
- vpxor 0xe0($inp),$xd1,$xd1
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- vmovdqu $xa1,0x80($out)
- vmovdqu $xb1,0xa0($out)
- vmovdqu $xc1,0xc0($out)
- vmovdqu $xd1,0xe0($out)
- je .Ldone8x
-
- lea 0x100($inp),$inp # inp+=64*4
- xor %r9,%r9
- vmovdqa $xa2,0x00(%rsp)
- lea 0x100($out),$out # out+=64*4
- sub \$256,$len # len-=64*4
- vmovdqa $xb2,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L320_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- vpxor 0x80($inp),$xa1,$xa1
- vpxor 0xa0($inp),$xb1,$xb1
- vpxor 0xc0($inp),$xc1,$xc1
- vpxor 0xe0($inp),$xd1,$xd1
- vpxor 0x100($inp),$xa2,$xa2
- vpxor 0x120($inp),$xb2,$xb2
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- vmovdqu $xa1,0x80($out)
- vmovdqu $xb1,0xa0($out)
- vmovdqu $xc1,0xc0($out)
- vmovdqu $xd1,0xe0($out)
- vmovdqu $xa2,0x100($out)
- vmovdqu $xb2,0x120($out)
- je .Ldone8x
-
- lea 0x140($inp),$inp # inp+=64*5
- xor %r9,%r9
- vmovdqa $xc2,0x00(%rsp)
- lea 0x140($out),$out # out+=64*5
- sub \$320,$len # len-=64*5
- vmovdqa $xd2,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L384_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- vpxor 0x80($inp),$xa1,$xa1
- vpxor 0xa0($inp),$xb1,$xb1
- vpxor 0xc0($inp),$xc1,$xc1
- vpxor 0xe0($inp),$xd1,$xd1
- vpxor 0x100($inp),$xa2,$xa2
- vpxor 0x120($inp),$xb2,$xb2
- vpxor 0x140($inp),$xc2,$xc2
- vpxor 0x160($inp),$xd2,$xd2
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- vmovdqu $xa1,0x80($out)
- vmovdqu $xb1,0xa0($out)
- vmovdqu $xc1,0xc0($out)
- vmovdqu $xd1,0xe0($out)
- vmovdqu $xa2,0x100($out)
- vmovdqu $xb2,0x120($out)
- vmovdqu $xc2,0x140($out)
- vmovdqu $xd2,0x160($out)
- je .Ldone8x
-
- lea 0x180($inp),$inp # inp+=64*6
- xor %r9,%r9
- vmovdqa $xa3,0x00(%rsp)
- lea 0x180($out),$out # out+=64*6
- sub \$384,$len # len-=64*6
- vmovdqa $xb3,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L448_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- vpxor 0x80($inp),$xa1,$xa1
- vpxor 0xa0($inp),$xb1,$xb1
- vpxor 0xc0($inp),$xc1,$xc1
- vpxor 0xe0($inp),$xd1,$xd1
- vpxor 0x100($inp),$xa2,$xa2
- vpxor 0x120($inp),$xb2,$xb2
- vpxor 0x140($inp),$xc2,$xc2
- vpxor 0x160($inp),$xd2,$xd2
- vpxor 0x180($inp),$xa3,$xa3
- vpxor 0x1a0($inp),$xb3,$xb3
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- vmovdqu $xa1,0x80($out)
- vmovdqu $xb1,0xa0($out)
- vmovdqu $xc1,0xc0($out)
- vmovdqu $xd1,0xe0($out)
- vmovdqu $xa2,0x100($out)
- vmovdqu $xb2,0x120($out)
- vmovdqu $xc2,0x140($out)
- vmovdqu $xd2,0x160($out)
- vmovdqu $xa3,0x180($out)
- vmovdqu $xb3,0x1a0($out)
- je .Ldone8x
-
- lea 0x1c0($inp),$inp # inp+=64*7
- xor %r9,%r9
- vmovdqa $xc3,0x00(%rsp)
- lea 0x1c0($out),$out # out+=64*7
- sub \$448,$len # len-=64*7
- vmovdqa $xd3,0x20(%rsp)
-
-.Loop_tail8x:
- movzb ($inp,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1($out,%r9)
- dec $len
- jnz .Loop_tail8x
-
-.Ldone8x:
- vzeroall
-___
-$code.=<<___ if ($win64);
- movaps -0xb0(%r10),%xmm6
- movaps -0xa0(%r10),%xmm7
- movaps -0x90(%r10),%xmm8
- movaps -0x80(%r10),%xmm9
- movaps -0x70(%r10),%xmm10
- movaps -0x60(%r10),%xmm11
- movaps -0x50(%r10),%xmm12
- movaps -0x40(%r10),%xmm13
- movaps -0x30(%r10),%xmm14
- movaps -0x20(%r10),%xmm15
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.L8x_epilogue:
- ret
-.cfi_endproc
-___
-&end_function("chacha20_avx2");
-if($kernel) {
- $code .= "#endif\n";
-}
-}
-
-########################################################################
-# AVX512 code paths
-if ($avx>2) {
-# This one handles shorter inputs...
-if($kernel) {
- $code .= "#ifdef CONFIG_AS_AVX512\n";
-}
-
-my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
-my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
-
-sub vpxord() # size optimization
-{ my $opcode = "vpxor"; # adhere to vpxor when possible
-
- foreach (@_) {
- if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) {
- $opcode = "vpxord";
- last;
- }
- }
-
- $code .= "\t$opcode\t".join(',',reverse @_)."\n";
-}
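
A minimal sketch in standalone Perl of the operand test &vpxord performs above. The point of the size optimization is that plain vpxor only has a VEX encoding, which cannot address %zmm registers or %ymm16 and above, so the EVEX-encoded vpxord is emitted only when such an operand appears (pick_xor_opcode is a hypothetical name used here for illustration):

use strict; use warnings;

# Same predicate as &vpxord: fall back to the longer EVEX form only when an
# operand is a %zmm register or a %ymm register numbered 16 or higher.
sub pick_xor_opcode {
    for (@_) {
        return "vpxord" if /%([zy])mm([0-9]+)/ && ($1 eq "z" || $2 >= 16);
    }
    return "vpxor";
}

print pick_xor_opcode("%ymm3",  "%ymm4", "%ymm5"), "\n";   # vpxor
print pick_xor_opcode("%ymm17", "%ymm4", "%ymm5"), "\n";   # vpxord
print pick_xor_opcode("%zmm1",  "%zmm2", "%zmm3"), "\n";   # vpxord
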
-
-sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round
- &vpaddd ($a,$a,$b);
- &vpxord ($d,$d,$a);
- &vprold ($d,$d,16);
-
- &vpaddd ($c,$c,$d);
- &vpxord ($b,$b,$c);
- &vprold ($b,$b,12);
-
- &vpaddd ($a,$a,$b);
- &vpxord ($d,$d,$a);
- &vprold ($d,$d,8);
-
- &vpaddd ($c,$c,$d);
- &vpxord ($b,$b,$c);
- &vprold ($b,$b,7);
-}
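
A minimal sketch in standalone Perl of what one AVX512ROUND computes in each 32-bit lane: the standard ChaCha quarter-round with the 16/12/8/7 rotation schedule that the vprold instructions above apply lane-wise, checked against the quarter-round test vector from RFC 8439, section 2.1.1.

use strict; use warnings;

sub rotl32 { my ($x, $n) = @_; return (($x << $n) | ($x >> (32 - $n))) & 0xffffffff; }

# Scalar model of one quarter-round: add, xor, rotate by 16, 12, 8, 7.
sub quarter_round {
    my ($a, $b, $c, $d) = @_;
    $a = ($a + $b) & 0xffffffff;  $d = rotl32($d ^ $a, 16);
    $c = ($c + $d) & 0xffffffff;  $b = rotl32($b ^ $c, 12);
    $a = ($a + $b) & 0xffffffff;  $d = rotl32($d ^ $a,  8);
    $c = ($c + $d) & 0xffffffff;  $b = rotl32($b ^ $c,  7);
    return ($a, $b, $c, $d);
}

my @qr = quarter_round(0x11111111, 0x01020304, 0x9b8d6f43, 0x01234567);
printf "%08x %08x %08x %08x\n", @qr;   # ea2a92f4 cb1cf8ce 4581472e 5881c4bb
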
-
-my $xframe = $win64 ? 32+8 : 8;
-
-&declare_function("chacha20_avx512", 32, 5);
-$code.=<<___;
-.cfi_startproc
-.Lchacha20_avx512:
- lea 8(%rsp),%r10 # frame pointer
-.cfi_def_cfa_register %r10
- cmp \$512,$len
- ja .Lchacha20_16x
-
- sub \$64+$xframe,%rsp
- and \$-64,%rsp
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,-0x30(%r10)
- movaps %xmm7,-0x20(%r10)
-.Lavx512_body:
-___
-$code.=<<___;
- vbroadcasti32x4 .Lsigma(%rip),$a
- vbroadcasti32x4 ($key),$b
- vbroadcasti32x4 16($key),$c
- vbroadcasti32x4 ($counter),$d
-
- vmovdqa32 $a,$a_
- vmovdqa32 $b,$b_
- vmovdqa32 $c,$c_
- vpaddd .Lzeroz(%rip),$d,$d
- vmovdqa32 .Lfourz(%rip),$fourz
- mov \$10,$counter # reuse $counter
- vmovdqa32 $d,$d_
- jmp .Loop_avx512
-
-.align 16
-.Loop_outer_avx512:
- vmovdqa32 $a_,$a
- vmovdqa32 $b_,$b
- vmovdqa32 $c_,$c
- vpaddd $fourz,$d_,$d
- mov \$10,$counter
- vmovdqa32 $d,$d_
- jmp .Loop_avx512
-
-.align 32
-.Loop_avx512:
-___
- &AVX512ROUND();
- &vpshufd ($c,$c,0b01001110);
- &vpshufd ($b,$b,0b00111001);
- &vpshufd ($d,$d,0b10010011);
-
- &AVX512ROUND();
- &vpshufd ($c,$c,0b01001110);
- &vpshufd ($b,$b,0b10010011);
- &vpshufd ($d,$d,0b00111001);
-
- &dec ($counter);
- &jnz (".Loop_avx512");
-
-$code.=<<___;
- vpaddd $a_,$a,$a
- vpaddd $b_,$b,$b
- vpaddd $c_,$c,$c
- vpaddd $d_,$d,$d
-
- sub \$64,$len
- jb .Ltail64_avx512
-
- vpxor 0x00($inp),%x#$a,$t0 # xor with input
- vpxor 0x10($inp),%x#$b,$t1
- vpxor 0x20($inp),%x#$c,$t2
- vpxor 0x30($inp),%x#$d,$t3
- lea 0x40($inp),$inp # inp+=64
-
- vmovdqu $t0,0x00($out) # write output
- vmovdqu $t1,0x10($out)
- vmovdqu $t2,0x20($out)
- vmovdqu $t3,0x30($out)
- lea 0x40($out),$out # out+=64
-
- jz .Ldone_avx512
-
- vextracti32x4 \$1,$a,$t0
- vextracti32x4 \$1,$b,$t1
- vextracti32x4 \$1,$c,$t2
- vextracti32x4 \$1,$d,$t3
-
- sub \$64,$len
- jb .Ltail_avx512
-
- vpxor 0x00($inp),$t0,$t0 # xor with input
- vpxor 0x10($inp),$t1,$t1
- vpxor 0x20($inp),$t2,$t2
- vpxor 0x30($inp),$t3,$t3
- lea 0x40($inp),$inp # inp+=64
-
- vmovdqu $t0,0x00($out) # write output
- vmovdqu $t1,0x10($out)
- vmovdqu $t2,0x20($out)
- vmovdqu $t3,0x30($out)
- lea 0x40($out),$out # out+=64
-
- jz .Ldone_avx512
-
- vextracti32x4 \$2,$a,$t0
- vextracti32x4 \$2,$b,$t1
- vextracti32x4 \$2,$c,$t2
- vextracti32x4 \$2,$d,$t3
-
- sub \$64,$len
- jb .Ltail_avx512
-
- vpxor 0x00($inp),$t0,$t0 # xor with input
- vpxor 0x10($inp),$t1,$t1
- vpxor 0x20($inp),$t2,$t2
- vpxor 0x30($inp),$t3,$t3
- lea 0x40($inp),$inp # inp+=64
-
- vmovdqu $t0,0x00($out) # write output
- vmovdqu $t1,0x10($out)
- vmovdqu $t2,0x20($out)
- vmovdqu $t3,0x30($out)
- lea 0x40($out),$out # out+=64
-
- jz .Ldone_avx512
-
- vextracti32x4 \$3,$a,$t0
- vextracti32x4 \$3,$b,$t1
- vextracti32x4 \$3,$c,$t2
- vextracti32x4 \$3,$d,$t3
-
- sub \$64,$len
- jb .Ltail_avx512
-
- vpxor 0x00($inp),$t0,$t0 # xor with input
- vpxor 0x10($inp),$t1,$t1
- vpxor 0x20($inp),$t2,$t2
- vpxor 0x30($inp),$t3,$t3
- lea 0x40($inp),$inp # inp+=64
-
- vmovdqu $t0,0x00($out) # write output
- vmovdqu $t1,0x10($out)
- vmovdqu $t2,0x20($out)
- vmovdqu $t3,0x30($out)
- lea 0x40($out),$out # out+=64
-
- jnz .Loop_outer_avx512
-
- jmp .Ldone_avx512
-
-.align 16
-.Ltail64_avx512:
- vmovdqa %x#$a,0x00(%rsp)
- vmovdqa %x#$b,0x10(%rsp)
- vmovdqa %x#$c,0x20(%rsp)
- vmovdqa %x#$d,0x30(%rsp)
- add \$64,$len
- jmp .Loop_tail_avx512
-
-.align 16
-.Ltail_avx512:
- vmovdqa $t0,0x00(%rsp)
- vmovdqa $t1,0x10(%rsp)
- vmovdqa $t2,0x20(%rsp)
- vmovdqa $t3,0x30(%rsp)
- add \$64,$len
-
-.Loop_tail_avx512:
- movzb ($inp,$counter),%eax
- movzb (%rsp,$counter),%ecx
- lea 1($counter),$counter
- xor %ecx,%eax
- mov %al,-1($out,$counter)
- dec $len
- jnz .Loop_tail_avx512
-
- vmovdqu32 $a_,0x00(%rsp)
-
-.Ldone_avx512:
- vzeroall
-___
-$code.=<<___ if ($win64);
- movaps -0x30(%r10),%xmm6
- movaps -0x20(%r10),%xmm7
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.Lavx512_epilogue:
- ret
-.cfi_endproc
-___
-&end_function("chacha20_avx512");
-
-map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz);
-
-&declare_function("chacha20_avx512vl", 32, 5);
-$code.=<<___;
-.cfi_startproc
-.Lchacha20_avx512vl:
- lea 8(%rsp),%r10 # frame pointer
-.cfi_def_cfa_register %r10
- cmp \$128,$len
- ja .Lchacha20_8xvl
-
- sub \$64+$xframe,%rsp
- and \$-32,%rsp
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,-0x30(%r10)
- movaps %xmm7,-0x20(%r10)
-.Lavx512vl_body:
-___
-$code.=<<___;
- vbroadcasti128 .Lsigma(%rip),$a
- vbroadcasti128 ($key),$b
- vbroadcasti128 16($key),$c
- vbroadcasti128 ($counter),$d
-
- vmovdqa32 $a,$a_
- vmovdqa32 $b,$b_
- vmovdqa32 $c,$c_
- vpaddd .Lzeroz(%rip),$d,$d
- vmovdqa32 .Ltwoy(%rip),$fourz
- mov \$10,$counter # reuse $counter
- vmovdqa32 $d,$d_
- jmp .Loop_avx512vl
-
-.align 16
-.Loop_outer_avx512vl:
- vmovdqa32 $c_,$c
- vpaddd $fourz,$d_,$d
- mov \$10,$counter
- vmovdqa32 $d,$d_
- jmp .Loop_avx512vl
-
-.align 32
-.Loop_avx512vl:
-___
- &AVX512ROUND();
- &vpshufd ($c,$c,0b01001110);
- &vpshufd ($b,$b,0b00111001);
- &vpshufd ($d,$d,0b10010011);
-
- &AVX512ROUND();
- &vpshufd ($c,$c,0b01001110);
- &vpshufd ($b,$b,0b10010011);
- &vpshufd ($d,$d,0b00111001);
-
- &dec ($counter);
- &jnz (".Loop_avx512vl");
-
-$code.=<<___;
- vpaddd $a_,$a,$a
- vpaddd $b_,$b,$b
- vpaddd $c_,$c,$c
- vpaddd $d_,$d,$d
-
- sub \$64,$len
- jb .Ltail64_avx512vl
-
- vpxor 0x00($inp),%x#$a,$t0 # xor with input
- vpxor 0x10($inp),%x#$b,$t1
- vpxor 0x20($inp),%x#$c,$t2
- vpxor 0x30($inp),%x#$d,$t3
- lea 0x40($inp),$inp # inp+=64
-
- vmovdqu $t0,0x00($out) # write output
- vmovdqu $t1,0x10($out)
- vmovdqu $t2,0x20($out)
- vmovdqu $t3,0x30($out)
- lea 0x40($out),$out # out+=64
-
- jz .Ldone_avx512vl
-
- vextracti128 \$1,$a,$t0
- vextracti128 \$1,$b,$t1
- vextracti128 \$1,$c,$t2
- vextracti128 \$1,$d,$t3
-
- sub \$64,$len
- jb .Ltail_avx512vl
-
- vpxor 0x00($inp),$t0,$t0 # xor with input
- vpxor 0x10($inp),$t1,$t1
- vpxor 0x20($inp),$t2,$t2
- vpxor 0x30($inp),$t3,$t3
- lea 0x40($inp),$inp # inp+=64
-
- vmovdqu $t0,0x00($out) # write output
- vmovdqu $t1,0x10($out)
- vmovdqu $t2,0x20($out)
- vmovdqu $t3,0x30($out)
- lea 0x40($out),$out # out+=64
-
- vmovdqa32 $a_,$a
- vmovdqa32 $b_,$b
- jnz .Loop_outer_avx512vl
-
- jmp .Ldone_avx512vl
-
-.align 16
-.Ltail64_avx512vl:
- vmovdqa %x#$a,0x00(%rsp)
- vmovdqa %x#$b,0x10(%rsp)
- vmovdqa %x#$c,0x20(%rsp)
- vmovdqa %x#$d,0x30(%rsp)
- add \$64,$len
- jmp .Loop_tail_avx512vl
-
-.align 16
-.Ltail_avx512vl:
- vmovdqa $t0,0x00(%rsp)
- vmovdqa $t1,0x10(%rsp)
- vmovdqa $t2,0x20(%rsp)
- vmovdqa $t3,0x30(%rsp)
- add \$64,$len
-
-.Loop_tail_avx512vl:
- movzb ($inp,$counter),%eax
- movzb (%rsp,$counter),%ecx
- lea 1($counter),$counter
- xor %ecx,%eax
- mov %al,-1($out,$counter)
- dec $len
- jnz .Loop_tail_avx512vl
-
- vmovdqu32 $a_,0x00(%rsp)
- vmovdqu32 $a_,0x20(%rsp)
-
-.Ldone_avx512vl:
- vzeroall
-___
-$code.=<<___ if ($win64);
- movaps -0x30(%r10),%xmm6
- movaps -0x20(%r10),%xmm7
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.Lavx512vl_epilogue:
- ret
-.cfi_endproc
-___
-&end_function("chacha20_avx512vl");
-
-# This one handles longer inputs...
-
-my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
-my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
-my @key=map("%zmm$_",(16..31));
-my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
-
-sub AVX512_lane_ROUND {
-my ($a0,$b0,$c0,$d0)=@_;
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my @x=map("\"$_\"",@xx);
-
- (
- "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
- "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
- "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
- "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
- "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
- "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
- "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
- "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
- "&vprold (@x[$d0],@x[$d0],16)",
- "&vprold (@x[$d1],@x[$d1],16)",
- "&vprold (@x[$d2],@x[$d2],16)",
- "&vprold (@x[$d3],@x[$d3],16)",
-
- "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
- "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
- "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
- "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
- "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
- "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
- "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
- "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
- "&vprold (@x[$b0],@x[$b0],12)",
- "&vprold (@x[$b1],@x[$b1],12)",
- "&vprold (@x[$b2],@x[$b2],12)",
- "&vprold (@x[$b3],@x[$b3],12)",
-
- "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
- "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
- "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
- "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
- "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
- "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
- "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
- "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
- "&vprold (@x[$d0],@x[$d0],8)",
- "&vprold (@x[$d1],@x[$d1],8)",
- "&vprold (@x[$d2],@x[$d2],8)",
- "&vprold (@x[$d3],@x[$d3],8)",
-
- "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
- "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
- "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
- "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
- "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
- "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
- "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
- "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
- "&vprold (@x[$b0],@x[$b0],7)",
- "&vprold (@x[$b1],@x[$b1],7)",
- "&vprold (@x[$b2],@x[$b2],7)",
- "&vprold (@x[$b3],@x[$b3],7)"
- );
-}
-
-my $xframe = $win64 ? 0xa8 : 8;
-
-$code.=<<___;
-.type chacha20_16x,\@function,5
-.align 32
-chacha20_16x:
-.cfi_startproc
-.Lchacha20_16x:
- lea 8(%rsp),%r10 # frame register
-.cfi_def_cfa_register %r10
- sub \$64+$xframe,%rsp
- and \$-64,%rsp
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,-0xb0(%r10)
- movaps %xmm7,-0xa0(%r10)
- movaps %xmm8,-0x90(%r10)
- movaps %xmm9,-0x80(%r10)
- movaps %xmm10,-0x70(%r10)
- movaps %xmm11,-0x60(%r10)
- movaps %xmm12,-0x50(%r10)
- movaps %xmm13,-0x40(%r10)
- movaps %xmm14,-0x30(%r10)
- movaps %xmm15,-0x20(%r10)
-.L16x_body:
-___
-$code.=<<___;
- vzeroupper
-
- lea .Lsigma(%rip),%r9
- vbroadcasti32x4 (%r9),$xa3 # key[0]
- vbroadcasti32x4 ($key),$xb3 # key[1]
- vbroadcasti32x4 16($key),$xc3 # key[2]
- vbroadcasti32x4 ($counter),$xd3 # key[3]
-
- vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
- vpshufd \$0x55,$xa3,$xa1
- vpshufd \$0xaa,$xa3,$xa2
- vpshufd \$0xff,$xa3,$xa3
- vmovdqa64 $xa0,@key[0]
- vmovdqa64 $xa1,@key[1]
- vmovdqa64 $xa2,@key[2]
- vmovdqa64 $xa3,@key[3]
-
- vpshufd \$0x00,$xb3,$xb0
- vpshufd \$0x55,$xb3,$xb1
- vpshufd \$0xaa,$xb3,$xb2
- vpshufd \$0xff,$xb3,$xb3
- vmovdqa64 $xb0,@key[4]
- vmovdqa64 $xb1,@key[5]
- vmovdqa64 $xb2,@key[6]
- vmovdqa64 $xb3,@key[7]
-
- vpshufd \$0x00,$xc3,$xc0
- vpshufd \$0x55,$xc3,$xc1
- vpshufd \$0xaa,$xc3,$xc2
- vpshufd \$0xff,$xc3,$xc3
- vmovdqa64 $xc0,@key[8]
- vmovdqa64 $xc1,@key[9]
- vmovdqa64 $xc2,@key[10]
- vmovdqa64 $xc3,@key[11]
-
- vpshufd \$0x00,$xd3,$xd0
- vpshufd \$0x55,$xd3,$xd1
- vpshufd \$0xaa,$xd3,$xd2
- vpshufd \$0xff,$xd3,$xd3
- vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet
- vmovdqa64 $xd0,@key[12]
- vmovdqa64 $xd1,@key[13]
- vmovdqa64 $xd2,@key[14]
- vmovdqa64 $xd3,@key[15]
-
- mov \$10,%eax
- jmp .Loop16x
-
-.align 32
-.Loop_outer16x:
- vpbroadcastd 0(%r9),$xa0 # reload key
- vpbroadcastd 4(%r9),$xa1
- vpbroadcastd 8(%r9),$xa2
- vpbroadcastd 12(%r9),$xa3
- vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters
- vmovdqa64 @key[4],$xb0
- vmovdqa64 @key[5],$xb1
- vmovdqa64 @key[6],$xb2
- vmovdqa64 @key[7],$xb3
- vmovdqa64 @key[8],$xc0
- vmovdqa64 @key[9],$xc1
- vmovdqa64 @key[10],$xc2
- vmovdqa64 @key[11],$xc3
- vmovdqa64 @key[12],$xd0
- vmovdqa64 @key[13],$xd1
- vmovdqa64 @key[14],$xd2
- vmovdqa64 @key[15],$xd3
-
- vmovdqa64 $xa0,@key[0]
- vmovdqa64 $xa1,@key[1]
- vmovdqa64 $xa2,@key[2]
- vmovdqa64 $xa3,@key[3]
-
- mov \$10,%eax
- jmp .Loop16x
-
-.align 32
-.Loop16x:
-___
- foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
- foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- dec %eax
- jnz .Loop16x
-
- vpaddd @key[0],$xa0,$xa0 # accumulate key
- vpaddd @key[1],$xa1,$xa1
- vpaddd @key[2],$xa2,$xa2
- vpaddd @key[3],$xa3,$xa3
-
- vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
- vpunpckldq $xa3,$xa2,$xt3
- vpunpckhdq $xa1,$xa0,$xa0
- vpunpckhdq $xa3,$xa2,$xa2
- vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
- vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
- vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
-___
- ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
-$code.=<<___;
- vpaddd @key[4],$xb0,$xb0
- vpaddd @key[5],$xb1,$xb1
- vpaddd @key[6],$xb2,$xb2
- vpaddd @key[7],$xb3,$xb3
-
- vpunpckldq $xb1,$xb0,$xt2
- vpunpckldq $xb3,$xb2,$xt3
- vpunpckhdq $xb1,$xb0,$xb0
- vpunpckhdq $xb3,$xb2,$xb2
- vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
- vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
- vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
-___
- ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
-$code.=<<___;
- vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further
- vshufi32x4 \$0xee,$xb0,$xa0,$xb0
- vshufi32x4 \$0x44,$xb1,$xa1,$xa0
- vshufi32x4 \$0xee,$xb1,$xa1,$xb1
- vshufi32x4 \$0x44,$xb2,$xa2,$xa1
- vshufi32x4 \$0xee,$xb2,$xa2,$xb2
- vshufi32x4 \$0x44,$xb3,$xa3,$xa2
- vshufi32x4 \$0xee,$xb3,$xa3,$xb3
-___
- ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
-$code.=<<___;
- vpaddd @key[8],$xc0,$xc0
- vpaddd @key[9],$xc1,$xc1
- vpaddd @key[10],$xc2,$xc2
- vpaddd @key[11],$xc3,$xc3
-
- vpunpckldq $xc1,$xc0,$xt2
- vpunpckldq $xc3,$xc2,$xt3
- vpunpckhdq $xc1,$xc0,$xc0
- vpunpckhdq $xc3,$xc2,$xc2
- vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
- vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
- vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
-___
- ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
-$code.=<<___;
- vpaddd @key[12],$xd0,$xd0
- vpaddd @key[13],$xd1,$xd1
- vpaddd @key[14],$xd2,$xd2
- vpaddd @key[15],$xd3,$xd3
-
- vpunpckldq $xd1,$xd0,$xt2
- vpunpckldq $xd3,$xd2,$xt3
- vpunpckhdq $xd1,$xd0,$xd0
- vpunpckhdq $xd3,$xd2,$xd2
- vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
- vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
- vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
-___
- ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
-$code.=<<___;
- vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further
- vshufi32x4 \$0xee,$xd0,$xc0,$xd0
- vshufi32x4 \$0x44,$xd1,$xc1,$xc0
- vshufi32x4 \$0xee,$xd1,$xc1,$xd1
- vshufi32x4 \$0x44,$xd2,$xc2,$xc1
- vshufi32x4 \$0xee,$xd2,$xc2,$xd2
- vshufi32x4 \$0x44,$xd3,$xc3,$xc2
- vshufi32x4 \$0xee,$xd3,$xc3,$xd3
-___
- ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
-$code.=<<___;
- vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further
- vshufi32x4 \$0xdd,$xc0,$xa0,$xa0
- vshufi32x4 \$0x88,$xd0,$xb0,$xc0
- vshufi32x4 \$0xdd,$xd0,$xb0,$xd0
- vshufi32x4 \$0x88,$xc1,$xa1,$xt1
- vshufi32x4 \$0xdd,$xc1,$xa1,$xa1
- vshufi32x4 \$0x88,$xd1,$xb1,$xc1
- vshufi32x4 \$0xdd,$xd1,$xb1,$xd1
- vshufi32x4 \$0x88,$xc2,$xa2,$xt2
- vshufi32x4 \$0xdd,$xc2,$xa2,$xa2
- vshufi32x4 \$0x88,$xd2,$xb2,$xc2
- vshufi32x4 \$0xdd,$xd2,$xb2,$xd2
- vshufi32x4 \$0x88,$xc3,$xa3,$xt3
- vshufi32x4 \$0xdd,$xc3,$xa3,$xa3
- vshufi32x4 \$0x88,$xd3,$xb3,$xc3
- vshufi32x4 \$0xdd,$xd3,$xb3,$xd3
-___
- ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
- ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
-
- ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
- $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
- ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
-$code.=<<___;
- cmp \$64*16,$len
- jb .Ltail16x
-
- vpxord 0x00($inp),$xa0,$xa0 # xor with input
- vpxord 0x40($inp),$xb0,$xb0
- vpxord 0x80($inp),$xc0,$xc0
- vpxord 0xc0($inp),$xd0,$xd0
- vmovdqu32 $xa0,0x00($out)
- vmovdqu32 $xb0,0x40($out)
- vmovdqu32 $xc0,0x80($out)
- vmovdqu32 $xd0,0xc0($out)
-
- vpxord 0x100($inp),$xa1,$xa1
- vpxord 0x140($inp),$xb1,$xb1
- vpxord 0x180($inp),$xc1,$xc1
- vpxord 0x1c0($inp),$xd1,$xd1
- vmovdqu32 $xa1,0x100($out)
- vmovdqu32 $xb1,0x140($out)
- vmovdqu32 $xc1,0x180($out)
- vmovdqu32 $xd1,0x1c0($out)
-
- vpxord 0x200($inp),$xa2,$xa2
- vpxord 0x240($inp),$xb2,$xb2
- vpxord 0x280($inp),$xc2,$xc2
- vpxord 0x2c0($inp),$xd2,$xd2
- vmovdqu32 $xa2,0x200($out)
- vmovdqu32 $xb2,0x240($out)
- vmovdqu32 $xc2,0x280($out)
- vmovdqu32 $xd2,0x2c0($out)
-
- vpxord 0x300($inp),$xa3,$xa3
- vpxord 0x340($inp),$xb3,$xb3
- vpxord 0x380($inp),$xc3,$xc3
- vpxord 0x3c0($inp),$xd3,$xd3
- lea 0x400($inp),$inp
- vmovdqu32 $xa3,0x300($out)
- vmovdqu32 $xb3,0x340($out)
- vmovdqu32 $xc3,0x380($out)
- vmovdqu32 $xd3,0x3c0($out)
- lea 0x400($out),$out
-
- sub \$64*16,$len
- jnz .Loop_outer16x
-
- jmp .Ldone16x
-
-.align 32
-.Ltail16x:
- xor %r9,%r9
- sub $inp,$out
- cmp \$64*1,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xa0,$xa0 # xor with input
- vmovdqu32 $xa0,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xb0,$xa0
- lea 64($inp),$inp
-
- cmp \$64*2,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xb0,$xb0
- vmovdqu32 $xb0,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xc0,$xa0
- lea 64($inp),$inp
-
- cmp \$64*3,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xc0,$xc0
- vmovdqu32 $xc0,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xd0,$xa0
- lea 64($inp),$inp
-
- cmp \$64*4,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xd0,$xd0
- vmovdqu32 $xd0,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xa1,$xa0
- lea 64($inp),$inp
-
- cmp \$64*5,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xa1,$xa1
- vmovdqu32 $xa1,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xb1,$xa0
- lea 64($inp),$inp
-
- cmp \$64*6,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xb1,$xb1
- vmovdqu32 $xb1,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xc1,$xa0
- lea 64($inp),$inp
-
- cmp \$64*7,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xc1,$xc1
- vmovdqu32 $xc1,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xd1,$xa0
- lea 64($inp),$inp
-
- cmp \$64*8,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xd1,$xd1
- vmovdqu32 $xd1,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xa2,$xa0
- lea 64($inp),$inp
-
- cmp \$64*9,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xa2,$xa2
- vmovdqu32 $xa2,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xb2,$xa0
- lea 64($inp),$inp
-
- cmp \$64*10,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xb2,$xb2
- vmovdqu32 $xb2,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xc2,$xa0
- lea 64($inp),$inp
-
- cmp \$64*11,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xc2,$xc2
- vmovdqu32 $xc2,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xd2,$xa0
- lea 64($inp),$inp
-
- cmp \$64*12,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xd2,$xd2
- vmovdqu32 $xd2,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xa3,$xa0
- lea 64($inp),$inp
-
- cmp \$64*13,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xa3,$xa3
- vmovdqu32 $xa3,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xb3,$xa0
- lea 64($inp),$inp
-
- cmp \$64*14,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xb3,$xb3
- vmovdqu32 $xb3,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xc3,$xa0
- lea 64($inp),$inp
-
- cmp \$64*15,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xc3,$xc3
- vmovdqu32 $xc3,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xd3,$xa0
- lea 64($inp),$inp
-
-.Less_than_64_16x:
- vmovdqa32 $xa0,0x00(%rsp)
- lea ($out,$inp),$out
- and \$63,$len
-
-.Loop_tail16x:
- movzb ($inp,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1($out,%r9)
- dec $len
- jnz .Loop_tail16x
-
- vpxord $xa0,$xa0,$xa0
- vmovdqa32 $xa0,0(%rsp)
-
-.Ldone16x:
- vzeroall
-___
-$code.=<<___ if ($win64);
- movaps -0xb0(%r10),%xmm6
- movaps -0xa0(%r10),%xmm7
- movaps -0x90(%r10),%xmm8
- movaps -0x80(%r10),%xmm9
- movaps -0x70(%r10),%xmm10
- movaps -0x60(%r10),%xmm11
- movaps -0x50(%r10),%xmm12
- movaps -0x40(%r10),%xmm13
- movaps -0x30(%r10),%xmm14
- movaps -0x20(%r10),%xmm15
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.L16x_epilogue:
- ret
-.cfi_endproc
-.size chacha20_16x,.-chacha20_16x
-___
-
-# switch to %ymm domain
-($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15));
-@xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
-@key=map("%ymm$_",(16..31));
-($xt0,$xt1,$xt2,$xt3)=@key[0..3];
-
-$code.=<<___;
-.type chacha20_8xvl,\@function,5
-.align 32
-chacha20_8xvl:
-.cfi_startproc
-.Lchacha20_8xvl:
- lea 8(%rsp),%r10 # frame register
-.cfi_def_cfa_register %r10
- sub \$64+$xframe,%rsp
- and \$-64,%rsp
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,-0xb0(%r10)
- movaps %xmm7,-0xa0(%r10)
- movaps %xmm8,-0x90(%r10)
- movaps %xmm9,-0x80(%r10)
- movaps %xmm10,-0x70(%r10)
- movaps %xmm11,-0x60(%r10)
- movaps %xmm12,-0x50(%r10)
- movaps %xmm13,-0x40(%r10)
- movaps %xmm14,-0x30(%r10)
- movaps %xmm15,-0x20(%r10)
-.L8xvl_body:
-___
-$code.=<<___;
- vzeroupper
-
- lea .Lsigma(%rip),%r9
- vbroadcasti128 (%r9),$xa3 # key[0]
- vbroadcasti128 ($key),$xb3 # key[1]
- vbroadcasti128 16($key),$xc3 # key[2]
- vbroadcasti128 ($counter),$xd3 # key[3]
-
- vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
- vpshufd \$0x55,$xa3,$xa1
- vpshufd \$0xaa,$xa3,$xa2
- vpshufd \$0xff,$xa3,$xa3
- vmovdqa64 $xa0,@key[0]
- vmovdqa64 $xa1,@key[1]
- vmovdqa64 $xa2,@key[2]
- vmovdqa64 $xa3,@key[3]
-
- vpshufd \$0x00,$xb3,$xb0
- vpshufd \$0x55,$xb3,$xb1
- vpshufd \$0xaa,$xb3,$xb2
- vpshufd \$0xff,$xb3,$xb3
- vmovdqa64 $xb0,@key[4]
- vmovdqa64 $xb1,@key[5]
- vmovdqa64 $xb2,@key[6]
- vmovdqa64 $xb3,@key[7]
-
- vpshufd \$0x00,$xc3,$xc0
- vpshufd \$0x55,$xc3,$xc1
- vpshufd \$0xaa,$xc3,$xc2
- vpshufd \$0xff,$xc3,$xc3
- vmovdqa64 $xc0,@key[8]
- vmovdqa64 $xc1,@key[9]
- vmovdqa64 $xc2,@key[10]
- vmovdqa64 $xc3,@key[11]
-
- vpshufd \$0x00,$xd3,$xd0
- vpshufd \$0x55,$xd3,$xd1
- vpshufd \$0xaa,$xd3,$xd2
- vpshufd \$0xff,$xd3,$xd3
- vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
- vmovdqa64 $xd0,@key[12]
- vmovdqa64 $xd1,@key[13]
- vmovdqa64 $xd2,@key[14]
- vmovdqa64 $xd3,@key[15]
-
- mov \$10,%eax
- jmp .Loop8xvl
-
-.align 32
-.Loop_outer8xvl:
- #vpbroadcastd 0(%r9),$xa0 # reload key
- #vpbroadcastd 4(%r9),$xa1
- vpbroadcastd 8(%r9),$xa2
- vpbroadcastd 12(%r9),$xa3
- vpaddd .Leight(%rip),@key[12],@key[12] # next SIMD counters
- vmovdqa64 @key[4],$xb0
- vmovdqa64 @key[5],$xb1
- vmovdqa64 @key[6],$xb2
- vmovdqa64 @key[7],$xb3
- vmovdqa64 @key[8],$xc0
- vmovdqa64 @key[9],$xc1
- vmovdqa64 @key[10],$xc2
- vmovdqa64 @key[11],$xc3
- vmovdqa64 @key[12],$xd0
- vmovdqa64 @key[13],$xd1
- vmovdqa64 @key[14],$xd2
- vmovdqa64 @key[15],$xd3
-
- vmovdqa64 $xa0,@key[0]
- vmovdqa64 $xa1,@key[1]
- vmovdqa64 $xa2,@key[2]
- vmovdqa64 $xa3,@key[3]
-
- mov \$10,%eax
- jmp .Loop8xvl
-
-.align 32
-.Loop8xvl:
-___
- foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
- foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- dec %eax
- jnz .Loop8xvl
-
- vpaddd @key[0],$xa0,$xa0 # accumulate key
- vpaddd @key[1],$xa1,$xa1
- vpaddd @key[2],$xa2,$xa2
- vpaddd @key[3],$xa3,$xa3
-
- vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
- vpunpckldq $xa3,$xa2,$xt3
- vpunpckhdq $xa1,$xa0,$xa0
- vpunpckhdq $xa3,$xa2,$xa2
- vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
- vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
- vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
-___
- ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
-$code.=<<___;
- vpaddd @key[4],$xb0,$xb0
- vpaddd @key[5],$xb1,$xb1
- vpaddd @key[6],$xb2,$xb2
- vpaddd @key[7],$xb3,$xb3
-
- vpunpckldq $xb1,$xb0,$xt2
- vpunpckldq $xb3,$xb2,$xt3
- vpunpckhdq $xb1,$xb0,$xb0
- vpunpckhdq $xb3,$xb2,$xb2
- vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
- vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
- vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
-___
- ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
-$code.=<<___;
- vshufi32x4 \$0,$xb0,$xa0,$xt3 # "de-interlace" further
- vshufi32x4 \$3,$xb0,$xa0,$xb0
- vshufi32x4 \$0,$xb1,$xa1,$xa0
- vshufi32x4 \$3,$xb1,$xa1,$xb1
- vshufi32x4 \$0,$xb2,$xa2,$xa1
- vshufi32x4 \$3,$xb2,$xa2,$xb2
- vshufi32x4 \$0,$xb3,$xa3,$xa2
- vshufi32x4 \$3,$xb3,$xa3,$xb3
-___
- ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
-$code.=<<___;
- vpaddd @key[8],$xc0,$xc0
- vpaddd @key[9],$xc1,$xc1
- vpaddd @key[10],$xc2,$xc2
- vpaddd @key[11],$xc3,$xc3
-
- vpunpckldq $xc1,$xc0,$xt2
- vpunpckldq $xc3,$xc2,$xt3
- vpunpckhdq $xc1,$xc0,$xc0
- vpunpckhdq $xc3,$xc2,$xc2
- vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
- vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
- vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
-___
- ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
-$code.=<<___;
- vpaddd @key[12],$xd0,$xd0
- vpaddd @key[13],$xd1,$xd1
- vpaddd @key[14],$xd2,$xd2
- vpaddd @key[15],$xd3,$xd3
-
- vpunpckldq $xd1,$xd0,$xt2
- vpunpckldq $xd3,$xd2,$xt3
- vpunpckhdq $xd1,$xd0,$xd0
- vpunpckhdq $xd3,$xd2,$xd2
- vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
- vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
- vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
-___
- ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
-$code.=<<___;
- vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
- vperm2i128 \$0x31,$xd0,$xc0,$xd0
- vperm2i128 \$0x20,$xd1,$xc1,$xc0
- vperm2i128 \$0x31,$xd1,$xc1,$xd1
- vperm2i128 \$0x20,$xd2,$xc2,$xc1
- vperm2i128 \$0x31,$xd2,$xc2,$xd2
- vperm2i128 \$0x20,$xd3,$xc3,$xc2
- vperm2i128 \$0x31,$xd3,$xc3,$xd3
-___
- ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
- ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
- ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
-$code.=<<___;
- cmp \$64*8,$len
- jb .Ltail8xvl
-
- mov \$0x80,%eax # size optimization
- vpxord 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- lea ($inp,%rax),$inp # size optimization
- vmovdqu32 $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- lea ($out,%rax),$out # size optimization
-
- vpxor 0x00($inp),$xa1,$xa1
- vpxor 0x20($inp),$xb1,$xb1
- vpxor 0x40($inp),$xc1,$xc1
- vpxor 0x60($inp),$xd1,$xd1
- lea ($inp,%rax),$inp # size optimization
- vmovdqu $xa1,0x00($out)
- vmovdqu $xb1,0x20($out)
- vmovdqu $xc1,0x40($out)
- vmovdqu $xd1,0x60($out)
- lea ($out,%rax),$out # size optimization
-
- vpxord 0x00($inp),$xa2,$xa2
- vpxor 0x20($inp),$xb2,$xb2
- vpxor 0x40($inp),$xc2,$xc2
- vpxor 0x60($inp),$xd2,$xd2
- lea ($inp,%rax),$inp # size optimization
- vmovdqu32 $xa2,0x00($out)
- vmovdqu $xb2,0x20($out)
- vmovdqu $xc2,0x40($out)
- vmovdqu $xd2,0x60($out)
- lea ($out,%rax),$out # size optimization
-
- vpxor 0x00($inp),$xa3,$xa3
- vpxor 0x20($inp),$xb3,$xb3
- vpxor 0x40($inp),$xc3,$xc3
- vpxor 0x60($inp),$xd3,$xd3
- lea ($inp,%rax),$inp # size optimization
- vmovdqu $xa3,0x00($out)
- vmovdqu $xb3,0x20($out)
- vmovdqu $xc3,0x40($out)
- vmovdqu $xd3,0x60($out)
- lea ($out,%rax),$out # size optimization
-
- vpbroadcastd 0(%r9),%ymm0 # reload key
- vpbroadcastd 4(%r9),%ymm1
-
- sub \$64*8,$len
- jnz .Loop_outer8xvl
-
- jmp .Ldone8xvl
-
-.align 32
-.Ltail8xvl:
- vmovdqa64 $xa0,%ymm8 # size optimization
-___
-$xa0 = "%ymm8";
-$code.=<<___;
- xor %r9,%r9
- sub $inp,$out
- cmp \$64*1,$len
- jb .Less_than_64_8xvl
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vmovdqu $xa0,0x00($out,$inp)
- vmovdqu $xb0,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa $xc0,$xa0
- vmovdqa $xd0,$xb0
- lea 64($inp),$inp
-
- cmp \$64*2,$len
- jb .Less_than_64_8xvl
- vpxor 0x00($inp),$xc0,$xc0
- vpxor 0x20($inp),$xd0,$xd0
- vmovdqu $xc0,0x00($out,$inp)
- vmovdqu $xd0,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa $xa1,$xa0
- vmovdqa $xb1,$xb0
- lea 64($inp),$inp
-
- cmp \$64*3,$len
- jb .Less_than_64_8xvl
- vpxor 0x00($inp),$xa1,$xa1
- vpxor 0x20($inp),$xb1,$xb1
- vmovdqu $xa1,0x00($out,$inp)
- vmovdqu $xb1,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa $xc1,$xa0
- vmovdqa $xd1,$xb0
- lea 64($inp),$inp
-
- cmp \$64*4,$len
- jb .Less_than_64_8xvl
- vpxor 0x00($inp),$xc1,$xc1
- vpxor 0x20($inp),$xd1,$xd1
- vmovdqu $xc1,0x00($out,$inp)
- vmovdqu $xd1,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa32 $xa2,$xa0
- vmovdqa $xb2,$xb0
- lea 64($inp),$inp
-
- cmp \$64*5,$len
- jb .Less_than_64_8xvl
- vpxord 0x00($inp),$xa2,$xa2
- vpxor 0x20($inp),$xb2,$xb2
- vmovdqu32 $xa2,0x00($out,$inp)
- vmovdqu $xb2,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa $xc2,$xa0
- vmovdqa $xd2,$xb0
- lea 64($inp),$inp
-
- cmp \$64*6,$len
- jb .Less_than_64_8xvl
- vpxor 0x00($inp),$xc2,$xc2
- vpxor 0x20($inp),$xd2,$xd2
- vmovdqu $xc2,0x00($out,$inp)
- vmovdqu $xd2,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa $xa3,$xa0
- vmovdqa $xb3,$xb0
- lea 64($inp),$inp
-
- cmp \$64*7,$len
- jb .Less_than_64_8xvl
- vpxor 0x00($inp),$xa3,$xa3
- vpxor 0x20($inp),$xb3,$xb3
- vmovdqu $xa3,0x00($out,$inp)
- vmovdqu $xb3,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa $xc3,$xa0
- vmovdqa $xd3,$xb0
- lea 64($inp),$inp
-
-.Less_than_64_8xvl:
- vmovdqa $xa0,0x00(%rsp)
- vmovdqa $xb0,0x20(%rsp)
- lea ($out,$inp),$out
- and \$63,$len
-
-.Loop_tail8xvl:
- movzb ($inp,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1($out,%r9)
- dec $len
- jnz .Loop_tail8xvl
-
- vpxor $xa0,$xa0,$xa0
- vmovdqa $xa0,0x00(%rsp)
- vmovdqa $xa0,0x20(%rsp)
-
-.Ldone8xvl:
- vzeroall
-___
-$code.=<<___ if ($win64);
- movaps -0xb0(%r10),%xmm6
- movaps -0xa0(%r10),%xmm7
- movaps -0x90(%r10),%xmm8
- movaps -0x80(%r10),%xmm9
- movaps -0x70(%r10),%xmm10
- movaps -0x60(%r10),%xmm11
- movaps -0x50(%r10),%xmm12
- movaps -0x40(%r10),%xmm13
- movaps -0x30(%r10),%xmm14
- movaps -0x20(%r10),%xmm15
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.L8xvl_epilogue:
- ret
-.cfi_endproc
-.size chacha20_8xvl,.-chacha20_8xvl
-___
-if($kernel) {
- $code .= "#endif\n";
-}
-}
-
-# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-# CONTEXT *context,DISPATCHER_CONTEXT *disp)
-if ($win64) {
-$rec="%rcx";
-$frame="%rdx";
-$context="%r8";
-$disp="%r9";
-
-$code.=<<___;
-.extern __imp_RtlVirtualUnwind
-.type se_handler,\@abi-omnipotent
-.align 16
-se_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
-
- lea .Lctr32_body(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip<.Lctr32_body
- jb .Lcommon_seh_tail
-
- mov 152($context),%rax # pull context->Rsp
-
- lea .Lno_data(%rip),%r10 # epilogue label
-	cmp	%r10,%rbx		# context->Rip>=.Lno_data (epilogue)
- jae .Lcommon_seh_tail
-
- lea 64+24+48(%rax),%rax
-
- mov -8(%rax),%rbx
- mov -16(%rax),%rbp
- mov -24(%rax),%r12
- mov -32(%rax),%r13
- mov -40(%rax),%r14
- mov -48(%rax),%r15
- mov %rbx,144($context) # restore context->Rbx
- mov %rbp,160($context) # restore context->Rbp
- mov %r12,216($context) # restore context->R12
- mov %r13,224($context) # restore context->R13
- mov %r14,232($context) # restore context->R14
-	mov	%r15,240($context)	# restore context->R15
-
-.Lcommon_seh_tail:
- mov 8(%rax),%rdi
- mov 16(%rax),%rsi
- mov %rax,152($context) # restore context->Rsp
- mov %rsi,168($context) # restore context->Rsi
- mov %rdi,176($context) # restore context->Rdi
-
- mov 40($disp),%rdi # disp->ContextRecord
- mov $context,%rsi # context
- mov \$154,%ecx # sizeof(CONTEXT)
- .long 0xa548f3fc # cld; rep movsq
-
- mov $disp,%rsi
- xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
- mov 8(%rsi),%rdx # arg2, disp->ImageBase
- mov 0(%rsi),%r8 # arg3, disp->ControlPc
- mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
- mov 40(%rsi),%r10 # disp->ContextRecord
- lea 56(%rsi),%r11 # &disp->HandlerData
- lea 24(%rsi),%r12 # &disp->EstablisherFrame
- mov %r10,32(%rsp) # arg5
- mov %r11,40(%rsp) # arg6
- mov %r12,48(%rsp) # arg7
- mov %rcx,56(%rsp) # arg8, (NULL)
- call *__imp_RtlVirtualUnwind(%rip)
-
- mov \$1,%eax # ExceptionContinueSearch
- add \$64,%rsp
- popfq
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- pop %rdi
- pop %rsi
- ret
-.size se_handler,.-se_handler
-
-.type simd_handler,\@abi-omnipotent
-.align 16
-simd_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
-
- mov 0(%r11),%r10d # HandlerData[0]
- lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<prologue label
- jb .Lcommon_seh_tail
-
- mov 192($context),%rax # pull context->R9
-
- mov 4(%r11),%r10d # HandlerData[1]
- mov 8(%r11),%ecx # HandlerData[2]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=epilogue label
- jae .Lcommon_seh_tail
-
- neg %rcx
- lea -8(%rax,%rcx),%rsi
- lea 512($context),%rdi # &context.Xmm6
- neg %ecx
- shr \$3,%ecx
- .long 0xa548f3fc # cld; rep movsq
-
- jmp .Lcommon_seh_tail
-.size simd_handler,.-simd_handler
-
-.section .pdata
-.align 4
- .rva .LSEH_begin_chacha20_ctr32
- .rva .LSEH_end_chacha20_ctr32
- .rva .LSEH_info_chacha20_ctr32
-
- .rva .LSEH_begin_chacha20_ssse3
- .rva .LSEH_end_chacha20_ssse3
- .rva .LSEH_info_chacha20_ssse3
-
- .rva .LSEH_begin_chacha20_128
- .rva .LSEH_end_chacha20_128
- .rva .LSEH_info_chacha20_128
-
- .rva .LSEH_begin_chacha20_4x
- .rva .LSEH_end_chacha20_4x
- .rva .LSEH_info_chacha20_4x
-___
-$code.=<<___ if ($avx);
- .rva .LSEH_begin_chacha20_xop
- .rva .LSEH_end_chacha20_xop
- .rva .LSEH_info_chacha20_xop
-___
-$code.=<<___ if ($avx>1);
- .rva .LSEH_begin_chacha20_avx2
- .rva .LSEH_end_chacha20_avx2
- .rva .LSEH_info_chacha20_avx2
-___
-$code.=<<___ if ($avx>2);
- .rva .LSEH_begin_chacha20_avx512
- .rva .LSEH_end_chacha20_avx512
- .rva .LSEH_info_chacha20_avx512
-
- .rva .LSEH_begin_chacha20_avx512vl
- .rva .LSEH_end_chacha20_avx512vl
- .rva .LSEH_info_chacha20_avx512vl
-
- .rva .LSEH_begin_chacha20_16x
- .rva .LSEH_end_chacha20_16x
- .rva .LSEH_info_chacha20_16x
-
- .rva .LSEH_begin_chacha20_8xvl
- .rva .LSEH_end_chacha20_8xvl
- .rva .LSEH_info_chacha20_8xvl
-___
-$code.=<<___;
-.section .xdata
-.align 8
-.LSEH_info_chacha20_ctr32:
- .byte 9,0,0,0
- .rva se_handler
-
-.LSEH_info_chacha20_ssse3:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .Lssse3_body,.Lssse3_epilogue
- .long 0x20,0
-
-.LSEH_info_chacha20_128:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .L128_body,.L128_epilogue
- .long 0x60,0
-
-.LSEH_info_chacha20_4x:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .L4x_body,.L4x_epilogue
- .long 0xa0,0
-___
-$code.=<<___ if ($avx);
-.LSEH_info_chacha20_xop:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .L4xop_body,.L4xop_epilogue # HandlerData[]
- .long 0xa0,0
-___
-$code.=<<___ if ($avx>1);
-.LSEH_info_chacha20_avx2:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .L8x_body,.L8x_epilogue # HandlerData[]
- .long 0xa0,0
-___
-$code.=<<___ if ($avx>2);
-.LSEH_info_chacha20_avx512:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[]
- .long 0x20,0
-
-.LSEH_info_chacha20_avx512vl:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .Lavx512vl_body,.Lavx512vl_epilogue # HandlerData[]
- .long 0x20,0
-
-.LSEH_info_chacha20_16x:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .L16x_body,.L16x_epilogue # HandlerData[]
- .long 0xa0,0
-
-.LSEH_info_chacha20_8xvl:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .L8xvl_body,.L8xvl_epilogue # HandlerData[]
- .long 0xa0,0
-___
-}
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/\/\// and !/^$/);
- print;
-}
-close SELF;
-
-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/ge;
-
- s/%x#%[yz]/%x/g; # "down-shift"
-
- if ($kernel) {
- s/(^\.type.*),[0-9]+$/\1/;
- next if /^\.cfi.*/;
- }
-
- print $_,"\n";
-}
-
-close STDOUT;
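
A note on the .Ltail8xvl/.Loop_tail8xvl code removed above: once fewer than 64 bytes remain, the last two live vector registers are spilled to the stack, the leftover bytes are XORed into the output one at a time with %r9 as the byte index, and the stacked keystream is cleared before .Ldone8xvl. A minimal C sketch of that byte-wise tail, assuming a keystream buffer that stands in for the two stack slots (the names below are illustrative, not taken from the file):

#include <stddef.h>

/* Rough C equivalent of .Loop_tail8xvl: XOR the final len (< 64) bytes. */
static void
tail_xor(unsigned char *out, const unsigned char *in,
    const unsigned char *keystream, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		out[i] = in[i] ^ keystream[i];
	/* The assembly then zeroes the stacked keystream (vpxor + vmovdqa). */
}
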
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20.c b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20.c
deleted file mode 100644
index b78f19975b1d..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20.c
+++ /dev/null
@@ -1,238 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * Implementation of the ChaCha20 stream cipher.
- *
- * Information: https://cr.yp.to/chacha.html
- */
-
-#include <zinc/chacha20.h>
-#include "../selftest/run.h"
-#define IS_ENABLED_CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 1
-
-#define IS_ENABLED_CONFIG_64BIT (sizeof(void*) == 8)
-
-void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int len)
-{
- int relalign = 0;
-
- if (!IS_ENABLED_CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) {
- int size = sizeof(unsigned long);
- int d = (((unsigned long)dst ^ (unsigned long)src1) |
- ((unsigned long)dst ^ (unsigned long)src2)) &
- (size - 1);
-
-		relalign = d ? 1 << (ffs(d) - 1) : size; /* ffs() is 1-based */
-
- /*
- * If we care about alignment, process as many bytes as
- * needed to advance dst and src to values whose alignments
- * equal their relative alignment. This will allow us to
- * process the remainder of the input using optimal strides.
- */
- while (((unsigned long)dst & (relalign - 1)) && len > 0) {
- *dst++ = *src1++ ^ *src2++;
- len--;
- }
- }
-
-	while (IS_ENABLED_CONFIG_64BIT && len >= 8 && !(relalign & 7)) {
- *(u64 *)dst = *(const u64 *)src1 ^ *(const u64 *)src2;
- dst += 8;
- src1 += 8;
- src2 += 8;
- len -= 8;
- }
-
- while (len >= 4 && !(relalign & 3)) {
- *(u32 *)dst = *(const u32 *)src1 ^ *(const u32 *)src2;
- dst += 4;
- src1 += 4;
- src2 += 4;
- len -= 4;
- }
-
- while (len >= 2 && !(relalign & 1)) {
- *(u16 *)dst = *(const u16 *)src1 ^ *(const u16 *)src2;
- dst += 2;
- src1 += 2;
- src2 += 2;
- len -= 2;
- }
-
- while (len--)
- *dst++ = *src1++ ^ *src2++;
-}
-
-#if defined(CONFIG_ZINC_ARCH_X86_64)
-#include "chacha20-x86_64-glue.c"
-#elif defined(CONFIG_ZINC_ARCH_ARM) || defined(CONFIG_ZINC_ARCH_ARM64)
-#include "chacha20-arm-glue.c"
-#elif defined(CONFIG_ZINC_ARCH_MIPS)
-#include "chacha20-mips-glue.c"
-#else
-static bool *const chacha20_nobs[] __initconst = { };
-static void __init chacha20_fpu_init(void)
-{
-}
-static inline bool chacha20_arch(struct chacha20_ctx *ctx, u8 *dst,
- const u8 *src, size_t len,
- simd_context_t *simd_context)
-{
- return false;
-}
-static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS],
- const u8 nonce[HCHACHA20_NONCE_SIZE],
- const u8 key[HCHACHA20_KEY_SIZE],
- simd_context_t *simd_context)
-{
- return false;
-}
-#endif
-
-#define QUARTER_ROUND(x, a, b, c, d) ( \
- x[a] += x[b], \
- x[d] = rol32((x[d] ^ x[a]), 16), \
- x[c] += x[d], \
- x[b] = rol32((x[b] ^ x[c]), 12), \
- x[a] += x[b], \
- x[d] = rol32((x[d] ^ x[a]), 8), \
- x[c] += x[d], \
- x[b] = rol32((x[b] ^ x[c]), 7) \
-)
-
-#define C(i, j) ((i) * 4 + (j))
-
-#define DOUBLE_ROUND(x) ( \
- /* Column Round */ \
- QUARTER_ROUND(x, C(0, 0), C(1, 0), C(2, 0), C(3, 0)), \
- QUARTER_ROUND(x, C(0, 1), C(1, 1), C(2, 1), C(3, 1)), \
- QUARTER_ROUND(x, C(0, 2), C(1, 2), C(2, 2), C(3, 2)), \
- QUARTER_ROUND(x, C(0, 3), C(1, 3), C(2, 3), C(3, 3)), \
- /* Diagonal Round */ \
- QUARTER_ROUND(x, C(0, 0), C(1, 1), C(2, 2), C(3, 3)), \
- QUARTER_ROUND(x, C(0, 1), C(1, 2), C(2, 3), C(3, 0)), \
- QUARTER_ROUND(x, C(0, 2), C(1, 3), C(2, 0), C(3, 1)), \
- QUARTER_ROUND(x, C(0, 3), C(1, 0), C(2, 1), C(3, 2)) \
-)
-
-#define TWENTY_ROUNDS(x) ( \
- DOUBLE_ROUND(x), \
- DOUBLE_ROUND(x), \
- DOUBLE_ROUND(x), \
- DOUBLE_ROUND(x), \
- DOUBLE_ROUND(x), \
- DOUBLE_ROUND(x), \
- DOUBLE_ROUND(x), \
- DOUBLE_ROUND(x), \
- DOUBLE_ROUND(x), \
- DOUBLE_ROUND(x) \
-)
-
-static void chacha20_block_generic(struct chacha20_ctx *ctx, __le32 *stream)
-{
- u32 x[CHACHA20_BLOCK_WORDS];
- int i;
-
- for (i = 0; i < ARRAY_SIZE(x); ++i)
- x[i] = ctx->state[i];
-
- TWENTY_ROUNDS(x);
-
- for (i = 0; i < ARRAY_SIZE(x); ++i)
- stream[i] = cpu_to_le32(x[i] + ctx->state[i]);
-
- ctx->counter[0] += 1;
-}
-
-static void chacha20_generic(struct chacha20_ctx *ctx, u8 *out, const u8 *in,
- u32 len)
-{
- __le32 buf[CHACHA20_BLOCK_WORDS];
-
- while (len >= CHACHA20_BLOCK_SIZE) {
- chacha20_block_generic(ctx, buf);
- crypto_xor_cpy(out, in, (u8 *)buf, CHACHA20_BLOCK_SIZE);
- len -= CHACHA20_BLOCK_SIZE;
- out += CHACHA20_BLOCK_SIZE;
- in += CHACHA20_BLOCK_SIZE;
- }
- if (len) {
- chacha20_block_generic(ctx, buf);
- crypto_xor_cpy(out, in, (u8 *)buf, len);
- }
-}
-
-void chacha20(struct chacha20_ctx *ctx, u8 *dst, const u8 *src, u32 len,
- simd_context_t *simd_context)
-{
- if (!chacha20_arch(ctx, dst, src, len, simd_context))
- chacha20_generic(ctx, dst, src, len);
-}
-EXPORT_SYMBOL(chacha20);
-
-static void hchacha20_generic(u32 derived_key[CHACHA20_KEY_WORDS],
- const u8 nonce[HCHACHA20_NONCE_SIZE],
- const u8 key[HCHACHA20_KEY_SIZE])
-{
- u32 x[] = { CHACHA20_CONSTANT_EXPA,
- CHACHA20_CONSTANT_ND_3,
- CHACHA20_CONSTANT_2_BY,
- CHACHA20_CONSTANT_TE_K,
- get_unaligned_le32(key + 0),
- get_unaligned_le32(key + 4),
- get_unaligned_le32(key + 8),
- get_unaligned_le32(key + 12),
- get_unaligned_le32(key + 16),
- get_unaligned_le32(key + 20),
- get_unaligned_le32(key + 24),
- get_unaligned_le32(key + 28),
- get_unaligned_le32(nonce + 0),
- get_unaligned_le32(nonce + 4),
- get_unaligned_le32(nonce + 8),
- get_unaligned_le32(nonce + 12)
- };
-
- TWENTY_ROUNDS(x);
-
- memcpy(derived_key + 0, x + 0, sizeof(u32) * 4);
- memcpy(derived_key + 4, x + 12, sizeof(u32) * 4);
-}
-
-/* Derived key should be 32-bit aligned */
-void hchacha20(u32 derived_key[CHACHA20_KEY_WORDS],
- const u8 nonce[HCHACHA20_NONCE_SIZE],
- const u8 key[HCHACHA20_KEY_SIZE], simd_context_t *simd_context)
-{
- if (!hchacha20_arch(derived_key, nonce, key, simd_context))
- hchacha20_generic(derived_key, nonce, key);
-}
-EXPORT_SYMBOL(hchacha20);
-
-#include "../selftest/chacha20.c"
-
-static bool nosimd __initdata = false;
-
-#ifndef COMPAT_ZINC_IS_A_MODULE
-int __init chacha20_mod_init(void)
-#else
-static int __init mod_init(void)
-#endif
-{
- if (!nosimd)
- chacha20_fpu_init();
- if (!selftest_run("chacha20", chacha20_selftest, chacha20_nobs,
- ARRAY_SIZE(chacha20_nobs)))
- return -ENOTRECOVERABLE;
- return 0;
-}
-
-#ifdef COMPAT_ZINC_IS_A_MODULE
-static void __exit mod_exit(void)
-{
-}
-
-module_init(mod_init);
-module_exit(mod_exit);
-#endif
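
For reference, the QUARTER_ROUND and DOUBLE_ROUND macros deleted above are the standard ChaCha20 round structure: each double round applies the quarter round to the four columns of the 4x4 state and then to its four diagonals, and TWENTY_ROUNDS repeats that ten times. A self-contained C sketch of the same operation, with rol32 written out inline since the kernel helper is not part of this file:

#include <stdint.h>

static uint32_t rol32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

/* Same steps as the QUARTER_ROUND(x, a, b, c, d) macro above. */
static void quarter_round(uint32_t x[16], int a, int b, int c, int d)
{
	x[a] += x[b]; x[d] = rol32(x[d] ^ x[a], 16);
	x[c] += x[d]; x[b] = rol32(x[b] ^ x[c], 12);
	x[a] += x[b]; x[d] = rol32(x[d] ^ x[a],  8);
	x[c] += x[d]; x[b] = rol32(x[b] ^ x[c],  7);
}

/* One DOUBLE_ROUND: column rounds, then diagonal rounds (indices i*4 + j). */
static void double_round(uint32_t x[16])
{
	quarter_round(x, 0, 4,  8, 12);	/* columns */
	quarter_round(x, 1, 5,  9, 13);
	quarter_round(x, 2, 6, 10, 14);
	quarter_round(x, 3, 7, 11, 15);
	quarter_round(x, 0, 5, 10, 15);	/* diagonals */
	quarter_round(x, 1, 6, 11, 12);
	quarter_round(x, 2, 7,  8, 13);
	quarter_round(x, 3, 4,  9, 14);
}

Running double_round ten times on a state laid out as in hchacha20_generic above reproduces TWENTY_ROUNDS(x).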