Diffstat (limited to 'crypto/modes/asm')
-rwxr-xr-x  crypto/modes/asm/aes-gcm-armv8_64.pl  | 6097
-rw-r--r--  crypto/modes/asm/aesni-gcm-x86_64.pl  |   12
-rw-r--r--  crypto/modes/asm/ghash-alpha.pl       |  467
-rw-r--r--  crypto/modes/asm/ghash-armv4.pl       |   17
-rw-r--r--  crypto/modes/asm/ghash-c64xplus.pl    |    5
-rwxr-xr-x  crypto/modes/asm/ghash-ia64.pl        |    2
-rw-r--r--  crypto/modes/asm/ghash-parisc.pl      |   11
-rw-r--r--  crypto/modes/asm/ghash-s390x.pl       |   10
-rw-r--r--  crypto/modes/asm/ghash-sparcv9.pl     |   12
-rw-r--r--  crypto/modes/asm/ghash-x86.pl         |    5
-rw-r--r--  crypto/modes/asm/ghash-x86_64.pl      |   18
-rwxr-xr-x  crypto/modes/asm/ghashp8-ppc.pl       |   11
-rw-r--r--  crypto/modes/asm/ghashv8-armx.pl      |   43
13 files changed, 6657 insertions, 53 deletions
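
The new module fuses the AES-CTR and GHASH passes into single kernels per key size. Before the patch body, a hypothetical C harness to fix the calling convention in mind: the prototype is reproduced verbatim in the file header below, and the bits-not-bytes length convention is inferred from the kernel's own `lsr byte_len, bit_length, #3` and its `mov x0, byte_len` return path, so treat it as an assumption rather than documented API.

    /*
     * Hypothetical harness for the kernel added by this patch (illustration
     * only, not OpenSSL's actual glue code).  Assumptions: "len" is in bits,
     * "key" is the expanded AES-128 round-key schedule, and Xi is followed in
     * memory by the precomputed twisted powers of H that the kernel loads at
     * fixed offsets (#32, #64, #80, #112 in the listing below).
     */
    #include <stddef.h>
    #include <stdint.h>

    extern size_t aes_gcm_enc_128_kernel(const unsigned char *in, size_t len,
                                         unsigned char *out, const void *key,
                                         unsigned char ivec[16], uint64_t *Xi);

    size_t gcm_bulk_encrypt(const unsigned char *in, size_t nbytes,
                            unsigned char *out, const void *key,
                            unsigned char ivec[16], uint64_t *Xi)
    {
        if (nbytes == 0)
            return 0;                 /* the kernel also returns 0 for len 0 */
        /* length is passed in bits, mirroring lsr byte_len, bit_length, #3 */
        return aes_gcm_enc_128_kernel(in, nbytes * 8, out, key, ivec, Xi);
    }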
diff --git a/crypto/modes/asm/aes-gcm-armv8_64.pl b/crypto/modes/asm/aes-gcm-armv8_64.pl
new file mode 100755
index 000000000000..302b6f5a8ec8
--- /dev/null
+++ b/crypto/modes/asm/aes-gcm-armv8_64.pl
@@ -0,0 +1,6097 @@
+#! /usr/bin/env perl
+# Copyright 2019-2023 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+#========================================================================
+# Written by Fangming Fang <fangming.fang@arm.com> for the OpenSSL project,
+# derived from https://github.com/ARM-software/AArch64cryptolib, original
+# author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
+# licensed under OpenSSL and CRYPTOGAMS licenses depending on where you
+# obtain it. For further details see http://www.openssl.org/~appro/cryptogams/.
+#========================================================================
+#
+# Approach - assume we don't want to reload constants, so reserve ~half of
+# the vector register file for constants.
+#
+# The main loop acts on 4 16B blocks per iteration, then does a modulo
+# reduction of the accumulated intermediate hashes from the 4 blocks.
+#
+#  ____________________________________________________
+# |                                                    |
+# | PRE                                                |
+# |____________________________________________________|
+# |                |                |                  |
+# | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
+# |________________|________________|__________________|
+# |                |                |                  |
+# | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
+# |________________|________________|__________________|
+# |                |                |                  |
+# | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
+# |________________|________________|__________________|
+# |                |                |                  |
+# | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
+# |________________|____(mostly)____|__________________|
+# |                                                    |
+# | MODULO                                             |
+# |____________________________________________________|
+#
+# PRE:
+#     Ensure the previously generated intermediate hash is aligned and
+#     merged with the result for GHASH 4k+0
+#     EXT low_acc, low_acc, low_acc, #8
+#     EOR res_curr (4k+0), res_curr (4k+0), low_acc
+#
+# CTR block:
+#     Increment and byte reverse the counter in scalar registers and
+#     transfer to SIMD registers
+#     REV ctr32, rev_ctr32
+#     ORR ctr64, constctr96_top32, ctr32, LSL #32
+#     INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF
+#     INS ctr_next.d[1], ctr64
+#     ADD rev_ctr32, #1
+#
+# AES block:
+#     Do AES encryption/decryption on CTR block X and EOR it with input
+#     block X. Take a 256-bit key below for example.
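As a C-level cross-check of the CTR block sequence just listed, and of the E_K(counter) XOR input shape that the Encrypt/Decrypt listings below expand on, a minimal sketch; it assumes a little-endian host, and aes128_encrypt_block is a placeholder rather than an OpenSSL API:

    #include <stdint.h>
    #include <string.h>

    /* CTR block construction: a fixed 96-bit IV part plus a 32-bit
     * big-endian counter held host-order in rev_ctr32 (little-endian host
     * assumed, matching the REV/ORR/INS sequence above). */
    static uint32_t rev32(uint32_t x)                      /* REV */
    {
        return (x >> 24) | ((x >> 8) & 0x0000ff00) |
               ((x << 8) & 0x00ff0000) | (x << 24);
    }

    static void next_ctr_block(uint8_t block[16], uint64_t ctr96_bottom64,
                               uint32_t ctr96_top32, uint32_t *rev_ctr32)
    {
        /* ORR ctr64, constctr96_top32, ctr32, LSL #32 */
        uint64_t ctr64 = (uint64_t)ctr96_top32 |
                         ((uint64_t)rev32(*rev_ctr32) << 32);
        memcpy(block,     &ctr96_bottom64, 8);             /* INS ctr_next.d[0] */
        memcpy(block + 8, &ctr64,          8);             /* INS ctr_next.d[1] */
        (*rev_ctr32)++;                                    /* ADD rev_ctr32, #1 */
    }

    /* AES block step: out = E_K(counter block) XOR in.  aes128_encrypt_block
     * is a hypothetical placeholder for a real AES block encryption. */
    extern void aes128_encrypt_block(uint8_t out[16], const uint8_t in[16],
                                     const void *round_keys);

    static void ctr_encrypt_block(uint8_t out[16], const uint8_t in[16],
                                  const uint8_t ctr_block[16], const void *rk)
    {
        uint8_t ks[16];
        aes128_encrypt_block(ks, ctr_block, rk);
        for (int i = 0; i < 16; i++)
            out[i] = in[i] ^ ks[i];                        /* EOR res, res, ctr */
    }

The assembly's "small trick" described next is a register-pressure optimisation of the same XOR: the last round key is folded into scalar EORs so the SIMD file keeps one more register free.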
+# Doing small trick here of loading input in scalar registers, EORing with last key and then transferring +# Given we are very constrained in our ASIMD registers this is quite important +# +# Encrypt: +# LDR input_low, [ input_ptr ], #8 +# LDR input_high, [ input_ptr ], #8 +# EOR input_low, k14_low +# EOR input_high, k14_high +# INS res_curr.d[0], input_low +# INS res_curr.d[1], input_high +# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k13 +# EOR res_curr, res_curr, ctr_curr +# ST1 { res_curr.16b }, [ output_ptr ], #16 +# +# Decrypt: +# AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr +# AESE ctr_curr, k13 +# LDR res_curr, [ input_ptr ], #16 +# EOR res_curr, res_curr, ctr_curr +# MOV output_low, res_curr.d[0] +# MOV output_high, res_curr.d[1] +# EOR output_low, k14_low +# EOR output_high, k14_high +# STP output_low, output_high, [ output_ptr ], #16 +# +# GHASH block X: +# do 128b karatsuba polynomial multiplication on block +# We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b +# +# multiplication: +# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64 +# +# The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies: +# Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64 +# +# There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are +# multiplying with "twisted" powers of H +# +# Note: We can PMULL directly into the acc_x in first GHASH of the loop +# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical +# path latency dominates the performance +# +# This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers +# than indicated here +# REV64 res_curr, res_curr +# INS t_m.d[0], res_curr.d[1] +# EOR t_m.8B, t_m.8B, res_curr.8B +# PMULL2 t_h, res_curr, HX +# PMULL t_l, res_curr, HX +# PMULL t_m, t_m, HX_k +# EOR acc_h, acc_h, t_h +# EOR acc_l, acc_l, t_l +# EOR acc_m, acc_m, t_m +# +# MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them +# There is some complication here because the bit order of GHASH's PMULL is reversed compared to 
elsewhere, so we are doing modulo +# with a reversed constant +# EOR acc_m, acc_m, acc_h +# EOR acc_m, acc_m, acc_l // Finish off karatsuba processing +# PMULL t_mod, acc_h, mod_constant +# EXT acc_h, acc_h, acc_h, #8 +# EOR acc_m, acc_m, acc_h +# EOR acc_m, acc_m, t_mod +# PMULL acc_h, acc_m, mod_constant +# EXT acc_m, acc_m, acc_m, #8 +# EOR acc_l, acc_l, acc_h +# EOR acc_l, acc_l, acc_m + +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +$input_ptr="x0"; #argument block +$bit_length="x1"; +$output_ptr="x2"; +$current_tag="x3"; +$counter="x16"; +$cc="x8"; + +{ +my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); +my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24)); +my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24)); +my ($output_l0,$output_h0)=map("x$_",(6..7)); + +my $ctr32w="w9"; +my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk10_l,$rk10_h,$len)=map("x$_",(9..15)); +my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12)); + +my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7)); +my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7)); +my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7)); +my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7)); + +my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11)); +my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11)); +my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11)); + +my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17)); +my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15)); +my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15)); + +my $t0="v8"; +my $t0d="d8"; + +my ($t1,$t2,$t3)=map("v$_",(28..30)); +my ($t1d,$t2d,$t3d)=map("d$_",(28..30)); + +my $t4="v8"; +my $t4d="d8"; +my $t5="v28"; +my $t5d="d28"; +my $t6="v31"; +my $t6d="d31"; + +my $t7="v4"; +my $t7d="d4"; +my $t8="v29"; +my $t8d="d29"; +my $t9="v30"; +my $t9d="d30"; + +my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7)); +my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7)); +my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7)); + +my $mod_constantd="d8"; +my $mod_constant="v8"; +my $mod_t="v31"; + +my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9)=map("v$_.16b",(18..27)); +my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s)=map("v$_.4s",(18..27)); +my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q)=map("q$_",(18..27)); +my $rk2q1="v20.1q"; +my $rk3q1="v21.1q"; +my $rk4v="v22"; +my $rk4d="d22"; + +$code=<<___; +#include "arm_arch.h" + +#if __ARM_MAX_ARCH__>=8 +___ +$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/); +$code.=<<___ if ($flavour !~ /64/); +.fpu neon +#ifdef __thumb2__ +.syntax unified +.thumb +# define INST(a,b,c,d) $_byte c,0xef,a,b +#else +.code 32 +# define INST(a,b,c,d) $_byte a,b,c,0xf2 +#endif + +.text +___ + +######################################################################################### +# size_t aes_gcm_enc_128_kernel(const unsigned char *in, +# size_t len, +# unsigned char *out, +# const void *key, +# unsigned char ivec[16], +# u64 *Xi); +# +$code.=<<___; +.global aes_gcm_enc_128_kernel +.type aes_gcm_enc_128_kernel,%function 
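
Before the kernel body, the Karatsuba identity quoted in the header comment can be pinned down in plain C. In this sketch clmul64 is a bit-by-bit stand-in for PMULL (written for clarity, not speed), and only the three-multiply identity is modelled; the reversed bit order and twisted powers of H that the comment warns about are out of scope here:

    #include <stdint.h>

    /* 64x64 -> 128 carry-less multiply; stand-in for PMULL. */
    void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
    {
        uint64_t h = 0, l = 0;
        for (int i = 0; i < 64; i++)
            if ((b >> i) & 1) {
                l ^= a << i;
                h ^= i ? a >> (64 - i) : 0;
            }
        *hi = h;
        *lo = l;
    }

    /* Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl))
     *             ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
     * a[1] = Ah, a[0] = Al; result lanes r[3] (most significant) .. r[0]. */
    void karatsuba128(const uint64_t a[2], const uint64_t b[2], uint64_t r[4])
    {
        uint64_t hh_hi, hh_lo, ll_hi, ll_lo, mid_hi, mid_lo;

        clmul64(a[1], b[1], &hh_hi, &hh_lo);                 /* Ah * Bh */
        clmul64(a[0], b[0], &ll_hi, &ll_lo);                 /* Al * Bl */
        clmul64(a[1] ^ a[0], b[1] ^ b[0], &mid_hi, &mid_lo); /* (Ah^Al)*(Bh^Bl) */

        mid_hi ^= hh_hi ^ ll_hi;      /* ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl) */
        mid_lo ^= hh_lo ^ ll_lo;

        r[3] = hh_hi;                 /* high product, shifted in by 128 */
        r[2] = hh_lo ^ mid_hi;        /* mid term shifted by 64 overlaps here */
        r[1] = ll_hi ^ mid_lo;
        r[0] = ll_lo;
    }

A quick sanity check is that karatsuba128 agrees with a schoolbook four-multiply carry-less product on random inputs; the payoff in the kernel is one fewer PMULL per GHASH block.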
+.align 4 +aes_gcm_enc_128_kernel: + cbz x1, .L128_enc_ret + stp x19, x20, [sp, #-112]! + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev $ctr96_b64x, $ctr96_b64x + rev $ctr96_t32x, $ctr96_t32x +#endif + ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10 +#ifdef __AARCH64EB__ + ror $rk10_l, $rk10_l, #32 + ror $rk10_h, $rk10_h, #32 +#endif + ld1 {$acc_lb}, [$current_tag] + ext $acc_lb, $acc_lb, $acc_lb, #8 + rev64 $acc_lb, $acc_lb + lsr $main_end_input_ptr, $bit_length, #3 @ byte_len + mov $len, $main_end_input_ptr + + ld1 {$rk0s}, [$cc], #16 @ load rk0 + add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr + sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 + + lsr $rctr32x, $ctr96_t32x, #32 + ldr $h4q, [$current_tag, #112] @ load h4l | h4h +#ifndef __AARCH64EB__ + ext $h4b, $h4b, $h4b, #8 +#endif + fmov $ctr1d, $ctr96_b64x @ CTR block 1 + rev $rctr32w, $rctr32w @ rev_ctr32 + + add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 + orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w + ld1 {$rk1s}, [$cc], #16 @ load rk1 + + rev $ctr32w, $rctr32w @ CTR block 1 + add $rctr32w, $rctr32w, #1 @ CTR block 1 + fmov $ctr3d, $ctr96_b64x @ CTR block 3 + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 + ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible + + fmov $ctr1.d[1], $ctr32x @ CTR block 1 + rev $ctr32w, $rctr32w @ CTR block 2 + + fmov $ctr2d, $ctr96_b64x @ CTR block 2 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 + add $rctr32w, $rctr32w, #1 @ CTR block 2 + + fmov $ctr2.d[1], $ctr32x @ CTR block 2 + rev $ctr32w, $rctr32w @ CTR block 3 + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 + ld1 {$rk2s}, [$cc], #16 @ load rk2 + + add $rctr32w, $rctr32w, #1 @ CTR block 3 + fmov $ctr3.d[1], $ctr32x @ CTR block 3 + + ldr $h3q, [$current_tag, #80] @ load h3l | h3h +#ifndef __AARCH64EB__ + ext $h3b, $h3b, $h3b, #8 +#endif + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 + ld1 {$rk3s}, [$cc], #16 @ load rk3 + + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 + ldr $h1q, [$current_tag, #32] @ load h1l | h1h +#ifndef __AARCH64EB__ + ext $h1b, $h1b, $h1b, #8 +#endif + + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 + ld1 {$rk4s}, [$cc], #16 @ load rk4 + + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 + ld1 {$rk5s}, [$cc], #16 @ load rk5 + + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 + trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l + + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 + ld1 {$rk6s}, [$cc], #16 @ load rk6 + + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 + ld1 {$rk7s}, [$cc], #16 @ load rk7 + + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 + trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h + + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 + ld1 {$rk8s}, [$cc], #16 @ load rk8 + + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 + ldr $h2q, [$current_tag, #64] @ load h2l | h2h +#ifndef __AARCH64EB__ + ext $h2b, $h2b, $h2b, #8 +#endif + + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 + + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 + eor 
$h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k + + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 + + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 + + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 + ld1 {$rk9s}, [$cc], #16 @ load rk9 + + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 + + and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l + + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 + add $main_end_input_ptr, $main_end_input_ptr, $input_ptr + + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 + cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks + + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 + + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 + + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 + + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 + + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 + + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 + + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 + trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h + + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 + + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 + + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 + + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 + + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 + + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 + + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 + + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 + + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 + + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 + + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 + + aese $ctr2b, $rk9 @ AES block 2 - round 9 + + aese $ctr0b, $rk9 @ AES block 0 - round 9 + + eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k + + aese $ctr1b, $rk9 @ AES block 1 - round 9 + + aese $ctr3b, $rk9 @ AES block 3 - round 9 + b.ge .L128_enc_tail @ handle tail + + ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif + ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l2, $input_l2 + rev $input_h2, $input_h2 +#endif + ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l1, $input_l1 + rev $input_h1, $input_h1 +#endif + ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l3, $input_l3 + rev $input_h3, $input_h3 +#endif + eor $input_l0, $input_l0, $rk10_l @ AES block 0 - round 10 low + eor $input_h0, $input_h0, $rk10_h @ AES block 0 - round 10 high + + eor $input_l2, $input_l2, $rk10_l @ AES block 2 - round 10 low + fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low + + eor $input_l1, $input_l1, $rk10_l @ AES block 1 - round 10 low + eor $input_h2, $input_h2, $rk10_h @ AES block 2 - round 10 high + fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high + + fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low + eor 
$input_h1, $input_h1, $rk10_h @ AES block 1 - round 10 high + + eor $input_l3, $input_l3, $rk10_l @ AES block 3 - round 10 low + fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high + + fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low + eor $input_h3, $input_h3, $rk10_h @ AES block 3 - round 10 high + rev $ctr32w, $rctr32w @ CTR block 4 + + fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 + + eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result + fmov $ctr0d, $ctr96_b64x @ CTR block 4 + add $rctr32w, $rctr32w, #1 @ CTR block 4 + + fmov $ctr0.d[1], $ctr32x @ CTR block 4 + rev $ctr32w, $rctr32w @ CTR block 5 + + eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result + fmov $ctr1d, $ctr96_b64x @ CTR block 5 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 + + add $rctr32w, $rctr32w, #1 @ CTR block 5 + add $input_ptr, $input_ptr, #64 @ AES input_ptr update + fmov $ctr1.d[1], $ctr32x @ CTR block 5 + + fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low + rev $ctr32w, $rctr32w @ CTR block 6 + st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result + + fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 + + add $rctr32w, $rctr32w, #1 @ CTR block 6 + eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result + st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result + + fmov $ctr2d, $ctr96_b64x @ CTR block 6 + cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks + + fmov $ctr2.d[1], $ctr32x @ CTR block 6 + rev $ctr32w, $rctr32w @ CTR block 7 + st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7 + + eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result + st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result + b.ge .L128_enc_prepretail @ do prepretail + + .L128_enc_main_loop: @ main loop start + ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l3, $input_l3 + rev $input_h3, $input_h3 +#endif + rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) + rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) + + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 + fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 + + ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 + rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) + + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 + add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 + fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 + + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 + mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid + + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 + mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid + + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 + eor $res0b, $res0b, $acc_lb @ PRE 1 + + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 + eor $input_h3, $input_h3, $rk10_h @ AES block 4k+3 - round 10 high + + pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high + eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid + ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 + rev $ctr32w, $rctr32w @ CTR block 4k+8 + + eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - 
mid + mov $t0d, $res0.d[1] @ GHASH block 4k - mid + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 + + pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high + add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 + mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid + + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 + + pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low + eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid + + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 + + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 + eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high + + pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low + + pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid + rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) + + pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid + + pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low + ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid + + pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high + eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high + + eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid + mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid + + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 + eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low + + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 + eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low + + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 + eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid + + pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high + + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 + eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high + + pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid + + pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low + movi $mod_constant.8b, #0xc2 + + pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid + eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low + + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 + + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 + shl $mod_constantd, $mod_constantd, #56 @ mod_constant + + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 + eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high + + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 + ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l1, $input_l1 + rev $input_h1, $input_h1 +#endif + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 + eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid + + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 + ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l2, $input_l2 + rev $input_h2, $input_h2 +#endif + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid + eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low + + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 + eor $input_l1, $input_l1, $rk10_l @ AES block 4k+5 - round 10 low + + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 + eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid + + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 + eor 
$input_l3, $input_l3, $rk10_l @ AES block 4k+3 - round 10 low + + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 + eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up + + fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 + fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high + + add $input_ptr, $input_ptr, #64 @ AES input_ptr update + fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low + ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment + + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 + fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low + + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 + eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up + + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 + eor $input_h1, $input_h1, $rk10_h @ AES block 4k+5 - round 10 high + + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 + fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high + + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 + fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high + + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 + cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL + + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 + eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid + + aese $ctr0b, $rk9 @ AES block 4k+4 - round 9 + eor $input_l2, $input_l2, $rk10_l @ AES block 4k+6 - round 10 low + eor $input_h2, $input_h2, $rk10_h @ AES block 4k+6 - round 10 high + + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 + fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low + + aese $ctr1b, $rk9 @ AES block 4k+5 - round 9 + fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high + + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 + eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result + + fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 + + fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 + rev $ctr32w, $rctr32w @ CTR block 4k+9 + eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid + + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 + eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result + + add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 + fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 + + pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low + fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 + rev $ctr32w, $rctr32w @ CTR block 4k+10 + + aese $ctr2b, $rk9 @ AES block 4k+6 - round 9 + st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result + eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 + + aese $ctr3b, $rk9 @ AES block 4k+7 - round 9 + add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 + ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment + fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10 + + eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low + st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result + + fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10 + st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result + rev $ctr32w, $rctr32w @ CTR block 4k+11 + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11 + eor 
$res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result + + eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low + st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result + b.lt .L128_enc_main_loop + + .L128_enc_prepretail: @ PREPRETAIL + rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) + fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 + rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) + + ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 + add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 + fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 + + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 + rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) + + pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low + + rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) + eor $res0b, $res0b, $acc_lb @ PRE 1 + + pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high + + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 + mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid + + pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low + mov $t0d, $res0.d[1] @ GHASH block 4k - mid + + mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid + mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid + + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 + eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid + + eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid + + pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high + eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid + + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 + + pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid + eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low + + pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid + + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 + ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid + + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 + + eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid + mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid + + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 + eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high + + pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid + + pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high + eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid + + pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high + + pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low + + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 + eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high + + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 + + pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low + movi $mod_constant.8b, #0xc2 + + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 + eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low + + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 + + pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid + eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid + + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 + + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 + eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high + + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 + + eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid + shl $mod_constantd, $mod_constantd, #56 @ mod_constant + + aese $ctr1b, $rk3 
\n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 + eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low + + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 + + pmull $t1.1q, $acc_h.1d, $mod_constant.1d + eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up + + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 + + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 + ext $acc_hb, $acc_hb, $acc_hb, #8 + + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 + + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 + eor $acc_mb, $acc_mb, $acc_lb + + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 + + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 + + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 + + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 + eor $acc_mb, $acc_mb, $t1.16b + + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 + + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 + + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 + + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 + eor $acc_mb, $acc_mb, $acc_hb + + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 + + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 + + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 + + pmull $t1.1q, $acc_m.1d, $mod_constant.1d + + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 + ext $acc_mb, $acc_mb, $acc_mb, #8 + + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 + + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 + eor $acc_lb, $acc_lb, $t1.16b + + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 + + aese $ctr3b, $rk9 @ AES block 4k+7 - round 9 + + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 + + aese $ctr0b, $rk9 @ AES block 4k+4 - round 9 + + aese $ctr1b, $rk9 @ AES block 4k+5 - round 9 + eor $acc_lb, $acc_lb, $acc_mb + + aese $ctr2b, $rk9 @ AES block 4k+6 - round 9 + .L128_enc_tail: @ TAIL + + sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process + ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif + cmp $main_end_input_ptr, #48 + + ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag + eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low + eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high + + fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low + + fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high + + eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result + + b.gt .L128_enc_blocks_more_than_3 + + sub $rctr32w, $rctr32w, #1 + movi $acc_l.8b, #0 + mov $ctr3b, $ctr2b + + cmp $main_end_input_ptr, #32 + mov $ctr2b, $ctr1b + movi $acc_h.8b, #0 + + movi $acc_m.8b, #0 + b.gt .L128_enc_blocks_more_than_2 + + mov $ctr3b, $ctr1b + cmp $main_end_input_ptr, #16 + + sub $rctr32w, $rctr32w, #1 + b.gt .L128_enc_blocks_more_than_1 + + sub $rctr32w, $rctr32w, #1 + b .L128_enc_blocks_less_than_1 + .L128_enc_blocks_more_than_3: @ blocks left > 3 + st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result + + ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load 
input low & high +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif + rev64 $res0b, $res1b @ GHASH final-3 block + + eor $res0b, $res0b, $t0.16b @ feed in partial tag + eor $input_h0, $input_h0, $rk10_h @ AES final-2 block - round 10 high + eor $input_l0, $input_l0, $rk10_l @ AES final-2 block - round 10 low + + fmov $res1d, $input_l0 @ AES final-2 block - mov low + + movi $t0.8b, #0 @ suppress further partial tag feed in + fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high + + pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low + mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid + + pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high + + mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid + + eor $res1b, $res1b, $ctr1b @ AES final-2 block - result + eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid + + pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid + .L128_enc_blocks_more_than_2: @ blocks left > 2 + + st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result + + rev64 $res0b, $res1b @ GHASH final-2 block + ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif + eor $res0b, $res0b, $t0.16b @ feed in partial tag + + eor $input_l0, $input_l0, $rk10_l @ AES final-1 block - round 10 low + + fmov $res1d, $input_l0 @ AES final-1 block - mov low + eor $input_h0, $input_h0, $rk10_h @ AES final-1 block - round 10 high + + pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high + fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high + + mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid + + pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low + + eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high + + eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid + + eor $res1b, $res1b, $ctr2b @ AES final-1 block - result + + eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low + + pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid + + movi $t0.8b, #0 @ suppress further partial tag feed in + + eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid + .L128_enc_blocks_more_than_1: @ blocks left > 1 + + st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result + + rev64 $res0b, $res1b @ GHASH final-1 block + ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif + eor $res0b, $res0b, $t0.16b @ feed in partial tag + + eor $input_h0, $input_h0, $rk10_h @ AES final block - round 10 high + eor $input_l0, $input_l0, $rk10_l @ AES final block - round 10 low + + fmov $res1d, $input_l0 @ AES final block - mov low + + pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high + fmov $res1.d[1], $input_h0 @ AES final block - mov high + + mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid + + pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low + + eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid + + eor $res1b, $res1b, $ctr3b @ AES final block - result + + ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid + + pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid + + eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low + + eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high + + eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid + movi $t0.8b, #0 @ suppress further partial tag feed in + 
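
The tail below ends with the same MODULO sequence the header comment lists. Transcribed over 64-bit lanes, with EXT #8 as a half swap, PMULL acting on the low halves, and clmul64 reused from the Karatsuba sketch above, it reads as follows (a model of the listed instructions, not of OpenSSL's C code):

    #include <stdint.h>

    void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo);

    typedef struct { uint64_t d[2]; } v128;  /* d[0] = low lane, d[1] = high */

    static v128 eor128(v128 a, v128 b)
    {
        v128 r = { { a.d[0] ^ b.d[0], a.d[1] ^ b.d[1] } };
        return r;
    }

    static v128 ext8(v128 a)                 /* EXT v, v, v, #8: swap halves */
    {
        v128 r = { { a.d[1], a.d[0] } };
        return r;
    }

    static v128 pmull_low(v128 a, uint64_t k) /* PMULL x.1q, a.1d, k.1d */
    {
        v128 r;
        clmul64(a.d[0], k, &r.d[1], &r.d[0]);
        return r;
    }

    /* MODULO: fold the 256-bit Karatsuba accumulators (acc_h:acc_m:acc_l)
     * back to 128 bits with the reversed constant, step for step as in the
     * header comment. */
    v128 ghash_modulo(v128 acc_h, v128 acc_m, v128 acc_l)
    {
        const uint64_t mod_constant = 0xc2ULL << 56;  /* movi #0xc2; shl #56 */
        v128 t_mod, fold;

        acc_m = eor128(eor128(acc_m, acc_h), acc_l);  /* finish Karatsuba */
        t_mod = pmull_low(acc_h, mod_constant);       /* top 64b align with mid */
        acc_h = ext8(acc_h);                          /* other top alignment */
        acc_m = eor128(eor128(acc_m, acc_h), t_mod);  /* fold into mid */
        fold  = pmull_low(acc_m, mod_constant);       /* mid 64b align with low */
        acc_m = ext8(acc_m);                          /* other mid alignment */
        return eor128(eor128(acc_l, fold), acc_m);    /* fold into low */
    }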
.L128_enc_blocks_less_than_1: @ blocks left <= 1 + + and $bit_length, $bit_length, #127 @ bit_length %= 128 + mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff + + mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff + sub $bit_length, $bit_length, #128 @ bit_length -= 128 + + neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) + + and $bit_length, $bit_length, #127 @ bit_length %= 128 + + lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block + cmp $bit_length, #64 + + csel $input_l0, $rk10_l, $rk10_h, lt + csel $input_h0, $rk10_h, xzr, lt + + fmov $ctr0d, $input_l0 @ ctr0b is mask for last block + + fmov $ctr0.d[1], $input_h0 + + and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits + + rev64 $res0b, $res1b @ GHASH final block + + eor $res0b, $res0b, $t0.16b @ feed in partial tag + + mov $t0d, $res0.d[1] @ GHASH final block - mid + + pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low + ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored + + eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid +#ifndef __AARCH64EB__ + rev $ctr32w, $rctr32w +#else + mov $ctr32w, $rctr32w +#endif + pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high + + pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid + + eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low + + eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high + + eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid + movi $mod_constant.8b, #0xc2 + + eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up + + shl $mod_constantd, $mod_constantd, #56 @ mod_constant + + eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up + + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid + + ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment + + eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid + + eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid + + pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low + + ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment + + bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing + + eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low + st1 { $res1b}, [$output_ptr] @ store all 16B + + str $ctr32w, [$counter, #12] @ store the updated counter + + eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low + ext $acc_lb, $acc_lb, $acc_lb, #8 + rev64 $acc_lb, $acc_lb + mov x0, $len + st1 { $acc_l.16b }, [$current_tag] + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp x19, x20, [sp], #112 + ret + +.L128_enc_ret: + mov w0, #0x0 + ret +.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel +___ + +######################################################################################### +# size_t aes_gcm_dec_128_kernel(const unsigned char *in, +# size_t len, +# unsigned char *out, +# const void *key, +# unsigned char ivec[16], +# u64 *Xi); +# +$code.=<<___; +.global aes_gcm_dec_128_kernel +.type aes_gcm_dec_128_kernel,%function +.align 4 +aes_gcm_dec_128_kernel: + cbz x1, .L128_dec_ret + stp x19, x20, [sp, #-112]! 
+ mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + lsr $main_end_input_ptr, $bit_length, #3 @ byte_len + mov $len, $main_end_input_ptr + ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev $ctr96_b64x, $ctr96_b64x + rev $ctr96_t32x, $ctr96_t32x +#endif + ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10 +#ifdef __AARCH64EB__ + ror $rk10_h, $rk10_h, 32 + ror $rk10_l, $rk10_l, 32 +#endif + sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 + ld1 {$rk0s}, [$cc], #16 @ load rk0 + + and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible + + ldr $h2q, [$current_tag, #64] @ load h2l | h2h +#ifndef __AARCH64EB__ + ext $h2b, $h2b, $h2b, #8 +#endif + lsr $rctr32x, $ctr96_t32x, #32 + fmov $ctr2d, $ctr96_b64x @ CTR block 2 + + ld1 {$rk1s}, [$cc], #16 @ load rk1 + orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w + rev $rctr32w, $rctr32w @ rev_ctr32 + + fmov $ctr1d, $ctr96_b64x @ CTR block 1 + add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 + + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 + rev $ctr32w, $rctr32w @ CTR block 1 + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 + ld1 {$rk2s}, [$cc], #16 @ load rk2 + add $rctr32w, $rctr32w, #1 @ CTR block 1 + + fmov $ctr1.d[1], $ctr32x @ CTR block 1 + rev $ctr32w, $rctr32w @ CTR block 2 + add $rctr32w, $rctr32w, #1 @ CTR block 2 + + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 + + fmov $ctr2.d[1], $ctr32x @ CTR block 2 + rev $ctr32w, $rctr32w @ CTR block 3 + + fmov $ctr3d, $ctr96_b64x @ CTR block 3 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 + add $rctr32w, $rctr32w, #1 @ CTR block 3 + + fmov $ctr3.d[1], $ctr32x @ CTR block 3 + add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr + + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 + ld1 {$rk3s}, [$cc], #16 @ load rk3 + + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 + ld1 {$rk4s}, [$cc], #16 @ load rk4 + + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 + ld1 {$rk5s}, [$cc], #16 @ load rk5 + + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 + ld1 {$rk6s}, [$cc], #16 @ load rk6 + + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 + + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 + + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 + + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 + ld1 { $acc_lb}, [$current_tag] + ext $acc_lb, $acc_lb, $acc_lb, #8 + rev64 $acc_lb, $acc_lb + + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 + ld1 {$rk7s}, [$cc], #16 @ load rk7 + + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 + + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 + + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 + ld1 {$rk8s}, [$cc], #16 @ load rk8 + + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 + + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 + + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 + 
ldr $h3q, [$current_tag, #80] @ load h3l | h3h +#ifndef __AARCH64EB__ + ext $h3b, $h3b, $h3b, #8 +#endif + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 + ld1 {$rk9s}, [$cc], #16 @ load rk9 + + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 + + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 + + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 + + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 + + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 + ldr $h1q, [$current_tag, #32] @ load h1l | h1h +#ifndef __AARCH64EB__ + ext $h1b, $h1b, $h1b, #8 +#endif + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 + + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 + + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 + + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 + + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 + trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h + + ldr $h4q, [$current_tag, #112] @ load h4l | h4h +#ifndef __AARCH64EB__ + ext $h4b, $h4b, $h4b, #8 +#endif + trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l + add $main_end_input_ptr, $main_end_input_ptr, $input_ptr + + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 + + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 + + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 + eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k + + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 + + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 + trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l + + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 + + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 + + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 + trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h + + aese $ctr2b, $rk9 @ AES block 2 - round 9 + + aese $ctr3b, $rk9 @ AES block 3 - round 9 + + aese $ctr0b, $rk9 @ AES block 0 - round 9 + cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks + + aese $ctr1b, $rk9 @ AES block 1 - round 9 + eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k + b.ge .L128_dec_tail @ handle tail + + ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0 - load ciphertext; AES block 1 - load ciphertext + + eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result + ld1 {$res2b}, [$input_ptr], #16 @ AES block 2 - load ciphertext + + eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result + rev64 $res0b, $res0b @ GHASH block 0 + rev $ctr32w, $rctr32w @ CTR block 4 + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 + add $rctr32w, $rctr32w, #1 @ CTR block 4 + ld1 {$res3b}, [$input_ptr], #16 @ AES block 3 - load ciphertext + + rev64 $res1b, $res1b @ GHASH block 1 + mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low + + mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high + + mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low + cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks + + mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high + + fmov $ctr0d, $ctr96_b64x @ CTR block 4 + + fmov $ctr0.d[1], $ctr32x @ CTR block 4 + rev $ctr32w, $rctr32w @ CTR block 5 + eor $output_l1, $output_l1, $rk10_l @ AES block 1 - round 10 low +#ifdef __AARCH64EB__ + rev $output_l1, $output_l1 +#endif + fmov $ctr1d, $ctr96_b64x @ CTR block 5 + add $rctr32w, $rctr32w, #1 @ CTR block 5 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR 
block 5 + + fmov $ctr1.d[1], $ctr32x @ CTR block 5 + rev $ctr32w, $rctr32w @ CTR block 6 + add $rctr32w, $rctr32w, #1 @ CTR block 6 + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 + + eor $output_h1, $output_h1, $rk10_h @ AES block 1 - round 10 high +#ifdef __AARCH64EB__ + rev $output_h1, $output_h1 +#endif + eor $output_l0, $output_l0, $rk10_l @ AES block 0 - round 10 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif + eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result + + eor $output_h0, $output_h0, $rk10_h @ AES block 0 - round 10 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif + stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result + + stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result + b.ge .L128_dec_prepretail @ do prepretail + + .L128_dec_main_loop: @ main loop start + eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result + ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 + mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low + + pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high + mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high + + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 + fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 + + rev64 $res2b, $res2b @ GHASH block 4k+2 + fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 + rev $ctr32w, $rctr32w @ CTR block 4k+7 + + mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low + eor $res0b, $res0b, $acc_lb @ PRE 1 + mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid + + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 + rev64 $res3b, $res3b @ GHASH block 4k+3 + + pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low + mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 + + pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low + fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 + eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid + + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 + fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 + + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 + mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid + + pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high + eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low + + pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low + + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 + mov $t0d, $res0.d[1] @ GHASH block 4k - mid + + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 + eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high + + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 + + pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low + eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid + + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 + eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low +#ifdef __AARCH64EB__ + rev $output_l3, $output_l3 +#endif + pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid + eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high +#ifdef __AARCH64EB__ + rev $output_h2, $output_h2 +#endif + mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid + + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 + eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low + + pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid + + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 + 
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid + + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 + + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 + eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid + + pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high + + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 + ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid + + pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high + + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 + mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid + + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 + eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high + + pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid + eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high +#ifdef __AARCH64EB__ + rev $output_h3, $output_h3 +#endif + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 + eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid + + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 + eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low +#ifdef __AARCH64EB__ + rev $output_l2, $output_l2 +#endif + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 + movi $mod_constant.8b, #0xc2 + + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 + eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low + + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 + + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 + eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid + + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 + stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result + + pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid + eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high + ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+3 - load ciphertext + + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 + add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 + + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 + shl $mod_constantd, $mod_constantd, #56 @ mod_constant + + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 + eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid + + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 + stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result + + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 + eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up + + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 + rev $ctr32w, $rctr32w @ CTR block 4k+8 + + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid + ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext + ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment + + aese $ctr0b, $rk9 @ AES block 4k+4 - round 9 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 + + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 + eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up + + aese $ctr1b, $rk9 @ AES block 4k+5 - round 9 + + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 + eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result + + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 + ld1 
{$res2b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext + + add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 + eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid + eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result + + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 + ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext + + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 + + rev64 $res1b, $res1b @ GHASH block 4k+5 + eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid + mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high + + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 + mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low + + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 + fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 + + pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low + fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 + rev $ctr32w, $rctr32w @ CTR block 4k+9 + + aese $ctr2b, $rk9 @ AES block 4k+6 - round 9 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 + ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment + + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 + eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif + eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low + mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high + eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif + eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result + mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low + add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 + + aese $ctr3b, $rk9 @ AES block 4k+7 - round 9 + fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 + cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL + + rev64 $res0b, $res0b @ GHASH block 4k+4 + eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low + fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 + + rev $ctr32w, $rctr32w @ CTR block 4k+10 + add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 + + eor $output_h1, $output_h1, $rk10_h @ AES block 4k+5 - round 10 high +#ifdef __AARCH64EB__ + rev $output_h1, $output_h1 +#endif + stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result + + eor $output_l1, $output_l1, $rk10_l @ AES block 4k+5 - round 10 low +#ifdef __AARCH64EB__ + rev $output_l1, $output_l1 +#endif + stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 + b.lt L128_dec_main_loop + + .L128_dec_prepretail: @ PREPRETAIL + ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 + mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low + mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid + + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 + eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result + + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 + mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high + + eor $res0b, $res0b, $acc_lb @ PRE 1 + fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 + rev64 $res2b, $res2b @ GHASH block 4k+2 + + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 + fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 + + rev $ctr32w, $rctr32w @ CTR block 4k+7 + mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low + eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid + 
+ pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low + mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid + mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high + + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 + mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid + + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 + + pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low + mov $t0d, $res0.d[1] @ GHASH block 4k - mid + fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 + + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 + fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 + + pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid + eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid + + rev64 $res3b, $res3b @ GHASH block 4k+3 + + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 + eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid + + pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high + + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 + ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid + + pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high + + pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid + eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low + + pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low + + pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid + eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high + + eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid + + pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high + + pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high + mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid + + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 + eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid + + pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low + + eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high + movi $mod_constant.8b, #0xc2 + + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 + eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid + + eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low + + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 + eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high + + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 + eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low +#ifdef __AARCH64EB__ + rev $output_l3, $output_l3 +#endif + pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid + eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low +#ifdef __AARCH64EB__ + rev $output_l2, $output_l2 +#endif + eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low + + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 + + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 + shl $mod_constantd, $mod_constantd, #56 @ mod_constant + + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 + + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 + eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid + + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 + + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 + eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up + + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 + + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 
5 + + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 + + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 + eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up + + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid + + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 + ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment + + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 + + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 + eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid + + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 + + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 + + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 + + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 + eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid + + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 + + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 + + aese $ctr1b, $rk9 @ AES block 4k+5 - round 9 + + pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low + eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high +#ifdef __AARCH64EB__ + rev $output_h3, $output_h3 +#endif + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 + ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment + + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 + + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 + eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low + + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 + + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 + eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high +#ifdef __AARCH64EB__ + rev $output_h2, $output_h2 +#endif + aese $ctr0b, $rk9 @ AES block 4k+4 - round 9 + stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result + + aese $ctr2b, $rk9 @ AES block 4k+6 - round 9 + add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 + stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result + + aese $ctr3b, $rk9 @ AES block 4k+7 - round 9 + eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low + .L128_dec_tail: @ TAIL + + sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process + ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext + + eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result + + mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high + + mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low + + cmp $main_end_input_ptr, #48 + + eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif + ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag + eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif + b.gt .L128_dec_blocks_more_than_3 + + mov $ctr3b, $ctr2b + sub $rctr32w, $rctr32w, #1 + movi $acc_l.8b, #0 + + movi $acc_h.8b, #0 + mov $ctr2b, $ctr1b + + movi $acc_m.8b, #0 + cmp $main_end_input_ptr, #32 + b.gt .L128_dec_blocks_more_than_2 + + cmp $main_end_input_ptr, #16 + + mov $ctr3b, $ctr1b + sub $rctr32w, $rctr32w, #1 + b.gt .L128_dec_blocks_more_than_1 + + sub 
$rctr32w, $rctr32w, #1 + b .L128_dec_blocks_less_than_1 + .L128_dec_blocks_more_than_3: @ blocks left > 3 + rev64 $res0b, $res1b @ GHASH final-3 block + ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext + + eor $res0b, $res0b, $t0.16b @ feed in partial tag + + mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid + stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result + eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result + + mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid + mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high + + pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low + mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low + + pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high + + eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid + + movi $t0.8b, #0 @ suppress further partial tag feed in + eor $output_h0, $output_h0, $rk10_h @ AES final-2 block - round 10 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif + pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid + eor $output_l0, $output_l0, $rk10_l @ AES final-2 block - round 10 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif + .L128_dec_blocks_more_than_2: @ blocks left > 2 + + rev64 $res0b, $res1b @ GHASH final-2 block + ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext + + eor $res0b, $res0b, $t0.16b @ feed in partial tag + + eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result + stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result + + mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid + + pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low + + pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high + mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low + + mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high + eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid + + movi $t0.8b, #0 @ suppress further partial tag feed in + + pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid + + eor $output_l0, $output_l0, $rk10_l @ AES final-1 block - round 10 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif + eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low + + eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high + + eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid + eor $output_h0, $output_h0, $rk10_h @ AES final-1 block - round 10 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif + .L128_dec_blocks_more_than_1: @ blocks left > 1 + + rev64 $res0b, $res1b @ GHASH final-1 block + + ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext + eor $res0b, $res0b, $t0.16b @ feed in partial tag + + mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid + + eor $ctr0b, $res1b, $ctr3b @ AES final block - result + + eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid + + stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result + mov $output_l0, $ctr0.d[0] @ AES final block - mov low + + mov $output_h0, $ctr0.d[1] @ AES final block - mov high + ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid + + pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low + + pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high + + pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid + movi $t0.8b, #0 @ suppress further partial tag feed in + + eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low + + eor $acc_hb, $acc_hb, $rk2 @ 
GHASH final-1 block - high + eor $output_h0, $output_h0, $rk10_h @ AES final block - round 10 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif + eor $output_l0, $output_l0, $rk10_l @ AES final block - round 10 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif + eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid + .L128_dec_blocks_less_than_1: @ blocks left <= 1 + + mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff + and $bit_length, $bit_length, #127 @ bit_length %= 128 + + mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff + sub $bit_length, $bit_length, #128 @ bit_length -= 128 + + neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) + + and $bit_length, $bit_length, #127 @ bit_length %= 128 + + lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block + cmp $bit_length, #64 + + csel $ctr96_b64x, $rk10_h, xzr, lt + csel $ctr32x, $rk10_l, $rk10_h, lt + + fmov $ctr0d, $ctr32x @ ctr0b is mask for last block + + mov $ctr0.d[1], $ctr96_b64x + + and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits + + rev64 $res0b, $res1b @ GHASH final block + + eor $res0b, $res0b, $t0.16b @ feed in partial tag + + ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite + + and $output_h0, $output_h0, $ctr96_b64x + + pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high + mov $t0d, $res0.d[1] @ GHASH final block - mid + + eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid + eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high + + pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid + + pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low + bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes + and $output_l0, $output_l0, $ctr32x + +#ifndef __AARCH64EB__ + rev $ctr32w, $rctr32w +#else + mov $ctr32w, $rctr32w +#endif + + eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid + movi $mod_constant.8b, #0xc2 + + eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low + + bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes + shl $mod_constantd, $mod_constantd, #56 @ mod_constant + + eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up + + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid + + eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up + + orr $output_l0, $output_l0, $end_input_ptr + str $ctr32w, [$counter, #12] @ store the updated counter + + orr $output_h0, $output_h0, $main_end_input_ptr + stp $output_l0, $output_h0, [$output_ptr] + ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment + + eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid + + eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid + + pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low + ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment + + eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low + + eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low + ext $acc_lb, $acc_lb, $acc_lb, #8 + rev64 $acc_lb, $acc_lb + mov x0, $len + st1 { $acc_l.16b }, [$current_tag] + + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp x19, x20, [sp], #112 + ret + + .L128_dec_ret: + mov w0, #0x0 + ret +.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel +___ +} + +{ +my 
($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
+my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
+my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
+my ($output_l0,$output_h0)=map("x$_",(6..7));
+
+my $ctr32w="w9";
+my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk12_l,$rk12_h,$len)=map("x$_",(9..15));
+my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
+
+my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
+my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
+my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
+my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
+
+my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
+my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
+my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
+
+my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
+my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
+my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
+
+my $t0="v8";
+my $t0d="d8";
+my $t3="v4";
+my $t3d="d4";
+
+my ($t1,$t2)=map("v$_",(30..31));
+my ($t1d,$t2d)=map("d$_",(30..31));
+
+my $t4="v30";
+my $t4d="d30";
+my $t5="v8";
+my $t5d="d8";
+my $t6="v31";
+my $t6d="d31";
+
+my $t7="v5";
+my $t7d="d5";
+my $t8="v6";
+my $t8d="d6";
+my $t9="v30";
+my $t9d="d30";
+
+my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
+my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
+my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
+
+my $mod_constantd="d8";
+my $mod_constant="v8";
+my $mod_t="v31";
+
+my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11)=map("v$_.16b",(18..29));
+my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q)=map("q$_",(18..29));
+my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s)=map("v$_.4s",(18..29));
+my $rk2q1="v20.1q";
+my $rk3q1="v21.1q";
+my $rk4v="v22";
+my $rk4d="d22";
+
+#########################################################################################
+# size_t aes_gcm_enc_192_kernel(const unsigned char *in,
+#                               size_t len,
+#                               unsigned char *out,
+#                               const void *key,
+#                               unsigned char ivec[16],
+#                               u64 *Xi);
+#
+$code.=<<___;
+.global aes_gcm_enc_192_kernel
+.type aes_gcm_enc_192_kernel,%function
+.align 4
+aes_gcm_enc_192_kernel:
+	cbz	x1, .L192_enc_ret
+	stp	x19, x20, [sp, #-112]!
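+#
+# Calling convention, as the prototype above and the kernel's epilogue
+# imply: the second argument is a length in bits (the code derives
+# byte_len = len >> 3), the byte count processed is returned in x0, and
+# a zero length returns 0 immediately via the cbz. A hedged usage sketch
+# in C, reading u64 as uint64_t; encrypt_bytes is a hypothetical wrapper
+# and the key/Xi layouts are whatever the surrounding GCM code prepared:
+#
+#   #include <stddef.h>
+#   #include <stdint.h>
+#
+#   size_t aes_gcm_enc_192_kernel(const unsigned char *in, size_t len,
+#                                 unsigned char *out, const void *key,
+#                                 unsigned char ivec[16], uint64_t *Xi);
+#
+#   static size_t encrypt_bytes(const unsigned char *in, size_t nbytes,
+#                               unsigned char *out, const void *key,
+#                               unsigned char ivec[16], uint64_t *Xi)
+#   {
+#       /* len is a bit count; the return value is bytes processed */
+#       return aes_gcm_enc_192_kernel(in, nbytes * 8, out, key, ivec, Xi);
+#   }
+#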
+ mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev $ctr96_b64x, $ctr96_b64x + rev $ctr96_t32x, $ctr96_t32x +#endif + ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12 +#ifdef __AARCH64EB__ + ror $rk12_l, $rk12_l, #32 + ror $rk12_h, $rk12_h, #32 +#endif + ld1 {$rk0s}, [$cc], #16 @ load rk0 + + ld1 {$rk1s}, [$cc], #16 @ load rk1 + + ld1 {$rk2s}, [$cc], #16 @ load rk2 + + lsr $rctr32x, $ctr96_t32x, #32 + ld1 {$rk3s}, [$cc], #16 @ load rk3 + orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w + + ld1 {$rk4s}, [$cc], #16 @ load rk4 + rev $rctr32w, $rctr32w @ rev_ctr32 + + add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 + fmov $ctr3d, $ctr96_b64x @ CTR block 3 + + rev $ctr32w, $rctr32w @ CTR block 1 + add $rctr32w, $rctr32w, #1 @ CTR block 1 + fmov $ctr1d, $ctr96_b64x @ CTR block 1 + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 + ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible + + fmov $ctr1.d[1], $ctr32x @ CTR block 1 + rev $ctr32w, $rctr32w @ CTR block 2 + add $rctr32w, $rctr32w, #1 @ CTR block 2 + + fmov $ctr2d, $ctr96_b64x @ CTR block 2 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 + + fmov $ctr2.d[1], $ctr32x @ CTR block 2 + rev $ctr32w, $rctr32w @ CTR block 3 + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 + ld1 {$rk5s}, [$cc], #16 @ load rk5 + + fmov $ctr3.d[1], $ctr32x @ CTR block 3 + + ld1 {$rk6s}, [$cc], #16 @ load rk6 + + ld1 {$rk7s}, [$cc], #16 @ load rk7 + + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 + ld1 { $acc_lb}, [$current_tag] + ext $acc_lb, $acc_lb, $acc_lb, #8 + rev64 $acc_lb, $acc_lb + + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 + ld1 {$rk8s}, [$cc], #16 @ load rk8 + + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 + ldr $h4q, [$current_tag, #112] @ load h4l | h4h +#ifndef __AARCH64EB__ + ext $h4b, $h4b, $h4b, #8 +#endif + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 + ld1 {$rk9s}, [$cc], #16 @ load rk9 + + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 + ld1 {$rk10s}, [$cc], #16 @ load rk10 + + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 + ldr $h1q, [$current_tag, #32] @ load h1l | h1h +#ifndef __AARCH64EB__ + ext $h1b, $h1b, $h1b, #8 +#endif + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 + ld1 {$rk11s}, [$cc], #16 @ load rk11 + + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 + ldr $h3q, [$current_tag, #80] @ load h3l | h3h +#ifndef __AARCH64EB__ + ext $h3b, $h3b, $h3b, #8 +#endif + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 + + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 + + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 + + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 + trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h + + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 + + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 + trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l + + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 + + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 + + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 
+ + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 + + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 + + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 + + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 + + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 + + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 + + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 + + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 + + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 + ldr $h2q, [$current_tag, #64] @ load h2l | h2h +#ifndef __AARCH64EB__ + ext $h2b, $h2b, $h2b, #8 +#endif + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 + + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 + + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 + + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 + trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l + + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 + + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 + + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 + trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h + + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 + + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 + + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 + + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 + + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 + + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 + + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 + + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 + + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10 + + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 + lsr $main_end_input_ptr, $bit_length, #3 @ byte_len + mov $len, $main_end_input_ptr + + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 + sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 + + eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k + and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k + + aese $ctr2b, $rk11 @ AES block 2 - round 11 + add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr + add $main_end_input_ptr, $main_end_input_ptr, $input_ptr + + aese $ctr1b, $rk11 @ AES block 1 - round 11 + cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks + + aese $ctr0b, $rk11 @ AES block 0 - round 11 + add $rctr32w, $rctr32w, #1 @ CTR block 3 + + aese $ctr3b, $rk11 @ AES block 3 - round 11 + b.ge .L192_enc_tail @ handle tail + + rev $ctr32w, $rctr32w @ CTR block 4 + ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 + ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l2, $input_l2 + rev $input_h2, $input_h2 +#endif + ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l3, $input_l3 + rev $input_h3, $input_h3 +#endif + 
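+#
+# The trn1/trn2/eor sequence above builds the Karatsuba key terms: for
+# each hash-key power H^i = (Hi_hi : Hi_lo) it stores Hi_hi ^ Hi_lo, so
+# the per-block mid product (Xh^Xl)*(Hh^Hl) needs no extra EORs on the
+# key side. h34k packs the terms for H^4 and H^3, h12k those for H^2 and
+# H^1. Sketch, reusing the u128x/clmul64 helpers sketched earlier (names
+# are placeholders):
+#
+#   static u128x clmul64(uint64_t a, uint64_t b);   /* as sketched above */
+#
+#   static uint64_t karatsuba_key(u128x h) { return h.hi ^ h.lo; }
+#
+#   /* one GHASH block against H^4, matching a pmull/pmull2/pmull trio */
+#   /* u128x lo  = clmul64(x.lo, h4.lo);                  pmull  - low  */
+#   /* u128x hi  = clmul64(x.hi, h4.hi);                  pmull2 - high */
+#   /* u128x mid = clmul64(x.lo ^ x.hi, karatsuba_key(h4));       - mid */
+#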
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l1, $input_l1 + rev $input_h1, $input_h1 +#endif + add $input_ptr, $input_ptr, #64 @ AES input_ptr update + cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks + + eor $input_l0, $input_l0, $rk12_l @ AES block 0 - round 12 low + + eor $input_h0, $input_h0, $rk12_h @ AES block 0 - round 12 high + eor $input_h2, $input_h2, $rk12_h @ AES block 2 - round 12 high + fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low + + eor $input_h3, $input_h3, $rk12_h @ AES block 3 - round 12 high + fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high + + eor $input_l2, $input_l2, $rk12_l @ AES block 2 - round 12 low + eor $input_l1, $input_l1, $rk12_l @ AES block 1 - round 12 low + + fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low + eor $input_h1, $input_h1, $rk12_h @ AES block 1 - round 12 high + + fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high + + eor $input_l3, $input_l3, $rk12_l @ AES block 3 - round 12 low + fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low + + add $rctr32w, $rctr32w, #1 @ CTR block 4 + eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result + fmov $ctr0d, $ctr96_b64x @ CTR block 4 + + fmov $ctr0.d[1], $ctr32x @ CTR block 4 + rev $ctr32w, $rctr32w @ CTR block 5 + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 + add $rctr32w, $rctr32w, #1 @ CTR block 5 + + fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low + st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result + + fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high + + eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result + fmov $ctr1d, $ctr96_b64x @ CTR block 5 + st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result + + fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high + + fmov $ctr1.d[1], $ctr32x @ CTR block 5 + rev $ctr32w, $rctr32w @ CTR block 6 + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 + + add $rctr32w, $rctr32w, #1 @ CTR block 6 + eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result + fmov $ctr2d, $ctr96_b64x @ CTR block 6 + + fmov $ctr2.d[1], $ctr32x @ CTR block 6 + rev $ctr32w, $rctr32w @ CTR block 7 + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7 + st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result + + eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result + st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result + b.ge .L192_enc_prepretail @ do prepretail + + .L192_enc_main_loop: @ main loop start + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 + rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) + + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 + ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l1, $input_l1 + rev $input_h1, $input_h1 +#endif + ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 + fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 + rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) + + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 + fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 + + pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high + rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) + ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l2, $input_l2 + rev $input_h2, $input_h2 +#endif + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 + ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES 
block 4k+3 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l3, $input_l3 + rev $input_h3, $input_h3 +#endif + pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low + eor $res0b, $res0b, $acc_lb @ PRE 1 + + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 + + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 + rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) + + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 + eor $input_h3, $input_h3, $rk12_h @ AES block 4k+3 - round 12 high + + pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low + mov $t0d, $res0.d[1] @ GHASH block 4k - mid + + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 + + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 + eor $input_l2, $input_l2, $rk12_l @ AES block 4k+6 - round 12 low + + eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid + eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low + + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 + eor $input_l1, $input_l1, $rk12_l @ AES block 4k+5 - round 12 low + + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 + mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid + + pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high + mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid + + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 + + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 + + mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid + eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high + + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 + eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid + + pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high + + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 + eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid + + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 + + pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high + eor $input_h1, $input_h1, $rk12_h @ AES block 4k+5 - round 12 high + ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid + + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 + add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 + + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 + eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high + + pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid + eor $input_h2, $input_h2, $rk12_h @ AES block 4k+6 - round 12 high + + pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid + eor $input_l3, $input_l3, $rk12_l @ AES block 4k+3 - round 12 low + mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid + + pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid + rev $ctr32w, $rctr32w @ CTR block 4k+8 + + pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 + + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 + eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid + + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 + ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 + eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low + + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 + add 
$input_ptr, $input_ptr, #64 @ AES input_ptr update + + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 + movi $mod_constant.8b, #0xc2 + + pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low + eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high + eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid + + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 + eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low + + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 + shl $mod_constantd, $mod_constantd, #56 @ mod_constant + + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 + eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high + + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 + fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low + + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 + eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid + + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 + fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high + + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 + eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low + + pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid + cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL + fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low + + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 + fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high + + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 + fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low + + eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid + eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up + add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 + + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 + fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high + + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid + ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment + fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low + + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 + + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 + eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up + + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 + + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 + + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 + + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 + eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid + + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 + + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 + + aese $ctr0b, $rk11 @ AES block 4k+4 - round 11 + + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 + eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid + + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 + + eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result + fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 + + aese $ctr1b, $rk11 @ AES block 4k+5 - round 11 + fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 + rev $ctr32w, $rctr32w @ CTR block 4k+9 + + pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low + fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high + st1 { $res0b}, 
[$output_ptr], #16 @ AES block 4k+4 - store result + + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 + + eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result + add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 + fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 + + aese $ctr2b, $rk11 @ AES block 4k+6 - round 11 + fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 + rev $ctr32w, $rctr32w @ CTR block 4k+10 + + add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 + ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 + + st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result + eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low + + aese $ctr3b, $rk11 @ AES block 4k+7 - round 11 + eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result + fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10 + + st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result + fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10 + rev $ctr32w, $rctr32w @ CTR block 4k+11 + + eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11 + + eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result + st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result + b.lt .L192_enc_main_loop + + .L192_enc_prepretail: @ PREPRETAIL + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 + rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) + + fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 + ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 + add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 + + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 + rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) + + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 + + fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 + eor $res0b, $res0b, $acc_lb @ PRE 1 + mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid + + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 + rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) + + pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high + + pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low + mov $t0d, $res0.d[1] @ GHASH block 4k - mid + + pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low + rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) + + pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high + + eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid + mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid + + eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low + mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid + + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 + eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high + + pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high + + eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid + eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid + + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 + + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 + eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high + + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 + + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 + mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid + + pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high + ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid + + aese $ctr0b, $rk2 
\n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 + + pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid + eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid + + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 + + pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid + + pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid + + pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid + eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high + + pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low + + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 + eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid + + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 + + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 + eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low + + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 + + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 + eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid + + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 + + pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low + movi $mod_constant.8b, #0xc2 + + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 + + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 + + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 + eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid + + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 + + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 + + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 + eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low + + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 + + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 + eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up + + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 + + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 + shl $mod_constantd, $mod_constantd, #56 @ mod_constant + + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 + + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 + eor $acc_mb, $acc_mb, $acc_lb + + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 + + pmull $t1.1q, $acc_h.1d, $mod_constant.1d + + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 + ext $acc_hb, $acc_hb, $acc_hb, #8 + + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 + + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 + eor $acc_mb, $acc_mb, $t1.16b + + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 + + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 + + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 + + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 + eor $acc_mb, $acc_mb, $acc_hb + + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 + + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 + + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 + + pmull $t1.1q, $acc_m.1d, $mod_constant.1d + + ext $acc_mb, $acc_mb, $acc_mb, #8 + + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 + + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 
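+#
+# The two "karatsuba tidy up" EORs above recover the true cross terms:
+# since (Xh^Xl)*(Hh^Hl) = Xh*Hh ^ Xh*Hl ^ Xl*Hh ^ Xl*Hl, XORing the high
+# and low products back into the raw mid leaves exactly Xh*Hl ^ Xl*Hh.
+# Sketch, with u128x as before (illustrative only):
+#
+#   static u128x karatsuba_mid(u128x hi, u128x raw_mid, u128x lo) {
+#       raw_mid.lo ^= hi.lo ^ lo.lo;
+#       raw_mid.hi ^= hi.hi ^ lo.hi;
+#       return raw_mid;   /* then reduced as in the ghash_reduce sketch */
+#   }
+#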
+ + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 + + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 + eor $acc_lb, $acc_lb, $t1.16b + + aese $ctr0b, $rk11 @ AES block 4k+4 - round 11 + + aese $ctr3b, $rk11 @ AES block 4k+7 - round 11 + + aese $ctr2b, $rk11 @ AES block 4k+6 - round 11 + + aese $ctr1b, $rk11 @ AES block 4k+5 - round 11 + eor $acc_lb, $acc_lb, $acc_mb + .L192_enc_tail: @ TAIL + + sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process + ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif + eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low + eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high + + fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low + + fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high + cmp $main_end_input_ptr, #48 + + eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result + + ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag + b.gt .L192_enc_blocks_more_than_3 + + sub $rctr32w, $rctr32w, #1 + movi $acc_m.8b, #0 + + mov $ctr3b, $ctr2b + movi $acc_h.8b, #0 + cmp $main_end_input_ptr, #32 + + mov $ctr2b, $ctr1b + movi $acc_l.8b, #0 + b.gt .L192_enc_blocks_more_than_2 + + sub $rctr32w, $rctr32w, #1 + + mov $ctr3b, $ctr1b + cmp $main_end_input_ptr, #16 + b.gt .L192_enc_blocks_more_than_1 + + sub $rctr32w, $rctr32w, #1 + b .L192_enc_blocks_less_than_1 + .L192_enc_blocks_more_than_3: @ blocks left > 3 + st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result + + ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif + rev64 $res0b, $res1b @ GHASH final-3 block + + eor $input_l0, $input_l0, $rk12_l @ AES final-2 block - round 12 low + eor $res0b, $res0b, $t0.16b @ feed in partial tag + + eor $input_h0, $input_h0, $rk12_h @ AES final-2 block - round 12 high + fmov $res1d, $input_l0 @ AES final-2 block - mov low + + fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high + + mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid + + pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low + + mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid + + eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid + + movi $t0.8b, #0 @ suppress further partial tag feed in + + pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high + + pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid + eor $res1b, $res1b, $ctr1b @ AES final-2 block - result + .L192_enc_blocks_more_than_2: @ blocks left > 2 + + st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result + + rev64 $res0b, $res1b @ GHASH final-2 block + ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif + eor $res0b, $res0b, $t0.16b @ feed in partial tag + + eor $input_h0, $input_h0, $rk12_h @ AES final-1 block - round 12 high + + pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high + mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid + + pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low + eor $input_l0, $input_l0, $rk12_l @ AES final-1 block - round 12 low + + fmov $res1d, $input_l0 @ AES final-1 block - mov low + + fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high + 
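+#
+# The tail paths above hash the remaining 1..4 blocks against descending
+# key powers (final-3 uses H^4, final-2 H^3, final-1 H^2, the last block
+# H^1) and feed the carried tag into the first tail block only: t0 holds
+# the rotated accumulator and is zeroed after first use ("suppress
+# further partial tag feed in"). Schematic C, names hypothetical:
+#
+#   static void feed_partial_tag(u128x *blocks, int n, u128x tag) {
+#       u128x t0 = tag;                 /* prepare final partial tag */
+#       for (int i = 0; i < n; i++) {
+#           blocks[i].lo ^= t0.lo;      /* eor res0b, res0b, t0.16b  */
+#           blocks[i].hi ^= t0.hi;
+#           t0.lo = 0; t0.hi = 0;       /* movi t0.8b, #0            */
+#       }
+#   }
+#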
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high + eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid + + eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low + + pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid + + movi $t0.8b, #0 @ suppress further partial tag feed in + + eor $res1b, $res1b, $ctr2b @ AES final-1 block - result + + eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid + .L192_enc_blocks_more_than_1: @ blocks left > 1 + + st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result + + ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif + rev64 $res0b, $res1b @ GHASH final-1 block + + eor $input_l0, $input_l0, $rk12_l @ AES final block - round 12 low + eor $res0b, $res0b, $t0.16b @ feed in partial tag + movi $t0.8b, #0 @ suppress further partial tag feed in + + mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid + + eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid + eor $input_h0, $input_h0, $rk12_h @ AES final block - round 12 high + fmov $res1d, $input_l0 @ AES final block - mov low + + pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high + fmov $res1.d[1], $input_h0 @ AES final block - mov high + + ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid + + eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high + + pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low + + pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid + + eor $res1b, $res1b, $ctr3b @ AES final block - result + + eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low + + eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid + .L192_enc_blocks_less_than_1: @ blocks left <= 1 + + ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored +#ifndef __AARCH64EB__ + rev $ctr32w, $rctr32w +#else + mov $ctr32w, $rctr32w +#endif + and $bit_length, $bit_length, #127 @ bit_length %= 128 + + sub $bit_length, $bit_length, #128 @ bit_length -= 128 + mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff + + neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) + mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff + + and $bit_length, $bit_length, #127 @ bit_length %= 128 + + lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block + cmp $bit_length, #64 + + csel $input_l0, $rk12_l, $rk12_h, lt + csel $input_h0, $rk12_h, xzr, lt + + fmov $ctr0d, $input_l0 @ ctr0b is mask for last block + + fmov $ctr0.d[1], $input_h0 + + and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits + + rev64 $res0b, $res1b @ GHASH final block + + eor $res0b, $res0b, $t0.16b @ feed in partial tag + + mov $t0d, $res0.d[1] @ GHASH final block - mid + + pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low + + pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high + + eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid + + eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low + + eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high + + pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid + + eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid + movi $mod_constant.8b, #0xc2 + + eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up + + shl $mod_constantd, $mod_constantd, #56 @ mod_constant + + bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing + + eor $acc_mb, $acc_mb, $t9.16b @ 
MODULO - karatsuba tidy up + + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid + + ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment + + eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid + + eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid + + pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low + + ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment + + eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low + str $ctr32w, [$counter, #12] @ store the updated counter + + st1 { $res1b}, [$output_ptr] @ store all 16B + + eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low + ext $acc_lb, $acc_lb, $acc_lb, #8 + rev64 $acc_lb, $acc_lb + mov x0, $len + st1 { $acc_l.16b }, [$current_tag] + + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp x19, x20, [sp], #112 + ret + +.L192_enc_ret: + mov w0, #0x0 + ret +.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel +___ + +######################################################################################### +# size_t aes_gcm_dec_192_kernel(const unsigned char *in, +# size_t len, +# unsigned char *out, +# const void *key, +# unsigned char ivec[16], +# u64 *Xi); +# +$code.=<<___; +.global aes_gcm_dec_192_kernel +.type aes_gcm_dec_192_kernel,%function +.align 4 +aes_gcm_dec_192_kernel: + cbz x1, .L192_dec_ret + stp x19, x20, [sp, #-112]! + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr + ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev $ctr96_b64x, $ctr96_b64x + rev $ctr96_t32x, $ctr96_t32x +#endif + ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12 +#ifdef __AARCH64EB__ + ror $rk12_l, $rk12_l, #32 + ror $rk12_h, $rk12_h, #32 +#endif + ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible + + ld1 {$rk0s}, [$cc], #16 @ load rk0 + + lsr $main_end_input_ptr, $bit_length, #3 @ byte_len + mov $len, $main_end_input_ptr + ld1 {$rk1s}, [$cc], #16 @ load rk1 + + lsr $rctr32x, $ctr96_t32x, #32 + orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w + fmov $ctr3d, $ctr96_b64x @ CTR block 3 + + rev $rctr32w, $rctr32w @ rev_ctr32 + fmov $ctr1d, $ctr96_b64x @ CTR block 1 + + add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 + ld1 {$rk2s}, [$cc], #16 @ load rk2 + + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 + rev $ctr32w, $rctr32w @ CTR block 1 + + add $rctr32w, $rctr32w, #1 @ CTR block 1 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 + ld1 {$rk3s}, [$cc], #16 @ load rk3 + + fmov $ctr1.d[1], $ctr32x @ CTR block 1 + rev $ctr32w, $rctr32w @ CTR block 2 + add $rctr32w, $rctr32w, #1 @ CTR block 2 + + fmov $ctr2d, $ctr96_b64x @ CTR block 2 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 + + fmov $ctr2.d[1], $ctr32x @ CTR block 2 + rev $ctr32w, $rctr32w @ CTR block 3 + + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 + + fmov $ctr3.d[1], $ctr32x @ CTR block 3 + + ld1 {$rk4s}, [$cc], #16 @ load rk4 + + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 + + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 + ld1 {$rk5s}, 
[$cc], #16 @ load rk5 + + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 + ldr $h4q, [$current_tag, #112] @ load h4l | h4h +#ifndef __AARCH64EB__ + ext $h4b, $h4b, $h4b, #8 +#endif + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 + ldr $h2q, [$current_tag, #64] @ load h2l | h2h +#ifndef __AARCH64EB__ + ext $h2b, $h2b, $h2b, #8 +#endif + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 + ldr $h3q, [$current_tag, #80] @ load h3l | h3h +#ifndef __AARCH64EB__ + ext $h3b, $h3b, $h3b, #8 +#endif + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 + + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 + ldr $h1q, [$current_tag, #32] @ load h1l | h1h +#ifndef __AARCH64EB__ + ext $h1b, $h1b, $h1b, #8 +#endif + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 + ld1 {$rk6s}, [$cc], #16 @ load rk6 + + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 + ld1 {$rk7s}, [$cc], #16 @ load rk7 + + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 + ld1 {$rk8s}, [$cc], #16 @ load rk8 + + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 + ld1 {$rk9s}, [$cc], #16 @ load rk9 + + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 + ld1 { $acc_lb}, [$current_tag] + ext $acc_lb, $acc_lb, $acc_lb, #8 + rev64 $acc_lb, $acc_lb + + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 + add $rctr32w, $rctr32w, #1 @ CTR block 3 + + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 + trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h + + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 + ld1 {$rk10s}, [$cc], #16 @ load rk10 + + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 + trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l + + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 + + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 + trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l + + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 + ld1 {$rk11s}, [$cc], #16 @ load rk11 + + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 + + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 + + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 + + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 + + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 + + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 + + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 + + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 + + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 + + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 + + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 + + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 + + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 + + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 + + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 + + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 + sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 + + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 + and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + aese 
$ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 + add $main_end_input_ptr, $main_end_input_ptr, $input_ptr + + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 + cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks + + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 + trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h + + aese $ctr3b, $rk11 @ AES block 3 - round 11 + + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10 + + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 + + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 + eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k + + aese $ctr2b, $rk11 @ AES block 2 - round 11 + + aese $ctr1b, $rk11 @ AES block 1 - round 11 + eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k + + aese $ctr0b, $rk11 @ AES block 0 - round 11 + b.ge .L192_dec_tail @ handle tail + + ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0,1 - load ciphertext + + eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result + + eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result + rev $ctr32w, $rctr32w @ CTR block 4 + ld1 {$res2b, $res3b}, [$input_ptr], #32 @ AES block 2,3 - load ciphertext + + mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low + + mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high + + mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 + add $rctr32w, $rctr32w, #1 @ CTR block 4 + + mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high + rev64 $res0b, $res0b @ GHASH block 0 + + fmov $ctr0d, $ctr96_b64x @ CTR block 4 + rev64 $res1b, $res1b @ GHASH block 1 + cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks + + eor $output_l1, $output_l1, $rk12_l @ AES block 1 - round 12 low +#ifdef __AARCH64EB__ + rev $output_l1, $output_l1 +#endif + fmov $ctr0.d[1], $ctr32x @ CTR block 4 + rev $ctr32w, $rctr32w @ CTR block 5 + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 + fmov $ctr1d, $ctr96_b64x @ CTR block 5 + eor $output_h1, $output_h1, $rk12_h @ AES block 1 - round 12 high +#ifdef __AARCH64EB__ + rev $output_h1, $output_h1 +#endif + add $rctr32w, $rctr32w, #1 @ CTR block 5 + fmov $ctr1.d[1], $ctr32x @ CTR block 5 + eor $output_l0, $output_l0, $rk12_l @ AES block 0 - round 12 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif + rev $ctr32w, $rctr32w @ CTR block 6 + eor $output_h0, $output_h0, $rk12_h @ AES block 0 - round 12 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif + stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 + + stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result + + add $rctr32w, $rctr32w, #1 @ CTR block 6 + eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result + b.ge .L192_dec_prepretail @ do prepretail + + .L192_dec_main_loop: @ main loop start + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 + ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 + + pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low + mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low + + mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high + eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result + rev64 $res3b, $res3b @ GHASH block 4k+3 + + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 + fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 + + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 + eor $res0b, $res0b, $acc_lb @ PRE 
1 + + pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high + fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 + + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 + mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high + + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 + mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low + + pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high + fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 + mov $t0d, $res0.d[1] @ GHASH block 4k - mid + + pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low + mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid + rev $ctr32w, $rctr32w @ CTR block 4k+7 + + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 + + fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 + eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid + mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid + + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 + + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 + eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high +#ifdef __AARCH64EB__ + rev $output_h2, $output_h2 +#endif + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 + eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid + + pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid + + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 + rev64 $res2b, $res2b @ GHASH block 4k+2 + + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 + + pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid + eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low + eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low +#ifdef __AARCH64EB__ + rev $output_l2, $output_l2 +#endif + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 + + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 + + eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid + mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid + + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 + eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high + + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 + + pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high + eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid + + pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low + + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 + + eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high + mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid + + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 + + pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high + + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 + eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid + + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 + + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 + ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid + + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 + + pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid + eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low + + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 + + pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid + eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high + + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b 
@ AES block 4k+5 - round 7 + + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 + movi $mod_constant.8b, #0xc2 + + pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low + + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 + eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid + + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 + + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 + eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low + + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 + + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 + eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid + + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 + + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 + eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up + + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 + + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 + shl $mod_constantd, $mod_constantd, #56 @ mod_constant + + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 + + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 + ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext + + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 + eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up + + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid + ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext + eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low +#ifdef __AARCH64EB__ + rev $output_l3, $output_l3 +#endif + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 + ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment + + aese $ctr0b, $rk11 @ AES block 4k+4 - round 11 + add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 + + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 + eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid + + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 + ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext + + aese $ctr1b, $rk11 @ AES block 4k+5 - round 11 + ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+7 - load ciphertext + rev $ctr32w, $rctr32w @ CTR block 4k+8 + + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 + stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result + + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 + eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid + + cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL + + eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result + eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high +#ifdef __AARCH64EB__ + rev $output_h3, $output_h3 +#endif + eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result + + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 + + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 + + pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low + mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low + + mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low + stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result + rev64 $res1b, $res1b @ GHASH block 
4k+5 + + aese $ctr2b, $rk11 @ AES block 4k+6 - round 11 + mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high + + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 + mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high + + fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 + add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 + ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment + + eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result + fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 + rev $ctr32w, $rctr32w @ CTR block 4k+9 + + eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 + eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low + + fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 + add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 + eor $output_l1, $output_l1, $rk12_l @ AES block 4k+5 - round 12 low +#ifdef __AARCH64EB__ + rev $output_l1, $output_l1 +#endif + fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 + rev $ctr32w, $rctr32w @ CTR block 4k+10 + eor $output_h1, $output_h1, $rk12_h @ AES block 4k+5 - round 12 high +#ifdef __AARCH64EB__ + rev $output_h1, $output_h1 +#endif + eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif + stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result + eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low + + add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 + rev64 $res0b, $res0b @ GHASH block 4k+4 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 + + aese $ctr3b, $rk11 @ AES block 4k+7 - round 11 + stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result + b.lt .L192_dec_main_loop + + .L192_dec_prepretail: @ PREPRETAIL + mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high + ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 + eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result + + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 + mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low + + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 + mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid + + eor $res0b, $res0b, $acc_lb @ PRE 1 + fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 + + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 + mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low + + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 + mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high + + pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low + mov $t0d, $res0.d[1] @ GHASH block 4k - mid + fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 + + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 + rev64 $res2b, $res2b @ GHASH block 4k+2 + + pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high + fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 + rev $ctr32w, $rctr32w @ CTR block 4k+7 + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 + eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid + mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid + + pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low + eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high +#ifdef __AARCH64EB__ + rev $output_h3, $output_h3 +#endif + fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 + + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 + eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 
- round 12 low +#ifdef __AARCH64EB__ + rev $output_l2, $output_l2 +#endif + pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high + eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high +#ifdef __AARCH64EB__ + rev $output_h2, $output_h2 +#endif + eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid + + pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid + eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low +#ifdef __AARCH64EB__ + rev $output_l3, $output_l3 +#endif + stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result + + rev64 $res3b, $res3b @ GHASH block 4k+3 + stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result + + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 + eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high + + pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid + add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 + + pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high + eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low + + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 + + eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid + mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid + + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 + + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 + eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high + + eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid + + pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low + + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 + mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid + + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 + ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid + + pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low + + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 + eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid + + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 + + pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid + eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low + + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 + + pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high + movi $mod_constant.8b, #0xc2 + + pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid + + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 + + shl $mod_constantd, $mod_constantd, #56 @ mod_constant + eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high + + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 + eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid + + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 + + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid + eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low + + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 + + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 + eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid + + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 + + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 + eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up + + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 + + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 + ext $acc_hb, $acc_hb, $acc_hb, #8 @ 
MODULO - other top alignment + + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 + + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 + eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up + + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 + + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 + + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 + + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 + + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 + eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid + + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 + + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 + + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 + + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 + eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid + + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 + + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 + + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 + + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 + + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 + + pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low + + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 + + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 + + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 + ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment + + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 + + aese $ctr0b, $rk11 + eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low + + aese $ctr2b, $rk11 + + aese $ctr1b, $rk11 + + aese $ctr3b, $rk11 + + eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low + .L192_dec_tail: @ TAIL + + sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process + ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext + + eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result + + mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high + + mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low + + ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag + + cmp $main_end_input_ptr, #48 + + eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif + eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif + b.gt .L192_dec_blocks_more_than_3 + + movi $acc_l.8b, #0 + movi $acc_h.8b, #0 + + mov $ctr3b, $ctr2b + mov $ctr2b, $ctr1b + sub $rctr32w, $rctr32w, #1 + + movi $acc_m.8b, #0 + cmp $main_end_input_ptr, #32 + b.gt .L192_dec_blocks_more_than_2 + + mov $ctr3b, $ctr1b + cmp $main_end_input_ptr, #16 + sub $rctr32w, $rctr32w, #1 + + b.gt .L192_dec_blocks_more_than_1 + + sub $rctr32w, $rctr32w, #1 + b .L192_dec_blocks_less_than_1 + .L192_dec_blocks_more_than_3: @ blocks left > 3 + rev64 $res0b, $res1b @ GHASH final-3 block + ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext + + stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result + + eor $res0b, $res0b, $t0.16b @ feed in partial tag + + eor $ctr0b, $res1b, 
$ctr1b @ AES final-2 block - result + + pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low + mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low + mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid + + mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high + + mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid + eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid + + pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high + + eor $output_l0, $output_l0, $rk12_l @ AES final-2 block - round 12 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif + movi $t0.8b, #0 @ suppress further partial tag feed in + + pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid + eor $output_h0, $output_h0, $rk12_h @ AES final-2 block - round 12 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif + .L192_dec_blocks_more_than_2: @ blocks left > 2 + + rev64 $res0b, $res1b @ GHASH final-2 block + ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext + + eor $res0b, $res0b, $t0.16b @ feed in partial tag + + movi $t0.8b, #0 @ suppress further partial tag feed in + + eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result + + mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid + + pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low + + stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result + + eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid + mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high + + eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low + mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low + + pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high + + pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid + + eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high + eor $output_h0, $output_h0, $rk12_h @ AES final-1 block - round 12 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif + eor $output_l0, $output_l0, $rk12_l @ AES final-1 block - round 12 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif + eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid + .L192_dec_blocks_more_than_1: @ blocks left > 1 + + rev64 $res0b, $res1b @ GHASH final-1 block + + eor $res0b, $res0b, $t0.16b @ feed in partial tag + ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext + + mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid + + pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high + + eor $ctr0b, $res1b, $ctr3b @ AES final block - result + stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result + + eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid + + eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high + + pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low + mov $output_h0, $ctr0.d[1] @ AES final block - mov high + + ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid + mov $output_l0, $ctr0.d[0] @ AES final block - mov low + + pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid + + movi $t0.8b, #0 @ suppress further partial tag feed in + eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low + eor $output_h0, $output_h0, $rk12_h @ AES final block - round 12 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif + eor $output_l0, $output_l0, $rk12_l @ AES final block - round 12 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif + eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid + .L192_dec_blocks_less_than_1: @ 
blocks left <= 1 + + mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff + ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite + and $bit_length, $bit_length, #127 @ bit_length %= 128 + + sub $bit_length, $bit_length, #128 @ bit_length -= 128 + + neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) + + and $bit_length, $bit_length, #127 @ bit_length %= 128 + mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff + + lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block + cmp $bit_length, #64 + + csel $ctr32x, $rk12_l, $rk12_h, lt + csel $ctr96_b64x, $rk12_h, xzr, lt + + fmov $ctr0d, $ctr32x @ ctr0b is mask for last block + and $output_l0, $output_l0, $ctr32x + bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes + + orr $output_l0, $output_l0, $end_input_ptr + mov $ctr0.d[1], $ctr96_b64x +#ifndef __AARCH64EB__ + rev $ctr32w, $rctr32w +#else + mov $ctr32w, $rctr32w +#endif + + and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits + str $ctr32w, [$counter, #12] @ store the updated counter + + rev64 $res0b, $res1b @ GHASH final block + + eor $res0b, $res0b, $t0.16b @ feed in partial tag + bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes + + and $output_h0, $output_h0, $ctr96_b64x + + pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high + mov $t0d, $res0.d[1] @ GHASH final block - mid + + pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low + + eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid + + eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high + + pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid + + eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low + + eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid + movi $mod_constant.8b, #0xc2 + + eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up + + shl $mod_constantd, $mod_constantd, #56 @ mod_constant + + eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up + + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid + orr $output_h0, $output_h0, $main_end_input_ptr + stp $output_l0, $output_h0, [$output_ptr] + + ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment + + eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid + + eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid + + pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low + + eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low + + ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment + + eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low + ext $acc_lb, $acc_lb, $acc_lb, #8 + rev64 $acc_lb, $acc_lb + mov x0, $len + st1 { $acc_l.16b }, [$current_tag] + + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp x19, x20, [sp], #112 + ret + +.L192_dec_ret: + mov w0, #0x0 + ret +.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel +___ +} + +{ +my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); +my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24)); +my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24)); +my ($output_l0,$output_h0)=map("x$_",(6..7)); + +my $ctr32w="w9"; +my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk14_l,$rk14_h,$len)=map("x$_",(9..15)); +my 
($ctr96_t32w,$rctr32w)=map("w$_",(11..12)); + +my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7)); +my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7)); +my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7)); +my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7)); + +my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11)); +my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11)); +my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11)); + +my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17)); +my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15)); +my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15)); + +my $t0="v8"; +my $t0d="d8"; +my $t1="v4"; +my $t1d="d4"; +my $t2="v8"; +my $t2d="d8"; +my $t3="v4"; +my $t3d="d4"; +my $t4="v4"; +my $t4d="d4"; +my $t5="v5"; +my $t5d="d5"; +my $t6="v8"; +my $t6d="d8"; +my $t7="v5"; +my $t7d="d5"; +my $t8="v6"; +my $t8d="d6"; +my $t9="v4"; +my $t9d="d4"; + +my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7)); +my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7)); +my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7)); + +my $mod_constantd="d8"; +my $mod_constant="v8"; +my $mod_t="v7"; + +my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map("v$_.16b",(18..31)); +my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s,$rk12s,$rk13s)=map("v$_.4s",(18..31)); +my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map("q$_",(18..31)); +my $rk2q1="v20.1q"; +my $rk3q1="v21.1q"; +my $rk4v="v22"; +my $rk4d="d22"; + +######################################################################################### +# size_t aes_gcm_enc_256_kernel(const unsigned char *in, +# size_t len, +# unsigned char *out, +# const void *key, +# unsigned char ivec[16], +# u64 *Xi); +# +$code.=<<___; +.global aes_gcm_enc_256_kernel +.type aes_gcm_enc_256_kernel,%function +.align 4 +aes_gcm_enc_256_kernel: + cbz x1, .L256_enc_ret + stp x19, x20, [sp, #-112]! 
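+ @ AAPCS64 prologue: the 112-byte frame pushed above holds the
+ @ callee-saved GPRs x19-x24 and the callee-saved low 64 bits of
+ @ v8-v15 (d8-d15); x16 and x8 take copies of the ivec (x4) and
+ @ Xi (x5) arguments, since x4-x7 are remapped as scratch registers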
+ mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr + lsr $main_end_input_ptr, $bit_length, #3 @ byte_len + mov $len, $main_end_input_ptr + ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev $ctr96_b64x, $ctr96_b64x + rev $ctr96_t32x, $ctr96_t32x +#endif + ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14 +#ifdef __AARCH64EB__ + ror $rk14_l, $rk14_l, #32 + ror $rk14_h, $rk14_h, #32 +#endif + ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible + sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 + + ld1 {$rk0s}, [$cc], #16 @ load rk0 + and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + ld1 {$rk1s}, [$cc], #16 @ load rk1 + add $main_end_input_ptr, $main_end_input_ptr, $input_ptr + + lsr $rctr32x, $ctr96_t32x, #32 + fmov $ctr2d, $ctr96_b64x @ CTR block 2 + orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w + + rev $rctr32w, $rctr32w @ rev_ctr32 + cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks + fmov $ctr1d, $ctr96_b64x @ CTR block 1 + + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 + add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 + + rev $ctr32w, $rctr32w @ CTR block 1 + fmov $ctr3d, $ctr96_b64x @ CTR block 3 + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 + add $rctr32w, $rctr32w, #1 @ CTR block 1 + ld1 {$rk2s}, [$cc], #16 @ load rk2 + + fmov $ctr1.d[1], $ctr32x @ CTR block 1 + rev $ctr32w, $rctr32w @ CTR block 2 + add $rctr32w, $rctr32w, #1 @ CTR block 2 + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 + ld1 {$rk3s}, [$cc], #16 @ load rk3 + + fmov $ctr2.d[1], $ctr32x @ CTR block 2 + rev $ctr32w, $rctr32w @ CTR block 3 + + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 + + fmov $ctr3.d[1], $ctr32x @ CTR block 3 + + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 + ld1 {$rk4s}, [$cc], #16 @ load rk4 + + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 + ld1 {$rk5s}, [$cc], #16 @ load rk5 + + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 + ld1 {$rk6s}, [$cc], #16 @ load rk6 + + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 + ldr $h3q, [$current_tag, #80] @ load h3l | h3h +#ifndef __AARCH64EB__ + ext $h3b, $h3b, $h3b, #8 +#endif + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 + ld1 {$rk7s}, [$cc], #16 @ load rk7 + + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 + ld1 {$rk8s}, [$cc], #16 @ load rk8 + + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 + ldr $h2q, [$current_tag, #64] @ load h2l | h2h +#ifndef __AARCH64EB__ + ext $h2b, $h2b, $h2b, #8 +#endif + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 + ld1 {$rk9s}, [$cc], #16 @ load rk9 + + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 + ldr $h4q, [$current_tag, #112] @ load h4l | h4h +#ifndef __AARCH64EB__ + ext $h4b, $h4b, $h4b, #8 +#endif + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 + ld1 {$rk10s}, [$cc], #16 @ load rk10 + + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 + 
ld1 {$rk11s}, [$cc], #16 @ load rk11 + + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 + add $rctr32w, $rctr32w, #1 @ CTR block 3 + + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 + + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 + ld1 { $acc_lb}, [$current_tag] + ext $acc_lb, $acc_lb, $acc_lb, #8 + rev64 $acc_lb, $acc_lb + + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 + + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 + + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 + + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 + + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 + + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 + + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 + + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 + + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 + trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l + + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 + ld1 {$rk12s}, [$cc], #16 @ load rk12 + + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 + ldr $h1q, [$current_tag, #32] @ load h1l | h1h +#ifndef __AARCH64EB__ + ext $h1b, $h1b, $h1b, #8 +#endif + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 + ld1 {$rk13s}, [$cc], #16 @ load rk13 + + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 + trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h + + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 + + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 + + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 + trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l + + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 + + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 + + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 + + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 + + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 + + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 + + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 + + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 + + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 + + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10 + + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 + + aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11 + + aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11 + + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 + + aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12 + + aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12 + + aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11 + eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k + + aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11 + + aese $ctr2b, $rk13 @ AES block 2 - round 13 + trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h + + aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12 + + aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12 + + aese $ctr1b, $rk13 @ AES block 1 - round 13 + + aese $ctr0b, $rk13 @ AES block 0 - round 13 + + aese $ctr3b, $rk13 @ AES block 3 - round 13 + eor $h12k.16b, 
$h12k.16b, $t0.16b @ h2k | h1k + b.ge .L256_enc_tail @ handle tail + + ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l1, $input_l1 + rev $input_h1, $input_h1 +#endif + rev $ctr32w, $rctr32w @ CTR block 4 + ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif + ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l3, $input_l3 + rev $input_h3, $input_h3 +#endif + ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l2, $input_l2 + rev $input_h2, $input_h2 +#endif + add $input_ptr, $input_ptr, #64 @ AES input_ptr update + + eor $input_l1, $input_l1, $rk14_l @ AES block 1 - round 14 low + eor $input_h1, $input_h1, $rk14_h @ AES block 1 - round 14 high + + fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low + eor $input_l0, $input_l0, $rk14_l @ AES block 0 - round 14 low + + eor $input_h0, $input_h0, $rk14_h @ AES block 0 - round 14 high + eor $input_h3, $input_h3, $rk14_h @ AES block 3 - round 14 high + fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low + + cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks + fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high + eor $input_l3, $input_l3, $rk14_l @ AES block 3 - round 14 low + + eor $input_l2, $input_l2, $rk14_l @ AES block 2 - round 14 low + fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high + + fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low + add $rctr32w, $rctr32w, #1 @ CTR block 4 + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 + fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low + eor $input_h2, $input_h2, $rk14_h @ AES block 2 - round 14 high + + fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high + + eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result + fmov $ctr0d, $ctr96_b64x @ CTR block 4 + + fmov $ctr0.d[1], $ctr32x @ CTR block 4 + rev $ctr32w, $rctr32w @ CTR block 5 + add $rctr32w, $rctr32w, #1 @ CTR block 5 + + eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result + fmov $ctr1d, $ctr96_b64x @ CTR block 5 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 + + fmov $ctr1.d[1], $ctr32x @ CTR block 5 + rev $ctr32w, $rctr32w @ CTR block 6 + st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result + + fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 + eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result + + st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result + + add $rctr32w, $rctr32w, #1 @ CTR block 6 + fmov $ctr2d, $ctr96_b64x @ CTR block 6 + + fmov $ctr2.d[1], $ctr32x @ CTR block 6 + st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result + rev $ctr32w, $rctr32w @ CTR block 7 + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7 + + eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result + st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result + b.ge L256_enc_prepretail @ do prepretail + + .L256_enc_main_loop: @ main loop start + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 + rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) + + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 + fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 + + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 + ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 + + aese $ctr0b, $rk1 \n aesmc $ctr0b, 
$ctr0b @ AES block 4k+4 - round 1 + fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 + + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 + ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+7 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l3, $input_l3 + rev $input_h3, $input_h3 +#endif + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 + ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l2, $input_l2 + rev $input_h2, $input_h2 +#endif + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 + eor $res0b, $res0b, $acc_lb @ PRE 1 + + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 + + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 + eor $input_l3, $input_l3, $rk14_l @ AES block 4k+7 - round 14 low + + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 + mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid + + pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high + eor $input_h2, $input_h2, $rk14_h @ AES block 4k+6 - round 14 high + mov $t0d, $res0.d[1] @ GHASH block 4k - mid + + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 + rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) + + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 + + pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low + eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid + + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 + + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 + rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) + + pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high + + pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid + rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) + + pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low + + eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high + mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid + + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 + + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 + eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low + + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 + + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 + mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid + + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 + eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid + + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 + + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 + eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid + + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 + + pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid + + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 + + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 + ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid + + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 + + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 + + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 + + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 + eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid + + pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high + + pmull $t5.1q, 
$res2.1d, $h2.1d @ GHASH block 4k+2 - low + + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 + + pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low + eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high + + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 + ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l1, $input_l1 + rev $input_h1, $input_h1 +#endif + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 + mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid + + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 + eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low + + pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid + + pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high + eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid + + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 + eor $input_l1, $input_l1, $rk14_l @ AES block 4k+5 - round 14 low + + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 + eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid + + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 + eor $input_l2, $input_l2, $rk14_l @ AES block 4k+6 - round 14 low + + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 + movi $mod_constant.8b, #0xc2 + + pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid + eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high + fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low + + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 + ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 + shl $mod_constantd, $mod_constantd, #56 @ mod_constant + + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 + eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low + + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 + + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 + eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid + + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 + add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 + + aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11 + eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up + + aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11 + add $input_ptr, $input_ptr, #64 @ AES input_ptr update + + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid + rev $ctr32w, $rctr32w @ CTR block 4k+8 + ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment + + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 + eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low + + aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12 + eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up + + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 + eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high + + fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 + eor $mod_t.16b, $acc_hb, $mod_t.16b @ MODULO - fold into mid + + aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12 + eor 
$input_h1, $input_h1, $rk14_h @ AES block 4k+5 - round 14 high + + aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11 + eor $input_h3, $input_h3, $rk14_h @ AES block 4k+7 - round 14 high + + aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11 + add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 + + aese $ctr0b, $rk13 @ AES block 4k+4 - round 13 + fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high + eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid + + aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12 + fmov $ctr_t3d, $input_l3 @ AES block 4k+7 - mov low + + aese $ctr1b, $rk13 @ AES block 4k+5 - round 13 + fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high + + fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low + cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL + + fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high + + pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low + eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result + fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 + + fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 + rev $ctr32w, $rctr32w @ CTR block 4k+9 + add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 + + eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result + fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 + + aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12 + fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 + + aese $ctr2b, $rk13 @ AES block 4k+6 - round 13 + rev $ctr32w, $rctr32w @ CTR block 4k+10 + st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 + eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low + fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+7 - mov high + + ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment + st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result + add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 + + aese $ctr3b, $rk13 @ AES block 4k+7 - round 13 + eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result + fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10 + + st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result + fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10 + rev $ctr32w, $rctr32w @ CTR block 4k+11 + + eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11 + + eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+7 - result + st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+7 - store result + b.lt L256_enc_main_loop + + .L256_enc_prepretail: @ PREPRETAIL + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 + rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) + + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 + fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 + + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 + rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) + + fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 + ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 + + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 + + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 + + eor $res0b, $res0b, $acc_lb @ PRE 1 + rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) + + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 + + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 + mov $acc_md, $h34k.d[1] @ 
GHASH block 4k - mid + + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 + + pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low + mov $t0d, $res0.d[1] @ GHASH block 4k - mid + + pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high + + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 + + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 + eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid + + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 + + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 + + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 + + pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid + + pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high + + pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low + + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 + + eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high + mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid + + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 + eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low + + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 + + eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid + mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid + + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 + rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) + + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 + + pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid + eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid + add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 + + pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low + + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 + + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 + eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid + + pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high + + eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low + ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid + + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 + + eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high + mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid + + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 + + pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid + + eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid + + pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high + + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 + + pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid + eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid + + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 + + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 + + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 + + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 + movi $mod_constant.8b, #0xc2 + + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 + + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 + eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high + + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 + + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 + shl $mod_constantd, $mod_constantd, #56 @ mod_constant + + aese $ctr1b, $rk8 \n aesmc 
$ctr1b, $ctr1b @ AES block 4k+5 - round 8 + eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid + + pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low + + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 + + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 + + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 + eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low + + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 + + eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up + + pmull $t1.1q, $acc_h.1d, $mod_constant.1d + ext $acc_hb, $acc_hb, $acc_hb, #8 + + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 + + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 + eor $acc_mb, $acc_mb, $acc_lb + + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 + + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 + + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 + + aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11 + eor $acc_mb, $acc_mb, $t1.16b + + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 + + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 + + aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12 + + aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11 + eor $acc_mb, $acc_mb, $acc_hb + + aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11 + + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 + + aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12 + + pmull $t1.1q, $acc_m.1d, $mod_constant.1d + + aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11 + ext $acc_mb, $acc_mb, $acc_mb, #8 + + aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12 + + aese $ctr1b, $rk13 @ AES block 4k+5 - round 13 + eor $acc_lb, $acc_lb, $t1.16b + + aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12 + + aese $ctr3b, $rk13 @ AES block 4k+7 - round 13 + + aese $ctr0b, $rk13 @ AES block 4k+4 - round 13 + + aese $ctr2b, $rk13 @ AES block 4k+6 - round 13 + eor $acc_lb, $acc_lb, $acc_mb + .L256_enc_tail: @ TAIL + + ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag + sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process + ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif + eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low + eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high + + cmp $main_end_input_ptr, #48 + fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low + + fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high + + eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result + b.gt .L256_enc_blocks_more_than_3 + + cmp $main_end_input_ptr, #32 + mov $ctr3b, $ctr2b + movi $acc_l.8b, #0 + + movi $acc_h.8b, #0 + sub $rctr32w, $rctr32w, #1 + + mov $ctr2b, $ctr1b + movi $acc_m.8b, #0 + b.gt .L256_enc_blocks_more_than_2 + + mov $ctr3b, $ctr1b + sub $rctr32w, $rctr32w, #1 + cmp $main_end_input_ptr, #16 + + b.gt .L256_enc_blocks_more_than_1 + + sub $rctr32w, $rctr32w, #1 + b .L256_enc_blocks_less_than_1 + .L256_enc_blocks_more_than_3: @ blocks left > 3 + st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result + + ldp 
$input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif + rev64 $res0b, $res1b @ GHASH final-3 block + + eor $input_l0, $input_l0, $rk14_l @ AES final-2 block - round 14 low + eor $res0b, $res0b, $t0.16b @ feed in partial tag + + eor $input_h0, $input_h0, $rk14_h @ AES final-2 block - round 14 high + + mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid + fmov $res1d, $input_l0 @ AES final-2 block - mov low + + fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high + + eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid + movi $t0.8b, #0 @ suppress further partial tag feed in + + mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid + + pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low + + pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high + + pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid + eor $res1b, $res1b, $ctr1b @ AES final-2 block - result + .L256_enc_blocks_more_than_2: @ blocks left > 2 + + st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result + + ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif + rev64 $res0b, $res1b @ GHASH final-2 block + + eor $input_l0, $input_l0, $rk14_l @ AES final-1 block - round 14 low + eor $res0b, $res0b, $t0.16b @ feed in partial tag + + fmov $res1d, $input_l0 @ AES final-1 block - mov low + eor $input_h0, $input_h0, $rk14_h @ AES final-1 block - round 14 high + + fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high + + movi $t0.8b, #0 @ suppress further partial tag feed in + + pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high + mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid + + pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low + + eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid + + eor $res1b, $res1b, $ctr2b @ AES final-1 block - result + + eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high + + pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid + + eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low + + eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid + .L256_enc_blocks_more_than_1: @ blocks left > 1 + + st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result + + rev64 $res0b, $res1b @ GHASH final-1 block + + ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif + eor $res0b, $res0b, $t0.16b @ feed in partial tag + + movi $t0.8b, #0 @ suppress further partial tag feed in + + eor $input_l0, $input_l0, $rk14_l @ AES final block - round 14 low + mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid + + pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high + eor $input_h0, $input_h0, $rk14_h @ AES final block - round 14 high + + eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid + + eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high + + ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid + fmov $res1d, $input_l0 @ AES final block - mov low + + fmov $res1.d[1], $input_h0 @ AES final block - mov high + + pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid + + pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low + + eor $res1b, $res1b, $ctr3b @ AES final block - result + eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - 
mid + + eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low + .L256_enc_blocks_less_than_1: @ blocks left <= 1 + + and $bit_length, $bit_length, #127 @ bit_length %= 128 + + mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff + sub $bit_length, $bit_length, #128 @ bit_length -= 128 + + neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) + ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored + + mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff + and $bit_length, $bit_length, #127 @ bit_length %= 128 + + lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block + cmp $bit_length, #64 + + csel $input_l0, $rk14_l, $rk14_h, lt + csel $input_h0, $rk14_h, xzr, lt + + fmov $ctr0d, $input_l0 @ ctr0b is mask for last block + + fmov $ctr0.d[1], $input_h0 + + and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits + + rev64 $res0b, $res1b @ GHASH final block + + eor $res0b, $res0b, $t0.16b @ feed in partial tag + + bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing + + pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high + mov $t0d, $res0.d[1] @ GHASH final block - mid +#ifndef __AARCH64EB__ + rev $ctr32w, $rctr32w +#else + mov $ctr32w, $rctr32w +#endif + + pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low + + eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high + eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid + + pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid + + eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low + + eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid + movi $mod_constant.8b, #0xc2 + + eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up + + shl $mod_constantd, $mod_constantd, #56 @ mod_constant + + eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up + + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid + + ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment + + eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid + + eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid + + pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low + + ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment + + str $ctr32w, [$counter, #12] @ store the updated counter + + st1 { $res1b}, [$output_ptr] @ store all 16B + eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low + + eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low + ext $acc_lb, $acc_lb, $acc_lb, #8 + rev64 $acc_lb, $acc_lb + mov x0, $len + st1 { $acc_l.16b }, [$current_tag] + + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp x19, x20, [sp], #112 + ret + +.L256_enc_ret: + mov w0, #0x0 + ret +.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel +___ + +{ +my $t8="v4"; +my $t8d="d4"; +my $t9="v6"; +my $t9d="d6"; +######################################################################################### +# size_t aes_gcm_dec_256_kernel(const unsigned char *in, +# size_t len, +# unsigned char *out, +# const void *key, +# unsigned char ivec[16], +# u64 *Xi); +# +$code.=<<___; +.global aes_gcm_dec_256_kernel +.type aes_gcm_dec_256_kernel,%function +.align 4 +aes_gcm_dec_256_kernel: + cbz x1, .L256_dec_ret + stp x19, x20, [sp, #-112]! 
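+	@ NOTE (inferred from the surrounding code, as for the encrypt
+	@ kernel above): x1 carries the input length in *bits* - the tail
+	@ handler masks it with #127 - so the cbz above bails out on empty
+	@ input via .L256_dec_ret (returning 0 in w0), while the normal
+	@ exit returns the processed byte count in x0.  The 112-byte frame
+	@ started here stacks x19-x24 and d8-d15, the AAPCS64 callee-saved
+	@ registers this kernel uses.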
+ mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp d8, d9, [sp, #48] + stp d10, d11, [sp, #64] + stp d12, d13, [sp, #80] + stp d14, d15, [sp, #96] + + lsr $main_end_input_ptr, $bit_length, #3 @ byte_len + mov $len, $main_end_input_ptr + ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev $ctr96_b64x, $ctr96_b64x + rev $ctr96_t32x, $ctr96_t32x +#endif + ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14 +#ifdef __AARCH64EB__ + ror $rk14_h, $rk14_h, #32 + ror $rk14_l, $rk14_l, #32 +#endif + ld1 {$rk0s}, [$cc], #16 @ load rk0 + sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 + + ld1 {$rk1s}, [$cc], #16 @ load rk1 + and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + + add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr + ld1 {$rk2s}, [$cc], #16 @ load rk2 + + lsr $rctr32x, $ctr96_t32x, #32 + ld1 {$rk3s}, [$cc], #16 @ load rk3 + orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w + + ld1 {$rk4s}, [$cc], #16 @ load rk4 + add $main_end_input_ptr, $main_end_input_ptr, $input_ptr + rev $rctr32w, $rctr32w @ rev_ctr32 + + add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 + fmov $ctr3d, $ctr96_b64x @ CTR block 3 + + rev $ctr32w, $rctr32w @ CTR block 1 + add $rctr32w, $rctr32w, #1 @ CTR block 1 + fmov $ctr1d, $ctr96_b64x @ CTR block 1 + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 + ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible + + fmov $ctr1.d[1], $ctr32x @ CTR block 1 + rev $ctr32w, $rctr32w @ CTR block 2 + add $rctr32w, $rctr32w, #1 @ CTR block 2 + + fmov $ctr2d, $ctr96_b64x @ CTR block 2 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 + + fmov $ctr2.d[1], $ctr32x @ CTR block 2 + rev $ctr32w, $rctr32w @ CTR block 3 + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 + ld1 {$rk5s}, [$cc], #16 @ load rk5 + + fmov $ctr3.d[1], $ctr32x @ CTR block 3 + add $rctr32w, $rctr32w, #1 @ CTR block 3 + + ld1 {$rk6s}, [$cc], #16 @ load rk6 + + ld1 {$rk7s}, [$cc], #16 @ load rk7 + + ld1 {$rk8s}, [$cc], #16 @ load rk8 + + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 + ldr $h3q, [$current_tag, #80] @ load h3l | h3h +#ifndef __AARCH64EB__ + ext $h3b, $h3b, $h3b, #8 +#endif + + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 + ldr $h4q, [$current_tag, #112] @ load h4l | h4h +#ifndef __AARCH64EB__ + ext $h4b, $h4b, $h4b, #8 +#endif + + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 + ldr $h2q, [$current_tag, #64] @ load h2l | h2h +#ifndef __AARCH64EB__ + ext $h2b, $h2b, $h2b, #8 +#endif + + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 + ld1 {$rk9s}, [$cc], #16 @ load rk9 + + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 + + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 + ld1 { $acc_lb}, [$current_tag] + ext $acc_lb, $acc_lb, $acc_lb, #8 + rev64 $acc_lb, $acc_lb + + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 + ld1 {$rk10s}, [$cc], #16 @ load rk10 + + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 + ld1 {$rk11s}, [$cc], #16 @ load rk11 + + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 + ldr $h1q, [$current_tag, #32] @ load h1l | h1h +#ifndef __AARCH64EB__ + ext $h1b, $h1b, $h1b, #8 +#endif + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES 
block 2 - round 2 + ld1 {$rk12s}, [$cc], #16 @ load rk12 + + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 + + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 + + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 + + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 + + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 + cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks + + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 + + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 + + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 + + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 + + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 + + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 + + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 + + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 + + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 + + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 + + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 + + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 + + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 + + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 + + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 + + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 + + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 + + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 + + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 + + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 + + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 + + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 + ld1 {$rk13s}, [$cc], #16 @ load rk13 + + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 + + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 + + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 + + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 + + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 + + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 + + aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11 + + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10 + + aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11 + + aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11 + + aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11 + + trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h + + trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l + + trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h + trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l + + aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12 + + aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12 + + aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12 + + aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12 + eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k + + aese $ctr1b, $rk13 @ AES block 1 - round 13 + + aese $ctr2b, $rk13 @ AES block 2 - round 13 + eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k + + aese $ctr3b, $rk13 @ AES block 3 - round 13 + 
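+	@ (h12k and h34k, finalised just above, are the precomputed
+	@ Karatsuba factors (h_i.hi ^ h_i.lo) for h1|h2 and h3|h4
+	@ respectively; they feed the "mid" pmulls in the GHASH schedule
+	@ that follows)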
+ aese $ctr0b, $rk13 @ AES block 0 - round 13 + b.ge .L256_dec_tail @ handle tail + + ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0,1 - load ciphertext + + rev $ctr32w, $rctr32w @ CTR block 4 + + eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result + + eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result + rev64 $res1b, $res1b @ GHASH block 1 + ld1 {$res2b}, [$input_ptr], #16 @ AES block 2 - load ciphertext + + mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high + + mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low + rev64 $res0b, $res0b @ GHASH block 0 + add $rctr32w, $rctr32w, #1 @ CTR block 4 + + fmov $ctr0d, $ctr96_b64x @ CTR block 4 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 + + fmov $ctr0.d[1], $ctr32x @ CTR block 4 + rev $ctr32w, $rctr32w @ CTR block 5 + add $rctr32w, $rctr32w, #1 @ CTR block 5 + + mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low + + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 + mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high + eor $output_h0, $output_h0, $rk14_h @ AES block 0 - round 14 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif + eor $output_l0, $output_l0, $rk14_l @ AES block 0 - round 14 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif + stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result + fmov $ctr1d, $ctr96_b64x @ CTR block 5 + + ld1 {$res3b}, [$input_ptr], #16 @ AES block 3 - load ciphertext + + fmov $ctr1.d[1], $ctr32x @ CTR block 5 + rev $ctr32w, $rctr32w @ CTR block 6 + add $rctr32w, $rctr32w, #1 @ CTR block 6 + + eor $output_l1, $output_l1, $rk14_l @ AES block 1 - round 14 low +#ifdef __AARCH64EB__ + rev $output_l1, $output_l1 +#endif + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 + + eor $output_h1, $output_h1, $rk14_h @ AES block 1 - round 14 high +#ifdef __AARCH64EB__ + rev $output_h1, $output_h1 +#endif + stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result + + eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result + cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks + b.ge .L256_dec_prepretail @ do prepretail + + .L256_dec_main_loop: @ main loop start + mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low + ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 + eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result + + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 + mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high + + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 + fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 + + fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 + eor $res0b, $res0b, $acc_lb @ PRE 1 + rev $ctr32w, $rctr32w @ CTR block 4k+7 + + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 + mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high + + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 + mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low + + pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high + mov $t0d, $res0.d[1] @ GHASH block 4k - mid + fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 + + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 + + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 + fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 + + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 + eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid + + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - 
round 3 + eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high +#ifdef __AARCH64EB__ + rev $output_h2, $output_h2 +#endif + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 + mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid + + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 + rev64 $res2b, $res2b @ GHASH block 4k+2 + + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 + eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low +#ifdef __AARCH64EB__ + rev $output_l2, $output_l2 +#endif + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 + stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result + + pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low + + pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high + + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 + rev64 $res3b, $res3b @ GHASH block 4k+3 + + pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid + eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low +#ifdef __AARCH64EB__ + rev $output_l3, $output_l3 +#endif + pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low + eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high +#ifdef __AARCH64EB__ + rev $output_h3, $output_h3 +#endif + eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high + + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 + + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 + mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid + + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 + eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low + + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 + add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 + + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 + mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid + + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 + eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid + + pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low + + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 + eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid + + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 + + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 + eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low + + pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid + rev $ctr32w, $rctr32w @ CTR block 4k+8 + + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 + ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid + + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 + add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 + + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 + + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 + eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid + + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 + + pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high + mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid + + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 + + pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid + + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 + eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high + + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 
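+	@ (each GHASH block contributes three partial products: pmull for
+	@ the low 64x64 half, pmull2 for the high half, and a "mid"
+	@ Karatsuba term formed from the xor of the halves; acc_l, acc_h
+	@ and acc_m accumulate them until the MODULO sequence folds the
+	@ wide result back to 128 bits)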
+ + pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 + eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid + + pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high + + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 + eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid + + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 + + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 + eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high + + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 + + pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid + movi $mod_constant.8b, #0xc2 + + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 + eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low + + aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11 + + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 + shl $mod_constantd, $mod_constantd, #56 @ mod_constant + + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 + eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid + + aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12 + + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid + eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up + + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 + ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext + + aese $ctr0b, $rk13 @ AES block 4k+4 - round 13 + ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment + + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 + eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up + + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 + ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext + + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 + eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result + + aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11 + stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result + + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 + eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid + + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 + ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext + + aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12 + ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+7 - load ciphertext + + aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11 + mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high + + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 + eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid + + aese $ctr1b, $rk13 @ AES block 4k+5 - round 13 + mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low + + aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12 + fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 + + aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11 + fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 + + pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low + eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result + rev $ctr32w, $rctr32w @ CTR block 4k+9 + + aese $ctr2b, $rk13 @ AES block 4k+6 - round 13 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl 
#32 @ CTR block 4k+9 + cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL + + add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 + + eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif + eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif + mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high + eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result + eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low + + aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12 + mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low + + fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 + ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment + + fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 + rev $ctr32w, $rctr32w @ CTR block 4k+10 + add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 + + aese $ctr3b, $rk13 @ AES block 4k+7 - round 13 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 + + rev64 $res1b, $res1b @ GHASH block 4k+5 + eor $output_h1, $output_h1, $rk14_h @ AES block 4k+5 - round 14 high +#ifdef __AARCH64EB__ + rev $output_h1, $output_h1 +#endif + stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result + + eor $output_l1, $output_l1, $rk14_l @ AES block 4k+5 - round 14 low +#ifdef __AARCH64EB__ + rev $output_l1, $output_l1 +#endif + stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result + + rev64 $res0b, $res0b @ GHASH block 4k+4 + eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low + b.lt .L256_dec_main_loop + + + .L256_dec_prepretail: @ PREPRETAIL + ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 + mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low + eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result + + aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 + mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high + + aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 + fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 + + fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 + rev $ctr32w, $rctr32w @ CTR block 4k+7 + eor $res0b, $res0b, $acc_lb @ PRE 1 + + rev64 $res2b, $res2b @ GHASH block 4k+2 + orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 + mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low + + aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 + mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high + + pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low + mov $t0d, $res0.d[1] @ GHASH block 4k - mid + fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 + + pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high + fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 + + aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 + mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid + + aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 + eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid + + pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high + + aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 + rev64 $res3b, $res3b @ GHASH block 4k+3 + + aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 + + pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid + eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high + + pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low + + aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 + mov $t3d, $res1.d[1] @ GHASH 
block 4k+1 - mid + + aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 + + aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 + eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low + + aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 + + aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 + mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid + + aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 + eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid + + pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low + + aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 + + aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 + eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid + + pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid + + aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 + eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low + + aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 + + pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high + eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid + + pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high + + aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 + ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid + + aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 + + aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 + eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high + + pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low + + aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 + mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid + + aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 + + pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid + + aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 + eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid + + aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 + + aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 + eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid + + aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 + + aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 + movi $mod_constant.8b, #0xc2 + + aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 + eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low + + pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid + + aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 + eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high + + aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 + + aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 + eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid + + aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 + + aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 + eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up + + aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 + + aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 + shl $mod_constantd, $mod_constantd, #56 @ mod_constant + + aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 + + aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 + eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up + + pmull 
$mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid + + aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 + ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment + + aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 + + aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 + eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid + + aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 + + aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 + + aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 + eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high +#ifdef __AARCH64EB__ + rev $output_h2, $output_h2 +#endif + aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 + eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low +#ifdef __AARCH64EB__ + rev $output_l3, $output_l3 +#endif + aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11 + eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid + + aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11 + add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 + + aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11 + eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low +#ifdef __AARCH64EB__ + rev $output_l2, $output_l2 +#endif + + aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12 + + pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low + eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high +#ifdef __AARCH64EB__ + rev $output_h3, $output_h3 +#endif + + aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11 + stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result + + aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12 + ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment + + aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12 + stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result + + aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12 + eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low + + aese $ctr1b, $rk13 @ AES block 4k+5 - round 13 + + aese $ctr0b, $rk13 @ AES block 4k+4 - round 13 + + aese $ctr3b, $rk13 @ AES block 4k+7 - round 13 + + aese $ctr2b, $rk13 @ AES block 4k+6 - round 13 + eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low + .L256_dec_tail: @ TAIL + + sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process + ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext + + eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result + + mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low + + mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high + ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag + + cmp $main_end_input_ptr, #48 + + eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif + + eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif + b.gt .L256_dec_blocks_more_than_3 + + sub $rctr32w, $rctr32w, #1 + mov $ctr3b, $ctr2b + movi $acc_m.8b, #0 + + movi $acc_l.8b, #0 + cmp $main_end_input_ptr, #32 + + movi $acc_h.8b, #0 + mov $ctr2b, $ctr1b + b.gt 
.L256_dec_blocks_more_than_2 + + sub $rctr32w, $rctr32w, #1 + + mov $ctr3b, $ctr1b + cmp $main_end_input_ptr, #16 + b.gt .L256_dec_blocks_more_than_1 + + sub $rctr32w, $rctr32w, #1 + b .L256_dec_blocks_less_than_1 + .L256_dec_blocks_more_than_3: @ blocks left > 3 + rev64 $res0b, $res1b @ GHASH final-3 block + ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext + + stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result + + mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid + + eor $res0b, $res0b, $t0.16b @ feed in partial tag + + eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result + + mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid + + mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low + + mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high + + eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid + + movi $t0.8b, #0 @ suppress further partial tag feed in + + pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high + + pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid + eor $output_l0, $output_l0, $rk14_l @ AES final-2 block - round 14 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif + + pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low + eor $output_h0, $output_h0, $rk14_h @ AES final-2 block - round 14 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif + .L256_dec_blocks_more_than_2: @ blocks left > 2 + + rev64 $res0b, $res1b @ GHASH final-2 block + ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext + + eor $res0b, $res0b, $t0.16b @ feed in partial tag + stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result + + eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result + + mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid + + pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low + + pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high + + eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid + mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low + + mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high + eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low + movi $t0.8b, #0 @ suppress further partial tag feed in + + pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid + + eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high + eor $output_l0, $output_l0, $rk14_l @ AES final-1 block - round 14 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif + + eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid + eor $output_h0, $output_h0, $rk14_h @ AES final-1 block - round 14 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif + .L256_dec_blocks_more_than_1: @ blocks left > 1 + + stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result + rev64 $res0b, $res1b @ GHASH final-1 block + + ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext + + eor $res0b, $res0b, $t0.16b @ feed in partial tag + movi $t0.8b, #0 @ suppress further partial tag feed in + + mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid + + eor $ctr0b, $res1b, $ctr3b @ AES final block - result + + pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high + + eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid + + pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low + mov $output_l0, $ctr0.d[0] @ AES final block - mov low + + ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid + + mov $output_h0, $ctr0.d[1] @ AES final block - mov 
high + + pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid + eor $output_l0, $output_l0, $rk14_l @ AES final block - round 14 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif + eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low + + eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high + + eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid + eor $output_h0, $output_h0, $rk14_h @ AES final block - round 14 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif + .L256_dec_blocks_less_than_1: @ blocks left <= 1 + + and $bit_length, $bit_length, #127 @ bit_length %= 128 + mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff + + sub $bit_length, $bit_length, #128 @ bit_length -= 128 + mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff + + ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite + neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) + + and $bit_length, $bit_length, #127 @ bit_length %= 128 + + lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block + cmp $bit_length, #64 + + csel $ctr32x, $rk14_l, $rk14_h, lt + csel $ctr96_b64x, $rk14_h, xzr, lt + + fmov $ctr0d, $ctr32x @ ctr0b is mask for last block + and $output_l0, $output_l0, $ctr32x + + mov $ctr0.d[1], $ctr96_b64x + bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes + +#ifndef __AARCH64EB__ + rev $ctr32w, $rctr32w +#else + mov $ctr32w, $rctr32w +#endif + + bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes + + orr $output_l0, $output_l0, $end_input_ptr + + and $output_h0, $output_h0, $ctr96_b64x + + orr $output_h0, $output_h0, $main_end_input_ptr + + and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits + + rev64 $res0b, $res1b @ GHASH final block + + eor $res0b, $res0b, $t0.16b @ feed in partial tag + + pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low + + mov $t0d, $res0.d[1] @ GHASH final block - mid + + eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid + + pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high + + pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid + + eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high + + eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low + + eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid + movi $mod_constant.8b, #0xc2 + + eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up + + shl $mod_constantd, $mod_constantd, #56 @ mod_constant + + eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up + + pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid + + ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment + + eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid + + eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid + + pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low + + ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment + + eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low + + stp $output_l0, $output_h0, [$output_ptr] + + str $ctr32w, [$counter, #12] @ store the updated counter + + eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low + ext $acc_lb, $acc_lb, $acc_lb, #8 + rev64 $acc_lb, $acc_lb + mov x0, $len + st1 { $acc_l.16b }, [$current_tag] + + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp d8, d9, [sp, #48] + ldp d10, d11, [sp, #64] + ldp d12, d13, [sp, #80] + ldp d14, d15, [sp, #96] + ldp 
x19, x20, [sp], #112
+ ret
+
+.L256_dec_ret:
+ mov w0, #0x0
+ ret
+.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
+___
+}
+}
+
+$code.=<<___;
+.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#endif
+___
+
+if ($flavour =~ /64/) { ######## 64-bit code
+    sub unvmov {
+        my $arg=shift;
+
+        $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
+        sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
+            $3<8?$3:$3+8,($4 eq "lo")?0:1;
+    }
+    foreach(split("\n",$code)) {
+        s/@\s/\/\//o; # old->new style commentary
+        print $_,"\n";
+    }
+} else { ######## 32-bit code
+    sub unvdup32 {
+        my $arg=shift;
+
+        $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
+        sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
+    }
+    sub unvpmullp64 {
+        my ($mnemonic,$arg)=@_;
+
+        if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
+            my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
+                |(($2&7)<<17)|(($2&8)<<4)
+                |(($3&7)<<1) |(($3&8)<<2);
+            $word |= 0x00010001 if ($mnemonic =~ "2");
+            # since ARMv7 instructions are always encoded little-endian.
+            # correct solution is to use .inst directive, but older
+            # assemblers don't implement it:-(
+            sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+                $word&0xff,($word>>8)&0xff,
+                ($word>>16)&0xff,($word>>24)&0xff,
+                $mnemonic,$arg;
+        }
+    }
+
+    foreach(split("\n",$code)) {
+        s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
+        s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
+        s/\/\/\s?/@ /o; # new->old style commentary
+
+        # fix up remaining new-style suffixes
+        s/\],#[0-9]+/]!/o;
+
+        s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
+        s/vdup\.32\s+(.*)/unvdup32($1)/geo or
+        s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
+        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+        s/^(\s+)b\./$1b/o or
+        s/^(\s+)ret/$1bx\tlr/o;
+
+        if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
+            print " it $2\n";
+        }
+        s/__AARCH64E([BL])__/__ARME$1__/go;
+        print $_,"\n";
+    }
+}
+
+close STDOUT or die "error closing STDOUT: $!"; # enforce flush
diff --git a/crypto/modes/asm/aesni-gcm-x86_64.pl b/crypto/modes/asm/aesni-gcm-x86_64.pl
index 60f03e4fe25b..eaf4d9c755fb 100644
--- a/crypto/modes/asm/aesni-gcm-x86_64.pl
+++ b/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -1,7 +1,7 @@
 #! /usr/bin/env perl
 # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
 #
-# Licensed under the OpenSSL license (the "License"). You may not use
+# Licensed under the Apache License 2.0 (the "License"). You may not use
 # this file except in compliance with the License. You can obtain a copy
 # in the file LICENSE in the source distribution or at
 # https://www.openssl.org/source/license.html
@@ -40,9 +40,10 @@
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
 
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
 
 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
@@ -70,7 +71,8 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0
 	$avx = ($2>=3.0) + ($2>3.0);
 }
 
-open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
 *STDOUT=*OUT;
 
 if ($avx>1) {{{
diff --git a/crypto/modes/asm/ghash-alpha.pl b/crypto/modes/asm/ghash-alpha.pl
new file mode 100644
index 000000000000..c350e9015550
--- /dev/null
+++ b/crypto/modes/asm/ghash-alpha.pl
@@ -0,0 +1,467 @@
+#! /usr/bin/env perl
+# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Even though
+# loops are aggressively modulo-scheduled in respect to references to
+# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
+# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
+# scheduling "glitch," because uprofile(1) indicates uniform sample
+# distribution, as if all instruction bundles execute in 1.5 cycles.
+# Meaning that it could have been even faster, yet 12 cycles is ~60%
+# better than gcc-generated code and ~80% better than code generated
+# by vendor compiler.
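+#
+# For reference, the per-key table is Htbl[16] (16 entries x 16 bytes =
+# 256 bytes), holding the multiples 0*H..15*H of the hash key in the
+# bit-reflected GHASH representation, and the shared 128-byte table is
+# rem_4bit at the bottom of this file.  A rough C sketch of the same
+# single multiplication, modelled on the generic gcm_gmult_4bit() in
+# crypto/modes/gcm128.c (64-bit little-endian host assumed, identifiers
+# illustrative), shows the loop this module schedules for Alpha:
+#
+#	#include <stdint.h>
+#	typedef struct { uint64_t hi, lo; } u128;
+#
+#	static const uint64_t rem_4bit[16] = {
+#	    0x0000ULL<<48, 0x1C20ULL<<48, 0x3840ULL<<48, 0x2460ULL<<48,
+#	    0x7080ULL<<48, 0x6CA0ULL<<48, 0x48C0ULL<<48, 0x54E0ULL<<48,
+#	    0xE100ULL<<48, 0xFD20ULL<<48, 0xD940ULL<<48, 0xC560ULL<<48,
+#	    0x9180ULL<<48, 0x8DA0ULL<<48, 0xA9C0ULL<<48, 0xB5E0ULL<<48 };
+#
+#	void gmult_4bit(uint8_t Xi[16], const u128 Htbl[16])
+#	{
+#	    u128 Z;
+#	    uint64_t rem;
+#	    int cnt = 15;
+#	    unsigned int nlo = Xi[15], nhi = nlo>>4;
+#
+#	    Z = Htbl[nlo & 0xf];
+#	    for (;;) {
+#	        /* Z *= x^4, i.e. shift right in the bit-reflected form;
+#	         * rem_4bit folds the 4 bits shifted out back into the
+#	         * top, reducing modulo the GHASH polynomial */
+#	        rem  = Z.lo & 0xf;
+#	        Z.lo = (Z.hi<<60)|(Z.lo>>4);
+#	        Z.hi = (Z.hi>>4) ^ rem_4bit[rem];
+#	        Z.hi ^= Htbl[nhi].hi;  Z.lo ^= Htbl[nhi].lo;
+#
+#	        if (--cnt < 0) break;
+#
+#	        nlo = Xi[cnt]; nhi = nlo>>4; nlo &= 0xf;
+#	        rem  = Z.lo & 0xf;
+#	        Z.lo = (Z.hi<<60)|(Z.lo>>4);
+#	        Z.hi = (Z.hi>>4) ^ rem_4bit[rem];
+#	        Z.hi ^= Htbl[nlo].hi;  Z.lo ^= Htbl[nlo].lo;
+#	    }
+#	    /* store Z back big-endian, as done below with zapnot swaps */
+#	    for (cnt = 7; cnt >= 0; cnt--, Z.hi >>= 8, Z.lo >>= 8) {
+#	        Xi[cnt] = (uint8_t)Z.hi;  Xi[8+cnt] = (uint8_t)Z.lo;
+#	    }
+#	}
+#
+# Two such shift-and-fold steps per byte are what .Looplo/.Loophi below
+# interleave with the next table lookups.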
+ +$cnt="v0"; # $0 +$t0="t0"; +$t1="t1"; +$t2="t2"; +$Thi0="t3"; # $4 +$Tlo0="t4"; +$Thi1="t5"; +$Tlo1="t6"; +$rem="t7"; # $8 +################# +$Xi="a0"; # $16, input argument block +$Htbl="a1"; +$inp="a2"; +$len="a3"; +$nlo="a4"; # $20 +$nhi="a5"; +$Zhi="t8"; +$Zlo="t9"; +$Xhi="t10"; # $24 +$Xlo="t11"; +$remp="t12"; +$rem_4bit="AT"; # $28 + +{ my $N; + sub loop() { + + $N++; +$code.=<<___; +.align 4 + extbl $Xlo,7,$nlo + and $nlo,0xf0,$nhi + sll $nlo,4,$nlo + and $nlo,0xf0,$nlo + + addq $nlo,$Htbl,$nlo + ldq $Zlo,8($nlo) + addq $nhi,$Htbl,$nhi + ldq $Zhi,0($nlo) + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + lda $cnt,6(zero) + extbl $Xlo,6,$nlo + + ldq $Tlo1,8($nhi) + s8addq $remp,$rem_4bit,$remp + ldq $Thi1,0($nhi) + srl $Zlo,4,$Zlo + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + and $nlo,0xf0,$nhi + + xor $Tlo1,$Zlo,$Zlo + sll $nlo,4,$nlo + xor $Thi1,$Zhi,$Zhi + and $nlo,0xf0,$nlo + + addq $nlo,$Htbl,$nlo + ldq $Tlo0,8($nlo) + addq $nhi,$Htbl,$nhi + ldq $Thi0,0($nlo) + +.Looplo$N: + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + subq $cnt,1,$cnt + srl $Zlo,4,$Zlo + + ldq $Tlo1,8($nhi) + xor $rem,$Zhi,$Zhi + ldq $Thi1,0($nhi) + s8addq $remp,$rem_4bit,$remp + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + extbl $Xlo,$cnt,$nlo + + and $nlo,0xf0,$nhi + xor $Thi0,$Zhi,$Zhi + xor $Tlo0,$Zlo,$Zlo + sll $nlo,4,$nlo + + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + and $nlo,0xf0,$nlo + srl $Zlo,4,$Zlo + + s8addq $remp,$rem_4bit,$remp + xor $rem,$Zhi,$Zhi + addq $nlo,$Htbl,$nlo + addq $nhi,$Htbl,$nhi + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + ldq $Tlo0,8($nlo) + xor $t0,$Zlo,$Zlo + + xor $Tlo1,$Zlo,$Zlo + xor $Thi1,$Zhi,$Zhi + ldq $Thi0,0($nlo) + bne $cnt,.Looplo$N + + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + lda $cnt,7(zero) + srl $Zlo,4,$Zlo + + ldq $Tlo1,8($nhi) + xor $rem,$Zhi,$Zhi + ldq $Thi1,0($nhi) + s8addq $remp,$rem_4bit,$remp + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + extbl $Xhi,$cnt,$nlo + + and $nlo,0xf0,$nhi + xor $Thi0,$Zhi,$Zhi + xor $Tlo0,$Zlo,$Zlo + sll $nlo,4,$nlo + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + and $nlo,0xf0,$nlo + srl $Zlo,4,$Zlo + + s8addq $remp,$rem_4bit,$remp + xor $rem,$Zhi,$Zhi + addq $nlo,$Htbl,$nlo + addq $nhi,$Htbl,$nhi + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + ldq $Tlo0,8($nlo) + xor $t0,$Zlo,$Zlo + + xor $Tlo1,$Zlo,$Zlo + xor $Thi1,$Zhi,$Zhi + ldq $Thi0,0($nlo) + unop + + +.Loophi$N: + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + subq $cnt,1,$cnt + srl $Zlo,4,$Zlo + + ldq $Tlo1,8($nhi) + xor $rem,$Zhi,$Zhi + ldq $Thi1,0($nhi) + s8addq $remp,$rem_4bit,$remp + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + extbl $Xhi,$cnt,$nlo + + and $nlo,0xf0,$nhi + xor $Thi0,$Zhi,$Zhi + xor $Tlo0,$Zlo,$Zlo + sll $nlo,4,$nlo + + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + and $nlo,0xf0,$nlo + srl $Zlo,4,$Zlo + + s8addq $remp,$rem_4bit,$remp + xor $rem,$Zhi,$Zhi + addq $nlo,$Htbl,$nlo + addq $nhi,$Htbl,$nhi + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + ldq $Tlo0,8($nlo) + xor $t0,$Zlo,$Zlo + + xor $Tlo1,$Zlo,$Zlo + xor $Thi1,$Zhi,$Zhi + ldq $Thi0,0($nlo) + bne $cnt,.Loophi$N + + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + srl $Zlo,4,$Zlo + + ldq $Tlo1,8($nhi) + xor $rem,$Zhi,$Zhi + ldq $Thi1,0($nhi) + s8addq $remp,$rem_4bit,$remp + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + + xor $Tlo0,$Zlo,$Zlo + xor $Thi0,$Zhi,$Zhi + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + srl $Zlo,4,$Zlo + + s8addq $remp,$rem_4bit,$remp + xor $rem,$Zhi,$Zhi + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $Tlo1,$Zlo,$Zlo + xor $Thi1,$Zhi,$Zhi + xor 
$t0,$Zlo,$Zlo + xor $rem,$Zhi,$Zhi +___ +}} + +$code=<<___; +#ifdef __linux__ +#include <asm/regdef.h> +#else +#include <asm.h> +#include <regdef.h> +#endif + +.text + +.set noat +.set noreorder +.globl gcm_gmult_4bit +.align 4 +.ent gcm_gmult_4bit +gcm_gmult_4bit: + .frame sp,0,ra + .prologue 0 + + ldq $Xlo,8($Xi) + ldq $Xhi,0($Xi) + + bsr $t0,picmeup + nop +___ + + &loop(); + +$code.=<<___; + srl $Zlo,24,$t0 # byte swap + srl $Zlo,8,$t1 + + sll $Zlo,8,$t2 + sll $Zlo,24,$Zlo + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + + zapnot $Zlo,0x88,$Zlo + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + + or $Zlo,$t0,$Zlo + srl $Zhi,24,$t0 + srl $Zhi,8,$t1 + + or $Zlo,$t2,$Zlo + sll $Zhi,8,$t2 + sll $Zhi,24,$Zhi + + srl $Zlo,32,$Xlo + sll $Zlo,32,$Zlo + + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + or $Zlo,$Xlo,$Xlo + + zapnot $Zhi,0x88,$Zhi + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + + or $Zhi,$t0,$Zhi + or $Zhi,$t2,$Zhi + + srl $Zhi,32,$Xhi + sll $Zhi,32,$Zhi + + or $Zhi,$Xhi,$Xhi + stq $Xlo,8($Xi) + stq $Xhi,0($Xi) + + ret (ra) +.end gcm_gmult_4bit +___ + +$inhi="s0"; +$inlo="s1"; + +$code.=<<___; +.globl gcm_ghash_4bit +.align 4 +.ent gcm_ghash_4bit +gcm_ghash_4bit: + lda sp,-32(sp) + stq ra,0(sp) + stq s0,8(sp) + stq s1,16(sp) + .mask 0x04000600,-32 + .frame sp,32,ra + .prologue 0 + + ldq_u $inhi,0($inp) + ldq_u $Thi0,7($inp) + ldq_u $inlo,8($inp) + ldq_u $Tlo0,15($inp) + ldq $Xhi,0($Xi) + ldq $Xlo,8($Xi) + + bsr $t0,picmeup + nop + +.Louter: + extql $inhi,$inp,$inhi + extqh $Thi0,$inp,$Thi0 + or $inhi,$Thi0,$inhi + lda $inp,16($inp) + + extql $inlo,$inp,$inlo + extqh $Tlo0,$inp,$Tlo0 + or $inlo,$Tlo0,$inlo + subq $len,16,$len + + xor $Xlo,$inlo,$Xlo + xor $Xhi,$inhi,$Xhi +___ + + &loop(); + +$code.=<<___; + srl $Zlo,24,$t0 # byte swap + srl $Zlo,8,$t1 + + sll $Zlo,8,$t2 + sll $Zlo,24,$Zlo + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + + zapnot $Zlo,0x88,$Zlo + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + + or $Zlo,$t0,$Zlo + srl $Zhi,24,$t0 + srl $Zhi,8,$t1 + + or $Zlo,$t2,$Zlo + sll $Zhi,8,$t2 + sll $Zhi,24,$Zhi + + srl $Zlo,32,$Xlo + sll $Zlo,32,$Zlo + beq $len,.Ldone + + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + or $Zlo,$Xlo,$Xlo + ldq_u $inhi,0($inp) + + zapnot $Zhi,0x88,$Zhi + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + ldq_u $Thi0,7($inp) + + or $Zhi,$t0,$Zhi + or $Zhi,$t2,$Zhi + ldq_u $inlo,8($inp) + ldq_u $Tlo0,15($inp) + + srl $Zhi,32,$Xhi + sll $Zhi,32,$Zhi + + or $Zhi,$Xhi,$Xhi + br zero,.Louter + +.Ldone: + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + or $Zlo,$Xlo,$Xlo + + zapnot $Zhi,0x88,$Zhi + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + + or $Zhi,$t0,$Zhi + or $Zhi,$t2,$Zhi + + srl $Zhi,32,$Xhi + sll $Zhi,32,$Zhi + + or $Zhi,$Xhi,$Xhi + + stq $Xlo,8($Xi) + stq $Xhi,0($Xi) + + .set noreorder + /*ldq ra,0(sp)*/ + ldq s0,8(sp) + ldq s1,16(sp) + lda sp,32(sp) + ret (ra) +.end gcm_ghash_4bit + +.align 4 +.ent picmeup +picmeup: + .frame sp,0,$t0 + .prologue 0 + br $rem_4bit,.Lpic +.Lpic: lda $rem_4bit,12($rem_4bit) + ret ($t0) +.end picmeup + nop +rem_4bit: + .long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16 + .long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16 + .long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16 + .long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16 +.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>" +.align 4 + +___ +$output=pop and open STDOUT,">$output"; +print $code; +close STDOUT or die "error closing STDOUT: $!"; + diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl index d84ac6f2bf43..044f86f143f9 100644 --- 
a/crypto/modes/asm/ghash-armv4.pl +++ b/crypto/modes/asm/ghash-armv4.pl @@ -1,7 +1,7 @@ #! /usr/bin/env perl # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved. # -# Licensed under the OpenSSL license (the "License"). You may not use +# Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html @@ -78,9 +78,10 @@ # *native* byte order on current platform. See gcm128.c for working # example... -$flavour = shift; -if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } -else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; @@ -88,9 +89,10 @@ if ($flavour && $flavour ne "void") { ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; - open STDOUT,"| \"$^X\" $xlate $flavour $output"; + open STDOUT,"| \"$^X\" $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; } else { - open STDOUT,">$output"; + $output and open STDOUT,">$output"; } $Xi="r0"; # argument block @@ -142,7 +144,6 @@ ___ $code=<<___; #include "arm_arch.h" -.text #if defined(__thumb2__) || defined(__clang__) .syntax unified #define ldrplb ldrbpl @@ -154,6 +155,8 @@ $code=<<___; .code 32 #endif +.text + .type rem_4bit,%object .align 5 rem_4bit: diff --git a/crypto/modes/asm/ghash-c64xplus.pl b/crypto/modes/asm/ghash-c64xplus.pl index 5826dcbdfe64..ded2f779e669 100644 --- a/crypto/modes/asm/ghash-c64xplus.pl +++ b/crypto/modes/asm/ghash-c64xplus.pl @@ -1,7 +1,7 @@ #! /usr/bin/env perl # Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved. # -# Licensed under the OpenSSL license (the "License"). You may not use +# Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html @@ -26,8 +26,7 @@ # better, because theoretical [though not necessarily achievable] # estimate for "4-bit" table-driven implementation is ~12 cycles. -while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$output = pop and open STDOUT,">$output"; ($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6"); # arguments diff --git a/crypto/modes/asm/ghash-ia64.pl b/crypto/modes/asm/ghash-ia64.pl index 6e29f995fa85..74b88176b842 100755 --- a/crypto/modes/asm/ghash-ia64.pl +++ b/crypto/modes/asm/ghash-ia64.pl @@ -1,7 +1,7 @@ #! /usr/bin/env perl # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved. # -# Licensed under the OpenSSL license (the "License"). You may not use +# Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. 
You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html diff --git a/crypto/modes/asm/ghash-parisc.pl b/crypto/modes/asm/ghash-parisc.pl index d798b7abf490..7b5cc2f16e38 100644 --- a/crypto/modes/asm/ghash-parisc.pl +++ b/crypto/modes/asm/ghash-parisc.pl @@ -1,7 +1,7 @@ #! /usr/bin/env perl # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved. # -# Licensed under the OpenSSL license (the "License"). You may not use +# Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html @@ -27,9 +27,12 @@ # # Special thanks to polarhome.com for providing HP-UX account. -$flavour = shift; -$output = shift; -open STDOUT,">$output"; +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + +$output and open STDOUT,">$output"; if ($flavour =~ /64/) { $LEVEL ="2.0W"; diff --git a/crypto/modes/asm/ghash-s390x.pl b/crypto/modes/asm/ghash-s390x.pl index 2f3ca295bdca..ba9c5b4a4f4f 100644 --- a/crypto/modes/asm/ghash-s390x.pl +++ b/crypto/modes/asm/ghash-s390x.pl @@ -1,7 +1,7 @@ #! /usr/bin/env perl # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved. # -# Licensed under the OpenSSL license (the "License"). You may not use +# Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html @@ -44,7 +44,10 @@ # it's actually almost 2 times slower. Which is the reason why # KIMD-GHASH is not used in gcm_gmult_4bit. -$flavour = shift; +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; if ($flavour =~ /3[12]/) { $SIZE_T=4; @@ -54,8 +57,7 @@ if ($flavour =~ /3[12]/) { $g="g"; } -while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} -open STDOUT,">$output"; +$output and open STDOUT,">$output"; $softonly=0; diff --git a/crypto/modes/asm/ghash-sparcv9.pl b/crypto/modes/asm/ghash-sparcv9.pl index ccebc74b4e97..e35064237f46 100644 --- a/crypto/modes/asm/ghash-sparcv9.pl +++ b/crypto/modes/asm/ghash-sparcv9.pl @@ -1,7 +1,7 @@ #! /usr/bin/env perl -# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved. +# Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved. # -# Licensed under the OpenSSL license (the "License"). You may not use +# Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html @@ -53,8 +53,7 @@ # saturates at ~15.5x single-process result on 8-core processor, # or ~20.5GBps per 2.85GHz socket. 
diff --git a/crypto/modes/asm/ghash-sparcv9.pl b/crypto/modes/asm/ghash-sparcv9.pl
index ccebc74b4e97..e35064237f46 100644
--- a/crypto/modes/asm/ghash-sparcv9.pl
+++ b/crypto/modes/asm/ghash-sparcv9.pl
@@ -1,7 +1,7 @@
 #! /usr/bin/env perl
-# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved.
 #
-# Licensed under the OpenSSL license (the "License").  You may not use
+# Licensed under the Apache License 2.0 (the "License").  You may not use
 # this file except in compliance with the License.  You can obtain a copy
 # in the file LICENSE in the source distribution or at
 # https://www.openssl.org/source/license.html
@@ -53,8 +53,7 @@
 # saturates at ~15.5x single-process result on 8-core processor,
 # or ~20.5GBps per 2.85GHz socket.
 
-$output=pop;
-open STDOUT,">$output";
+$output=pop and open STDOUT,">$output";
 
 $frame="STACK_FRAME";
 $bias="STACK_BIAS";
@@ -81,7 +80,10 @@ $inp="%i2";
 $len="%i3";
 
 $code.=<<___;
-#include "sparc_arch.h"
+#ifndef __ASSEMBLER__
+# define __ASSEMBLER__ 1
+#endif
+#include "crypto/sparc_arch.h"
 
 #ifdef __arch64__
 .register	%g2,#scratch
diff --git a/crypto/modes/asm/ghash-x86.pl b/crypto/modes/asm/ghash-x86.pl
index 4183d99b68b1..c796107c7f97 100644
--- a/crypto/modes/asm/ghash-x86.pl
+++ b/crypto/modes/asm/ghash-x86.pl
@@ -1,7 +1,7 @@
 #! /usr/bin/env perl
 # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
 #
-# Licensed under the OpenSSL license (the "License").  You may not use
+# Licensed under the Apache License 2.0 (the "License").  You may not use
 # this file except in compliance with the License.  You can obtain a copy
 # in the file LICENSE in the source distribution or at
 # https://www.openssl.org/source/license.html
@@ -135,8 +135,7 @@
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 
-$output=pop;
-open STDOUT,">$output";
+$output=pop and open STDOUT,">$output";
 
 &asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl
index 9bdba41d1de5..6709f96492ed 100644
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@@ -1,7 +1,7 @@
 #! /usr/bin/env perl
 # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
 #
-# Licensed under the OpenSSL license (the "License").  You may not use
+# Licensed under the Apache License 2.0 (the "License").  You may not use
 # this file except in compliance with the License.  You can obtain a copy
 # in the file LICENSE in the source distribution or at
 # https://www.openssl.org/source/license.html
@@ -90,9 +90,10 @@
 #
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
 
 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
 
@@ -120,7 +121,8 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
 	$avx = ($2>=3.0) + ($2>3.0);
 }
 
-open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
 *STDOUT=*OUT;
 
 $do4xaggr=1;
@@ -239,6 +241,7 @@ $code=<<___;
 .align	16
 gcm_gmult_4bit:
 .cfi_startproc
+	endbranch
 	push	%rbx
 .cfi_push	%rbx
 	push	%rbp		# %rbp and others are pushed exclusively in
@@ -286,6 +289,7 @@ $code.=<<___;
 .align	16
 gcm_ghash_4bit:
 .cfi_startproc
+	endbranch
 	push	%rbx
 .cfi_push	%rbx
 	push	%rbp
@@ -612,6 +616,7 @@ $code.=<<___;
 .align	16
 gcm_gmult_clmul:
 .cfi_startproc
+	endbranch
 .L_gmult_clmul:
 	movdqu	($Xip),$Xi
 	movdqa	.Lbswap_mask(%rip),$T3
@@ -663,6 +668,7 @@ $code.=<<___;
 .align	32
 gcm_ghash_clmul:
 .cfi_startproc
+	endbranch
 .L_ghash_clmul:
 ___
 $code.=<<___ if ($win64);
@@ -1166,6 +1172,7 @@ $code.=<<___;
 .align	32
 gcm_gmult_avx:
 .cfi_startproc
+	endbranch
 	jmp	.L_gmult_clmul
 .cfi_endproc
 .size	gcm_gmult_avx,.-gcm_gmult_avx
@@ -1177,6 +1184,7 @@ $code.=<<___;
 .align	32
 gcm_ghash_avx:
 .cfi_startproc
+	endbranch
 ___
 if ($avx) {
 my ($Xip,$Htbl,$inp,$len)=@_4args;
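
The "endbranch" lines added after each .cfi_startproc mark these global entry
points as valid indirect-branch targets for Intel CET (IBT); the translator
expands the marker, and the resulting ENDBR64 instruction
(0xf3,0x0f,0x1e,0xfa) decodes as a NOP on processors without CET. A
hypothetical mini-filter sketching that expansion (illustration only, not the
real perlasm x86_64 translator):

#! /usr/bin/env perl
use strict;
use warnings;

# Replace a bare "endbranch" marker with the ENDBR64 opcode bytes.
while (my $line = <>) {
    chomp $line;
    $line = "\t.byte\t0xf3,0x0f,0x1e,0xfa\t# endbr64"
        if $line =~ /^\s*endbranch\s*$/;
    print "$line\n";
}
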
diff --git a/crypto/modes/asm/ghashp8-ppc.pl b/crypto/modes/asm/ghashp8-ppc.pl
index 447472cb4592..20f611dabd43 100755
--- a/crypto/modes/asm/ghashp8-ppc.pl
+++ b/crypto/modes/asm/ghashp8-ppc.pl
@@ -1,7 +1,7 @@
 #! /usr/bin/env perl
 # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
 #
-# Licensed under the OpenSSL license (the "License").  You may not use
+# Licensed under the Apache License 2.0 (the "License").  You may not use
 # this file except in compliance with the License.  You can obtain a copy
 # in the file LICENSE in the source distribution or at
 # https://www.openssl.org/source/license.html
@@ -32,8 +32,10 @@
 # aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
 # POWER9 delivers 0.51 cpb.
 
-$flavour=shift;
-$output =shift;
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
 
 if ($flavour =~ /64/) {
 	$SIZE_T=8;
@@ -61,7 +63,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
 die "can't locate ppc-xlate.pl";
 
-open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
+open STDOUT,"| $^X $xlate $flavour \"$output\""
+    or die "can't call $xlate: $!";
 
 my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block
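
Besides quoting $output, the ghashp8-ppc.pl hunk above fixes a classic
precedence bug: "||" binds more tightly than the comma separating open()'s
arguments, so in the old line the die attached to the pipe string (which is
always true) and could never fire. The low-precedence "or" tests open()'s own
return value. A minimal sketch of the difference, using an ordinary file open
on a hypothetical missing file rather than the script's pipe-open:

use strict;
use warnings;

# Buggy: "||" applies to the string, which is always true, so this die
# is dead code and the failed open goes unnoticed.
open(my $in, '<', 'no-such-file.txt' || die "unreachable: $!");
print "buggy form carried on despite the failure\n";

# Fixed: "or" applies to the whole open() call and reports the error.
open(my $in2, '<', 'no-such-file.txt')
    or die "can't open no-such-file.txt: $!";
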
diff --git a/crypto/modes/asm/ghashv8-armx.pl b/crypto/modes/asm/ghashv8-armx.pl
index d0e398b50ae0..b3d94041729e 100644
--- a/crypto/modes/asm/ghashv8-armx.pl
+++ b/crypto/modes/asm/ghashv8-armx.pl
@@ -1,7 +1,7 @@
 #! /usr/bin/env perl
-# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2014-2023 The OpenSSL Project Authors. All Rights Reserved.
 #
-# Licensed under the OpenSSL license (the "License").  You may not use
+# Licensed under the Apache License 2.0 (the "License").  You may not use
 # this file except in compliance with the License.  You can obtain a copy
 # in the file LICENSE in the source distribution or at
 # https://www.openssl.org/source/license.html
@@ -42,18 +42,22 @@
 # Denver	0.51		0.65		6.02
 # Mongoose	0.65		1.10		8.06
 # Kryo		0.76		1.16		8.00
+# ThunderX2	1.05
 #
 # (*)	presented for reference/comparison purposes;
 
-$flavour = shift;
-$output = shift;
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
 
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
 die "can't locate arm-xlate.pl";
 
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+    or die "can't call $xlate: $!";
 *STDOUT=*OUT;
 
 $Xi="x0";	# argument block
@@ -66,18 +70,26 @@ $inc="x12";
 {
 my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
 my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
+my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
 
 $code=<<___;
 #include "arm_arch.h"
 
 #if __ARM_MAX_ARCH__>=7
-.text
 ___
-$code.=".arch	armv8-a+crypto\n" if ($flavour =~ /64/);
-$code.=<<___				if ($flavour !~ /64/);
+$code.=".arch	armv8-a+crypto\n.text\n"	if ($flavour =~ /64/);
+$code.=<<___					if ($flavour !~ /64/);
 .fpu	neon
-.code	32
-#undef	__thumb2__
+#ifdef __thumb2__
+.syntax	unified
+.thumb
+# define INST(a,b,c,d)	$_byte	c,0xef,a,b
+#else
+.code	32
+# define INST(a,b,c,d)	$_byte	a,b,c,0xf2
+#endif
+
+.text
 ___
 
 ################################################################################
@@ -732,6 +744,9 @@ if ($flavour =~ /64/) {			######## 64-bit code
 	s/\.[uisp]?64//o		and s/\.16b/\.2d/go;
 	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
 
+	# Switch preprocessor checks to aarch64 versions.
+	s/__ARME([BL])__/__AARCH64E$1__/go;
+
 	print $_,"\n";
     }
 } else {				######## 32-bit code
@@ -752,7 +767,7 @@ if ($flavour =~ /64/) {			######## 64-bit code
 	# since ARMv7 instructions are always encoded little-endian.
 	# correct solution is to use .inst directive, but older
 	# assemblers don't implement it:-(
-	sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+	sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
 			$word&0xff,($word>>8)&0xff,
 			($word>>16)&0xff,($word>>24)&0xff,
 			$mnemonic,$arg;
@@ -767,13 +782,17 @@ if ($flavour =~ /64/) {			######## 64-bit code
 	# fix up remaining new-style suffixes
 	s/\],#[0-9]+/]!/o;
 
-	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o			or
+	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2	$1,#0/o			or
 	s/vdup\.32\s+(.*)/unvdup32($1)/geo	or
 	s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo	or
 	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
 	s/^(\s+)b\./$1b/o		or
 	s/^(\s+)ret/$1bx\tlr/o;
 
+	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
+	    print "	it	$2\n";
+	}
+
 	print $_,"\n";
     }
 }
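
The INST() change is the subtle part of the 32-bit path: PMULL and friends
are hand-encoded as raw bytes because older assemblers lack them, and ARM and
Thumb-2 store the same 32-bit instruction word differently - one
little-endian word in ARM mode versus two little-endian halfwords in Thumb-2,
with different top bytes selecting the NEON data-processing space (0xf2 vs
0xef). A simplified standalone sketch of the two serializations, assuming the
macro's argument order (a = least significant byte) and using arbitrary
example bytes:

#! /usr/bin/env perl
use strict;
use warnings;

# Serialize the low three bytes of an instruction word the way the
# INST(a,b,c,d) macro above does in each mode.
sub inst_arm   { my ($a,$b,$c) = @_; sprintf(".byte 0x%02x,0x%02x,0x%02x,0xf2", $a, $b, $c) }
sub inst_thumb { my ($a,$b,$c) = @_; sprintf(".byte 0x%02x,0xef,0x%02x,0x%02x", $c, $a, $b) }

# Arbitrary example bytes - the point is the reordering, not the opcode.
print inst_arm(0x86, 0x0e, 0xa8),   "\n";   # .byte 0x86,0x0e,0xa8,0xf2
print inst_thumb(0x86, 0x0e, 0xa8), "\n";   # .byte 0xa8,0xef,0x86,0x0e

The related "it" insertion covers another ARM/Thumb difference: the
conditional mov produced from cclr must be preceded by an IT (if-then) prefix
for the code to assemble in Thumb-2 mode, which is why the converter now
prints "it <cond>" before each "mov<cond>".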