aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEnji Cooper <ngie@FreeBSD.org>2026-02-01 17:05:55 +0000
committerEnji Cooper <ngie@FreeBSD.org>2026-02-01 17:05:55 +0000
commite6c8997a8958c7aaec8e266d2eeefbfaa137e218 (patch)
tree568ad37498693f81a2c62c5ec1b53747e5a8c9e9
parenta97ed3a39c1044dd1b8056d68a76de74821f2bff (diff)
OpenSSL: commit sys/crypto changes for 3.5.5
These files were changed as part of the 3.5.4 -> 3.5.5 upgrade. Please see the upstream release notes linked in 1731fc70f7344af08db49b06c63c963fa12ee354, et al, for more details.

MFC after: 6 days
MFC with: 1731fc70f7344af08db49b06c63c963fa12ee354
Fixes: 1731fc70f7344af08d ("OpenSSL: update vendor sources to match 3.5.5 content")
-rw-r--r--sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S14
-rw-r--r--sys/crypto/openssl/arm_arch.h369
-rw-r--r--sys/crypto/openssl/powerpc/aes-gcm-ppc.S2118
-rw-r--r--sys/crypto/openssl/powerpc64/aes-gcm-ppc.S2119
-rw-r--r--sys/crypto/openssl/powerpc64le/aes-gcm-ppc.S2119
5 files changed, 2850 insertions, 3889 deletions
diff --git a/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S b/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S
index 5627d6d1c6b4..b8c728e68683 100644
--- a/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S
+++ b/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S
@@ -1,5 +1,5 @@
/* Do not modify. This file is auto-generated from vpsm4_ex-armv8.pl. */
-// Copyright 2022-2025 The OpenSSL Project Authors. All Rights Reserved.
+// Copyright 2022-2026 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
@@ -35,13 +35,25 @@ _vpsm4_ex_consts:
.Lshuffles:
.quad 0x0B0A090807060504,0x030201000F0E0D0C
.Lxts_magic:
+#ifndef __AARCH64EB__
.quad 0x0101010101010187,0x0101010101010101
+#else
+.quad 0x0101010101010101,0x0101010101010187
+#endif
.Lsbox_magic:
+#ifndef __AARCH64EB__
.quad 0x0b0e0104070a0d00,0x0306090c0f020508
.quad 0x62185a2042387a00,0x22581a6002783a40
.quad 0x15df62a89e54e923,0xc10bb67c4a803df7
.quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead
.quad 0x6404462679195b3b,0xe383c1a1fe9edcbc
+#else
+.quad 0x0306090c0f020508,0x0b0e0104070a0d00
+.quad 0x22581a6002783a40,0x62185a2042387a00
+.quad 0xc10bb67c4a803df7,0x15df62a89e54e923
+.quad 0x1407c6d56c7fbead,0xb9aa6b78c1d21300
+.quad 0xe383c1a1fe9edcbc,0x6404462679195b3b
+#endif
.quad 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f
.size _vpsm4_ex_consts,.-_vpsm4_ex_consts
diff --git a/sys/crypto/openssl/arm_arch.h b/sys/crypto/openssl/arm_arch.h
index acd8aee4d519..d570d1eba6c1 100644
--- a/sys/crypto/openssl/arm_arch.h
+++ b/sys/crypto/openssl/arm_arch.h
@@ -8,87 +8,80 @@
*/
#ifndef OSSL_CRYPTO_ARM_ARCH_H
-# define OSSL_CRYPTO_ARM_ARCH_H
-
-# if !defined(__ARM_ARCH__)
-# if defined(__CC_ARM)
-# define __ARM_ARCH__ __TARGET_ARCH_ARM
-# if defined(__BIG_ENDIAN)
-# define __ARMEB__
-# else
-# define __ARMEL__
-# endif
-# elif defined(__GNUC__)
-# if defined(__aarch64__)
-# define __ARM_ARCH__ 8
- /*
- * Why doesn't gcc define __ARM_ARCH__? Instead it defines
- * bunch of below macros. See all_architectures[] table in
- * gcc/config/arm/arm.c. On a side note it defines
- * __ARMEL__/__ARMEB__ for little-/big-endian.
- */
-# elif defined(__ARM_ARCH)
-# define __ARM_ARCH__ __ARM_ARCH
-# elif defined(__ARM_ARCH_8A__)
-# define __ARM_ARCH__ 8
-# elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \
- defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \
- defined(__ARM_ARCH_7EM__)
-# define __ARM_ARCH__ 7
-# elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
- defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__) || \
- defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__) || \
- defined(__ARM_ARCH_6T2__)
-# define __ARM_ARCH__ 6
-# elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \
- defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__) || \
- defined(__ARM_ARCH_5TEJ__)
-# define __ARM_ARCH__ 5
-# elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__)
-# define __ARM_ARCH__ 4
-# else
-# error "unsupported ARM architecture"
-# endif
-# elif defined(__ARM_ARCH)
-# define __ARM_ARCH__ __ARM_ARCH
-# endif
-# endif
-
-# if !defined(__ARM_MAX_ARCH__)
-# define __ARM_MAX_ARCH__ __ARM_ARCH__
-# endif
-
-# if __ARM_MAX_ARCH__<__ARM_ARCH__
-# error "__ARM_MAX_ARCH__ can't be less than __ARM_ARCH__"
-# elif __ARM_MAX_ARCH__!=__ARM_ARCH__
-# if __ARM_ARCH__<7 && __ARM_MAX_ARCH__>=7 && defined(__ARMEB__)
-# error "can't build universal big-endian binary"
-# endif
-# endif
-
-# ifndef __ASSEMBLER__
+#define OSSL_CRYPTO_ARM_ARCH_H
+
+#if !defined(__ARM_ARCH__)
+#if defined(__CC_ARM)
+#define __ARM_ARCH__ __TARGET_ARCH_ARM
+#if defined(__BIG_ENDIAN)
+#define __ARMEB__
+#else
+#define __ARMEL__
+#endif
+#elif defined(__GNUC__)
+#if defined(__aarch64__)
+#define __ARM_ARCH__ 8
+/*
+ * Why doesn't gcc define __ARM_ARCH__? Instead it defines
+ * bunch of below macros. See all_architectures[] table in
+ * gcc/config/arm/arm.c. On a side note it defines
+ * __ARMEL__/__ARMEB__ for little-/big-endian.
+ */
+#elif defined(__ARM_ARCH)
+#define __ARM_ARCH__ __ARM_ARCH
+#elif defined(__ARM_ARCH_8A__)
+#define __ARM_ARCH__ 8
+#elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__)
+#define __ARM_ARCH__ 7
+#elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__)
+#define __ARM_ARCH__ 6
+#elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__)
+#define __ARM_ARCH__ 5
+#elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__)
+#define __ARM_ARCH__ 4
+#else
+#error "unsupported ARM architecture"
+#endif
+#elif defined(__ARM_ARCH)
+#define __ARM_ARCH__ __ARM_ARCH
+#endif
+#endif
+
+#if !defined(__ARM_MAX_ARCH__)
+#define __ARM_MAX_ARCH__ __ARM_ARCH__
+#endif
+
+#if __ARM_MAX_ARCH__ < __ARM_ARCH__
+#error "__ARM_MAX_ARCH__ can't be less than __ARM_ARCH__"
+#elif __ARM_MAX_ARCH__ != __ARM_ARCH__
+#if __ARM_ARCH__ < 7 && __ARM_MAX_ARCH__ >= 7 && defined(__ARMEB__)
+#error "can't build universal big-endian binary"
+#endif
+#endif
+
+#ifndef __ASSEMBLER__
extern unsigned int OPENSSL_armcap_P;
extern unsigned int OPENSSL_arm_midr;
extern unsigned int OPENSSL_armv8_rsa_neonized;
-# endif
-
-# define ARMV7_NEON (1<<0)
-# define ARMV7_TICK (1<<1)
-# define ARMV8_AES (1<<2)
-# define ARMV8_SHA1 (1<<3)
-# define ARMV8_SHA256 (1<<4)
-# define ARMV8_PMULL (1<<5)
-# define ARMV8_SHA512 (1<<6)
-# define ARMV8_CPUID (1<<7)
-# define ARMV8_RNG (1<<8)
-# define ARMV8_SM3 (1<<9)
-# define ARMV8_SM4 (1<<10)
-# define ARMV8_SHA3 (1<<11)
-# define ARMV8_UNROLL8_EOR3 (1<<12)
-# define ARMV8_SVE (1<<13)
-# define ARMV8_SVE2 (1<<14)
-# define ARMV8_HAVE_SHA3_AND_WORTH_USING (1<<15)
-# define ARMV8_UNROLL12_EOR3 (1<<16)
+#endif
+
+#define ARMV7_NEON (1 << 0)
+#define ARMV7_TICK (1 << 1)
+#define ARMV8_AES (1 << 2)
+#define ARMV8_SHA1 (1 << 3)
+#define ARMV8_SHA256 (1 << 4)
+#define ARMV8_PMULL (1 << 5)
+#define ARMV8_SHA512 (1 << 6)
+#define ARMV8_CPUID (1 << 7)
+#define ARMV8_RNG (1 << 8)
+#define ARMV8_SM3 (1 << 9)
+#define ARMV8_SM4 (1 << 10)
+#define ARMV8_SHA3 (1 << 11)
+#define ARMV8_UNROLL8_EOR3 (1 << 12)
+#define ARMV8_SVE (1 << 13)
+#define ARMV8_SVE2 (1 << 14)
+#define ARMV8_HAVE_SHA3_AND_WORTH_USING (1 << 15)
+#define ARMV8_UNROLL12_EOR3 (1 << 16)
/*
* MIDR_EL1 system register
@@ -100,120 +93,116 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
*
*/
-# define ARM_CPU_IMP_ARM 0x41
-# define HISI_CPU_IMP 0x48
-# define ARM_CPU_IMP_APPLE 0x61
-# define ARM_CPU_IMP_MICROSOFT 0x6D
-# define ARM_CPU_IMP_AMPERE 0xC0
-
-# define ARM_CPU_PART_CORTEX_A72 0xD08
-# define ARM_CPU_PART_N1 0xD0C
-# define ARM_CPU_PART_V1 0xD40
-# define ARM_CPU_PART_N2 0xD49
-# define HISI_CPU_PART_KP920 0xD01
-# define ARM_CPU_PART_V2 0xD4F
-
-# define APPLE_CPU_PART_M1_ICESTORM 0x022
-# define APPLE_CPU_PART_M1_FIRESTORM 0x023
-# define APPLE_CPU_PART_M1_ICESTORM_PRO 0x024
-# define APPLE_CPU_PART_M1_FIRESTORM_PRO 0x025
-# define APPLE_CPU_PART_M1_ICESTORM_MAX 0x028
-# define APPLE_CPU_PART_M1_FIRESTORM_MAX 0x029
-# define APPLE_CPU_PART_M2_BLIZZARD 0x032
-# define APPLE_CPU_PART_M2_AVALANCHE 0x033
-# define APPLE_CPU_PART_M2_BLIZZARD_PRO 0x034
-# define APPLE_CPU_PART_M2_AVALANCHE_PRO 0x035
-# define APPLE_CPU_PART_M2_BLIZZARD_MAX 0x038
-# define APPLE_CPU_PART_M2_AVALANCHE_MAX 0x039
-
-# define MICROSOFT_CPU_PART_COBALT_100 0xD49
-
-# define MIDR_PARTNUM_SHIFT 4
-# define MIDR_PARTNUM_MASK (0xfffU << MIDR_PARTNUM_SHIFT)
-# define MIDR_PARTNUM(midr) \
- (((midr) & MIDR_PARTNUM_MASK) >> MIDR_PARTNUM_SHIFT)
-
-# define MIDR_IMPLEMENTER_SHIFT 24
-# define MIDR_IMPLEMENTER_MASK (0xffU << MIDR_IMPLEMENTER_SHIFT)
-# define MIDR_IMPLEMENTER(midr) \
- (((midr) & MIDR_IMPLEMENTER_MASK) >> MIDR_IMPLEMENTER_SHIFT)
-
-# define MIDR_ARCHITECTURE_SHIFT 16
-# define MIDR_ARCHITECTURE_MASK (0xfU << MIDR_ARCHITECTURE_SHIFT)
-# define MIDR_ARCHITECTURE(midr) \
- (((midr) & MIDR_ARCHITECTURE_MASK) >> MIDR_ARCHITECTURE_SHIFT)
-
-# define MIDR_CPU_MODEL_MASK \
- (MIDR_IMPLEMENTER_MASK | \
- MIDR_PARTNUM_MASK | \
- MIDR_ARCHITECTURE_MASK)
-
-# define MIDR_CPU_MODEL(imp, partnum) \
- (((imp) << MIDR_IMPLEMENTER_SHIFT) | \
- (0xfU << MIDR_ARCHITECTURE_SHIFT) | \
- ((partnum) << MIDR_PARTNUM_SHIFT))
-
-# define MIDR_IS_CPU_MODEL(midr, imp, partnum) \
- (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum))
+#define ARM_CPU_IMP_ARM 0x41
+#define HISI_CPU_IMP 0x48
+#define ARM_CPU_IMP_APPLE 0x61
+#define ARM_CPU_IMP_MICROSOFT 0x6D
+#define ARM_CPU_IMP_AMPERE 0xC0
+
+#define ARM_CPU_PART_CORTEX_A72 0xD08
+#define ARM_CPU_PART_N1 0xD0C
+#define ARM_CPU_PART_V1 0xD40
+#define ARM_CPU_PART_N2 0xD49
+#define HISI_CPU_PART_KP920 0xD01
+#define ARM_CPU_PART_V2 0xD4F
+
+#define APPLE_CPU_PART_M1_ICESTORM 0x022
+#define APPLE_CPU_PART_M1_FIRESTORM 0x023
+#define APPLE_CPU_PART_M1_ICESTORM_PRO 0x024
+#define APPLE_CPU_PART_M1_FIRESTORM_PRO 0x025
+#define APPLE_CPU_PART_M1_ICESTORM_MAX 0x028
+#define APPLE_CPU_PART_M1_FIRESTORM_MAX 0x029
+#define APPLE_CPU_PART_M2_BLIZZARD 0x032
+#define APPLE_CPU_PART_M2_AVALANCHE 0x033
+#define APPLE_CPU_PART_M2_BLIZZARD_PRO 0x034
+#define APPLE_CPU_PART_M2_AVALANCHE_PRO 0x035
+#define APPLE_CPU_PART_M2_BLIZZARD_MAX 0x038
+#define APPLE_CPU_PART_M2_AVALANCHE_MAX 0x039
+
+#define MICROSOFT_CPU_PART_COBALT_100 0xD49
+
+#define MIDR_PARTNUM_SHIFT 4
+#define MIDR_PARTNUM_MASK (0xfffU << MIDR_PARTNUM_SHIFT)
+#define MIDR_PARTNUM(midr) \
+ (((midr) & MIDR_PARTNUM_MASK) >> MIDR_PARTNUM_SHIFT)
+
+#define MIDR_IMPLEMENTER_SHIFT 24
+#define MIDR_IMPLEMENTER_MASK (0xffU << MIDR_IMPLEMENTER_SHIFT)
+#define MIDR_IMPLEMENTER(midr) \
+ (((midr) & MIDR_IMPLEMENTER_MASK) >> MIDR_IMPLEMENTER_SHIFT)
+
+#define MIDR_ARCHITECTURE_SHIFT 16
+#define MIDR_ARCHITECTURE_MASK (0xfU << MIDR_ARCHITECTURE_SHIFT)
+#define MIDR_ARCHITECTURE(midr) \
+ (((midr) & MIDR_ARCHITECTURE_MASK) >> MIDR_ARCHITECTURE_SHIFT)
+
+#define MIDR_CPU_MODEL_MASK \
+ (MIDR_IMPLEMENTER_MASK | MIDR_PARTNUM_MASK | MIDR_ARCHITECTURE_MASK)
+
+#define MIDR_CPU_MODEL(imp, partnum) \
+ (((imp) << MIDR_IMPLEMENTER_SHIFT) | (0xfU << MIDR_ARCHITECTURE_SHIFT) | ((partnum) << MIDR_PARTNUM_SHIFT))
+
+#define MIDR_IS_CPU_MODEL(midr, imp, partnum) \
+ (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum))
#if defined(__ASSEMBLER__)
- /*
- * Support macros for
- * - Armv8.3-A Pointer Authentication and
- * - Armv8.5-A Branch Target Identification
- * features which require emitting a .note.gnu.property section with the
- * appropriate architecture-dependent feature bits set.
- * Read more: "ELF for the Arm® 64-bit Architecture"
- */
-
-# if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1
-# define GNU_PROPERTY_AARCH64_BTI (1 << 0) /* Has Branch Target Identification */
-# define AARCH64_VALID_CALL_TARGET hint #34 /* BTI 'c' */
-# else
-# define GNU_PROPERTY_AARCH64_BTI 0 /* No Branch Target Identification */
-# define AARCH64_VALID_CALL_TARGET
-# endif
-
-# if defined(__ARM_FEATURE_PAC_DEFAULT) && \
- (__ARM_FEATURE_PAC_DEFAULT & 1) == 1 /* Signed with A-key */
-# define GNU_PROPERTY_AARCH64_POINTER_AUTH \
- (1 << 1) /* Has Pointer Authentication */
-# define AARCH64_SIGN_LINK_REGISTER hint #25 /* PACIASP */
-# define AARCH64_VALIDATE_LINK_REGISTER hint #29 /* AUTIASP */
-# elif defined(__ARM_FEATURE_PAC_DEFAULT) && \
- (__ARM_FEATURE_PAC_DEFAULT & 2) == 2 /* Signed with B-key */
-# define GNU_PROPERTY_AARCH64_POINTER_AUTH \
- (1 << 1) /* Has Pointer Authentication */
-# define AARCH64_SIGN_LINK_REGISTER hint #27 /* PACIBSP */
-# define AARCH64_VALIDATE_LINK_REGISTER hint #31 /* AUTIBSP */
-# else
-# define GNU_PROPERTY_AARCH64_POINTER_AUTH 0 /* No Pointer Authentication */
-# if GNU_PROPERTY_AARCH64_BTI != 0
-# define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET
-# else
-# define AARCH64_SIGN_LINK_REGISTER
-# endif
-# define AARCH64_VALIDATE_LINK_REGISTER
-# endif
-
-# if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0
- .pushsection .note.gnu.property, "a";
- .balign 8;
- .long 4;
- .long 0x10;
- .long 0x5;
- .asciz "GNU";
- .long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
- .long 4;
- .long (GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI);
- .long 0;
- .popsection;
-# endif
-
-# endif /* defined __ASSEMBLER__ */
-
-# define IS_CPU_SUPPORT_UNROLL8_EOR3() \
- (OPENSSL_armcap_P & ARMV8_UNROLL8_EOR3)
+/*
+ * Support macros for
+ * - Armv8.3-A Pointer Authentication and
+ * - Armv8.5-A Branch Target Identification
+ * features which require emitting a .note.gnu.property section with the
+ * appropriate architecture-dependent feature bits set.
+ * Read more: "ELF for the Arm® 64-bit Architecture"
+ */
+
+#if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1
+#define GNU_PROPERTY_AARCH64_BTI (1 << 0) /* Has Branch Target Identification */
+#define AARCH64_VALID_CALL_TARGET hint #34 /* BTI 'c' */
+#else
+#define GNU_PROPERTY_AARCH64_BTI 0 /* No Branch Target Identification */
+#define AARCH64_VALID_CALL_TARGET
+#endif
+
+#if defined(__ARM_FEATURE_PAC_DEFAULT) && (__ARM_FEATURE_PAC_DEFAULT & 1) == 1 /* Signed with A-key */
+#define GNU_PROPERTY_AARCH64_POINTER_AUTH \
+ (1 << 1) /* Has Pointer Authentication */
+#define AARCH64_SIGN_LINK_REGISTER hint #25 /* PACIASP */
+#define AARCH64_VALIDATE_LINK_REGISTER hint #29 /* AUTIASP */
+#elif defined(__ARM_FEATURE_PAC_DEFAULT) && (__ARM_FEATURE_PAC_DEFAULT & 2) == 2 /* Signed with B-key */
+#define GNU_PROPERTY_AARCH64_POINTER_AUTH \
+ (1 << 1) /* Has Pointer Authentication */
+#define AARCH64_SIGN_LINK_REGISTER hint #27 /* PACIBSP */
+#define AARCH64_VALIDATE_LINK_REGISTER hint #31 /* AUTIBSP */
+#else
+#define GNU_PROPERTY_AARCH64_POINTER_AUTH 0 /* No Pointer Authentication */
+#if GNU_PROPERTY_AARCH64_BTI != 0
+#define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET
+#else
+#define AARCH64_SIGN_LINK_REGISTER
+#endif
+#define AARCH64_VALIDATE_LINK_REGISTER
+#endif
+
+#if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0
+/* clang-format off */
+.pushsection .note.gnu.property, "a";
+/* clang-format on */
+.balign 8;
+.long 4;
+.long 0x10;
+.long 0x5;
+.asciz "GNU";
+.long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
+.long 4;
+.long(GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI);
+.long 0;
+.popsection;
+#endif
+
+#endif /* defined __ASSEMBLER__ */
+
+#define IS_CPU_SUPPORT_UNROLL8_EOR3() \
+ (OPENSSL_armcap_P & ARMV8_UNROLL8_EOR3)
#endif
diff --git a/sys/crypto/openssl/powerpc/aes-gcm-ppc.S b/sys/crypto/openssl/powerpc/aes-gcm-ppc.S
index 23a8feb24745..51cfac7e45fc 100644
--- a/sys/crypto/openssl/powerpc/aes-gcm-ppc.S
+++ b/sys/crypto/openssl/powerpc/aes-gcm-ppc.S
@@ -1,531 +1,587 @@
/* Do not modify. This file is auto-generated from aes-gcm-ppc.pl. */
-.machine "any"
+.machine "any"
.text
-
-
-
-
-.macro .Loop_aes_middle4x
- xxlor 19+32, 1, 1
- xxlor 20+32, 2, 2
- xxlor 21+32, 3, 3
- xxlor 22+32, 4, 4
-
- .long 0x11EF9D08
- .long 0x12109D08
- .long 0x12319D08
- .long 0x12529D08
-
- .long 0x11EFA508
- .long 0x1210A508
- .long 0x1231A508
- .long 0x1252A508
-
- .long 0x11EFAD08
- .long 0x1210AD08
- .long 0x1231AD08
- .long 0x1252AD08
-
- .long 0x11EFB508
- .long 0x1210B508
- .long 0x1231B508
- .long 0x1252B508
-
- xxlor 19+32, 5, 5
- xxlor 20+32, 6, 6
- xxlor 21+32, 7, 7
- xxlor 22+32, 8, 8
-
- .long 0x11EF9D08
- .long 0x12109D08
- .long 0x12319D08
- .long 0x12529D08
-
- .long 0x11EFA508
- .long 0x1210A508
- .long 0x1231A508
- .long 0x1252A508
-
- .long 0x11EFAD08
- .long 0x1210AD08
- .long 0x1231AD08
- .long 0x1252AD08
-
- .long 0x11EFB508
- .long 0x1210B508
- .long 0x1231B508
- .long 0x1252B508
-
- xxlor 23+32, 9, 9
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
-.endm
-
-
-
-
-
-.macro .Loop_aes_middle8x
- xxlor 23+32, 1, 1
- xxlor 24+32, 2, 2
- xxlor 25+32, 3, 3
- xxlor 26+32, 4, 4
-
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-
- .long 0x11EFC508
- .long 0x1210C508
- .long 0x1231C508
- .long 0x1252C508
- .long 0x1273C508
- .long 0x1294C508
- .long 0x12B5C508
- .long 0x12D6C508
-
- .long 0x11EFCD08
- .long 0x1210CD08
- .long 0x1231CD08
- .long 0x1252CD08
- .long 0x1273CD08
- .long 0x1294CD08
- .long 0x12B5CD08
- .long 0x12D6CD08
-
- .long 0x11EFD508
- .long 0x1210D508
- .long 0x1231D508
- .long 0x1252D508
- .long 0x1273D508
- .long 0x1294D508
- .long 0x12B5D508
- .long 0x12D6D508
-
- xxlor 23+32, 5, 5
- xxlor 24+32, 6, 6
- xxlor 25+32, 7, 7
- xxlor 26+32, 8, 8
-
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-
- .long 0x11EFC508
- .long 0x1210C508
- .long 0x1231C508
- .long 0x1252C508
- .long 0x1273C508
- .long 0x1294C508
- .long 0x12B5C508
- .long 0x12D6C508
-
- .long 0x11EFCD08
- .long 0x1210CD08
- .long 0x1231CD08
- .long 0x1252CD08
- .long 0x1273CD08
- .long 0x1294CD08
- .long 0x12B5CD08
- .long 0x12D6CD08
-
- .long 0x11EFD508
- .long 0x1210D508
- .long 0x1231D508
- .long 0x1252D508
- .long 0x1273D508
- .long 0x1294D508
- .long 0x12B5D508
- .long 0x12D6D508
-
- xxlor 23+32, 9, 9
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-.endm
-
-
-
-
-ppc_aes_gcm_ghash:
- vxor 15, 15, 0
-
- xxlxor 29, 29, 29
-
- .long 0x12EC7CC8
- .long 0x130984C8
- .long 0x13268CC8
- .long 0x134394C8
-
- vxor 23, 23, 24
- vxor 23, 23, 25
- vxor 23, 23, 26
-
- .long 0x130D7CC8
- .long 0x132A84C8
- .long 0x13478CC8
- .long 0x136494C8
-
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27
-
-
- .long 0x139714C8
-
- xxlor 29+32, 29, 29
- vsldoi 26, 24, 29, 8
- vsldoi 29, 29, 24, 8
- vxor 23, 23, 26
-
- vsldoi 23, 23, 23, 8
- vxor 23, 23, 28
-
- .long 0x130E7CC8
- .long 0x132B84C8
- .long 0x13488CC8
- .long 0x136594C8
-
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27
-
- vxor 24, 24, 29
-
-
- vsldoi 27, 23, 23, 8
- .long 0x12F714C8
- vxor 27, 27, 24
- vxor 23, 23, 27
-
- xxlor 32, 23+32, 23+32
-
- blr
-
-
-
-
-
-.macro ppc_aes_gcm_ghash2_4x
-
- vxor 15, 15, 0
-
- xxlxor 29, 29, 29
-
- .long 0x12EC7CC8
- .long 0x130984C8
- .long 0x13268CC8
- .long 0x134394C8
+.macro SAVE_REGS
+ mflr 0
+ std 0, 16(1)
+ stdu 1,-512(1)
+
+ std 14, 112(1)
+ std 15, 120(1)
+ std 16, 128(1)
+ std 17, 136(1)
+ std 18, 144(1)
+ std 19, 152(1)
+ std 20, 160(1)
+ std 21, 168(1)
+ std 22, 176(1)
+ std 23, 184(1)
+ std 24, 192(1)
+
+ stxv 32+20, 256(1)
+ stxv 32+21, 256+16(1)
+ stxv 32+22, 256+32(1)
+ stxv 32+23, 256+48(1)
+ stxv 32+24, 256+64(1)
+ stxv 32+25, 256+80(1)
+ stxv 32+26, 256+96(1)
+ stxv 32+27, 256+112(1)
+ stxv 32+28, 256+128(1)
+ stxv 32+29, 256+144(1)
+ stxv 32+30, 256+160(1)
+ stxv 32+31, 256+176(1)
+.endm # SAVE_REGS
+
+.macro RESTORE_REGS
+ lxv 32+20, 256(1)
+ lxv 32+21, 256+16(1)
+ lxv 32+22, 256+32(1)
+ lxv 32+23, 256+48(1)
+ lxv 32+24, 256+64(1)
+ lxv 32+25, 256+80(1)
+ lxv 32+26, 256+96(1)
+ lxv 32+27, 256+112(1)
+ lxv 32+28, 256+128(1)
+ lxv 32+29, 256+144(1)
+ lxv 32+30, 256+160(1)
+ lxv 32+31, 256+176(1)
+
+ ld 14, 112(1)
+ ld 15, 120(1)
+ ld 16, 128(1)
+ ld 17, 136(1)
+ ld 18, 144(1)
+ ld 19, 152(1)
+ ld 20, 160(1)
+ ld 21, 168(1)
+ ld 22, 176(1)
+ ld 23, 184(1)
+ ld 24, 192(1)
+
+ addi 1, 1, 512
+ ld 0, 16(1)
+ mtlr 0
+.endm # RESTORE_REGS
+
+# 4x loops
+.macro AES_CIPHER_4x r
+ vcipher 15, 15, \r
+ vcipher 16, 16, \r
+ vcipher 17, 17, \r
+ vcipher 18, 18, \r
+.endm
+
+# 8x loops
+.macro AES_CIPHER_8x r
+ vcipher 15, 15, \r
+ vcipher 16, 16, \r
+ vcipher 17, 17, \r
+ vcipher 18, 18, \r
+ vcipher 19, 19, \r
+ vcipher 20, 20, \r
+ vcipher 21, 21, \r
+ vcipher 22, 22, \r
+.endm
+
+.macro LOOP_8AES_STATE
+ AES_CIPHER_8x 23
+ AES_CIPHER_8x 24
+ AES_CIPHER_8x 25
+ AES_CIPHER_8x 26
+ AES_CIPHER_8x 27
+ AES_CIPHER_8x 28
+ AES_CIPHER_8x 29
+ AES_CIPHER_8x 1
+.endm
+
+#
+# PPC_GFMUL128_8x: Compute hash values of 8 blocks based on Karatsuba method.
+#
+# S1 should xor with the previous digest
+#
+# Xi = v0
+# H Poly = v2
+# Hash keys = v3 - v14
+# vs10: vpermxor vector
+# Scratch: v1, v23 - v29
+#
+.macro PPC_GFMUL128_8x
+
+ vpmsumd 23, 12, 15 # H4.L * X.L
+ vpmsumd 24, 9, 16
+ vpmsumd 25, 6, 17
+ vpmsumd 26, 3, 18
vxor 23, 23, 24
vxor 23, 23, 25
- vxor 23, 23, 26
-
- .long 0x130D7CC8
- .long 0x132A84C8
- .long 0x13478CC8
- .long 0x136494C8
-
- vxor 24, 24, 25
- vxor 24, 24, 26
-
-
- .long 0x139714C8
-
- xxlor 29+32, 29, 29
-
- vxor 24, 24, 27
- vsldoi 26, 24, 29, 8
- vsldoi 29, 29, 24, 8
- vxor 23, 23, 26
-
- vsldoi 23, 23, 23, 8
- vxor 23, 23, 28
+ vxor 23, 23, 26 # L
- .long 0x130E7CC8
- .long 0x132B84C8
- .long 0x13488CC8
- .long 0x136594C8
+ vpmsumd 27, 13, 15 # H4.L * X.H + H4.H * X.L
+ vpmsumd 28, 10, 16 # H3.L * X1.H + H3.H * X1.L
+ vpmsumd 25, 7, 17
+ vpmsumd 26, 4, 18
+ vxor 24, 27, 28
vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27
-
- vxor 24, 24, 29
-
-
- vsldoi 27, 23, 23, 8
- .long 0x12F714C8
- vxor 27, 27, 24
- vxor 27, 23, 27
-
-
- .long 0x1309A4C8
- .long 0x1326ACC8
- .long 0x1343B4C8
- vxor 19, 19, 27
- .long 0x12EC9CC8
+ vxor 24, 24, 26 # M
+
+ vpmsumd 26, 14, 15 # H4.H * X.H
+ vpmsumd 27, 11, 16
+ vpmsumd 28, 8, 17
+ vpmsumd 29, 5, 18
+
+ vxor 26, 26, 27
+ vxor 26, 26, 28
+ vxor 26, 26, 29
+
+ # sum hash and reduction with H Poly
+ vpmsumd 28, 23, 2 # reduction
+
+ vxor 1, 1, 1
+ vsldoi 25, 24, 1, 8 # mL
+ vsldoi 1, 1, 24, 8 # mH
+ vxor 23, 23, 25 # mL + L
+
+ # This performs swap and xor like,
+ # vsldoi 23, 23, 23, 8 # swap
+ # vxor 23, 23, 28
+ xxlor 32+29, 10, 10
+ vpermxor 23, 23, 28, 29
+
+ vxor 24, 26, 1 # H
+
+ # sum hash and reduction with H Poly
+ #
+ # vsldoi 25, 23, 23, 8 # swap
+ # vpmsumd 23, 23, 2
+ # vxor 27, 25, 24
+ #
+ vpermxor 27, 23, 24, 29
+ vpmsumd 23, 23, 2
+ vxor 0, 23, 27 # Digest of 4 blocks
+
+ vxor 19, 19, 0
+
+ # Compute digest for the next 4 blocks
+ vpmsumd 24, 9, 20
+ vpmsumd 25, 6, 21
+ vpmsumd 26, 3, 22
+ vpmsumd 23, 12, 19 # H4.L * X.L
vxor 23, 23, 24
vxor 23, 23, 25
- vxor 23, 23, 26
+ vxor 23, 23, 26 # L
- .long 0x130D9CC8
- .long 0x132AA4C8
- .long 0x1347ACC8
- .long 0x1364B4C8
+ vpmsumd 27, 13, 19 # H4.L * X.H + H4.H * X.L
+ vpmsumd 28, 10, 20 # H3.L * X1.H + H3.H * X1.L
+ vpmsumd 25, 7, 21
+ vpmsumd 26, 4, 22
+ vxor 24, 27, 28
vxor 24, 24, 25
- vxor 24, 24, 26
-
-
- .long 0x139714C8
-
- xxlor 29+32, 29, 29
-
- vxor 24, 24, 27
- vsldoi 26, 24, 29, 8
- vsldoi 29, 29, 24, 8
- vxor 23, 23, 26
-
- vsldoi 23, 23, 23, 8
- vxor 23, 23, 28
-
- .long 0x130E9CC8
- .long 0x132BA4C8
- .long 0x1348ACC8
- .long 0x1365B4C8
-
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27
-
- vxor 24, 24, 29
-
-
- vsldoi 27, 23, 23, 8
- .long 0x12F714C8
- vxor 27, 27, 24
- vxor 23, 23, 27
-
- xxlor 32, 23+32, 23+32
-
-.endm
-
-
-
-
-.macro ppc_update_hash_1x
- vxor 28, 28, 0
-
- vxor 19, 19, 19
-
- .long 0x12C3E4C8
- .long 0x12E4E4C8
- .long 0x1305E4C8
-
- .long 0x137614C8
-
- vsldoi 25, 23, 19, 8
- vsldoi 26, 19, 23, 8
- vxor 22, 22, 25
- vxor 24, 24, 26
-
- vsldoi 22, 22, 22, 8
- vxor 22, 22, 27
-
- vsldoi 20, 22, 22, 8
- .long 0x12D614C8
- vxor 20, 20, 24
- vxor 22, 22, 20
-
- vor 0,22,22
-
-.endm
-
-
-
-
-
-
-
-
-
-
-
-
-
-.global ppc_aes_gcm_encrypt
-.align 5
-ppc_aes_gcm_encrypt:
-_ppc_aes_gcm_encrypt:
-
- stdu 1,-512(1)
- mflr 0
-
- std 14,112(1)
- std 15,120(1)
- std 16,128(1)
- std 17,136(1)
- std 18,144(1)
- std 19,152(1)
- std 20,160(1)
- std 21,168(1)
- li 9, 256
- stvx 20, 9, 1
- addi 9, 9, 16
- stvx 21, 9, 1
- addi 9, 9, 16
- stvx 22, 9, 1
- addi 9, 9, 16
- stvx 23, 9, 1
- addi 9, 9, 16
- stvx 24, 9, 1
- addi 9, 9, 16
- stvx 25, 9, 1
- addi 9, 9, 16
- stvx 26, 9, 1
- addi 9, 9, 16
- stvx 27, 9, 1
- addi 9, 9, 16
- stvx 28, 9, 1
- addi 9, 9, 16
- stvx 29, 9, 1
- addi 9, 9, 16
- stvx 30, 9, 1
- addi 9, 9, 16
- stvx 31, 9, 1
- std 0, 528(1)
-
-
- lxvb16x 32, 0, 8
-
+ vxor 24, 24, 26 # M
+
+ vpmsumd 26, 14, 19 # H4.H * X.H
+ vpmsumd 27, 11, 20
+ vpmsumd 28, 8, 21
+ vpmsumd 29, 5, 22
+
+ vxor 26, 26, 27
+ vxor 26, 26, 28
+ vxor 26, 26, 29
+
+ # sum hash and reduction with H Poly
+ vpmsumd 28, 23, 2 # reduction
+
+ vxor 1, 1, 1
+ vsldoi 25, 24, 1, 8 # mL
+ vsldoi 1, 1, 24, 8 # mH
+ vxor 23, 23, 25 # mL + L
+
+ # This performs swap and xor like,
+ # vsldoi 23, 23, 23, 8 # swap
+ # vxor 23, 23, 28
+ xxlor 32+29, 10, 10
+ vpermxor 23, 23, 28, 29
+
+ vxor 24, 26, 1 # H
+
+ # sum hash and reduction with H Poly
+ #
+ # vsldoi 25, 23, 23, 8 # swap
+ # vpmsumd 23, 23, 2
+ # vxor 27, 25, 24
+ #
+ vpermxor 27, 23, 24, 29
+ vpmsumd 23, 23, 2
+ vxor 0, 23, 27 # Digest of 8 blocks
+.endm
+
+#
+# Compute update single ghash
+# vs10: vpermxor vector
+# scratch: v1, v22..v27
+#
+.macro PPC_GHASH1x H S1
+
+ vxor 1, 1, 1
+
+ vpmsumd 22, 3, \S1 # L
+ vpmsumd 23, 4, \S1 # M
+ vpmsumd 24, 5, \S1 # H
+
+ vpmsumd 27, 22, 2 # reduction
+
+ vsldoi 25, 23, 1, 8 # mL
+ vsldoi 26, 1, 23, 8 # mH
+ vxor 22, 22, 25 # LL + LL
+ vxor 24, 24, 26 # HH + HH
+
+ xxlor 32+25, 10, 10
+ vpermxor 22, 22, 27, 25
+
+ # vsldoi 23, 22, 22, 8 # swap
+ # vpmsumd 22, 22, 2 # reduction
+ # vxor 23, 23, 24
+ vpermxor 23, 22, 24, 25
+ vpmsumd 22, 22, 2 # reduction
+
+ vxor \H, 22, 23
+.endm
+
+#
+# LOAD_HASH_TABLE
+# Xi = v0
+# H Poly = v2
+# Hash keys = v3 - v14
+#
+.macro LOAD_HASH_TABLE
+ # Load Xi
+ lxvb16x 32, 0, 8 # load Xi
+
+ vxor 1, 1, 1
li 10, 32
- lxvd2x 2+32, 10, 8
- li 10, 48
- lxvd2x 3+32, 10, 8
- li 10, 64
- lxvd2x 4+32, 10, 8
- li 10, 80
- lxvd2x 5+32, 10, 8
+ lxvd2x 2+32, 10, 8 # H Poly
- li 10, 96
- lxvd2x 6+32, 10, 8
+ # load Hash - h^4, h^3, h^2, h
+ li 10, 64
+ lxvd2x 4+32, 10, 8 # H
+ vsldoi 3, 1, 4, 8 # l
+ vsldoi 5, 4, 1, 8 # h
li 10, 112
- lxvd2x 7+32, 10, 8
- li 10, 128
- lxvd2x 8+32, 10, 8
-
- li 10, 144
- lxvd2x 9+32, 10, 8
+ lxvd2x 7+32, 10, 8 # H^2
+ vsldoi 6, 1, 7, 8 # l
+ vsldoi 8, 7, 1, 8 # h
li 10, 160
- lxvd2x 10+32, 10, 8
- li 10, 176
- lxvd2x 11+32, 10, 8
-
- li 10, 192
- lxvd2x 12+32, 10, 8
+ lxvd2x 10+32, 10, 8 # H^3
+ vsldoi 9, 1, 10, 8 # l
+ vsldoi 11, 10, 1, 8 # h
li 10, 208
- lxvd2x 13+32, 10, 8
- li 10, 224
- lxvd2x 14+32, 10, 8
-
-
- lxvb16x 30+32, 0, 7
-
- mr 12, 5
- li 11, 0
-
-
- vxor 31, 31, 31
- vspltisb 22,1
- vsldoi 31, 31, 22,1
-
-
- lxv 0, 0(6)
- lxv 1, 0x10(6)
- lxv 2, 0x20(6)
- lxv 3, 0x30(6)
- lxv 4, 0x40(6)
- lxv 5, 0x50(6)
- lxv 6, 0x60(6)
- lxv 7, 0x70(6)
- lxv 8, 0x80(6)
- lxv 9, 0x90(6)
- lxv 10, 0xa0(6)
-
-
- lwz 9,240(6)
+ lxvd2x 13+32, 10, 8 # H^4
+ vsldoi 12, 1, 13, 8 # l
+ vsldoi 14, 13, 1, 8 # h
+.endm
+
+.macro PROCESS_8X_AES_STATES
+ vcipherlast 15, 15, 1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
+
+ lxvb16x 32+23, 0, 14 # load block
+ lxvb16x 32+24, 15, 14 # load block
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block
+ addi 14, 14, 128
+ vxor 15, 15, 23
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
+
+ stxvb16x 47, 0, 9 # store output
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
+ addi 9, 9, 128
+.endm
+
+.macro COMPUTE_STATES
+ xxlor 32+15, 9, 9 # last state
+ vadduwm 15, 15, 31 # state + counter
+ vadduwm 16, 15, 31
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
+
+ xxlxor 32+15, 32+15, 0 # IV + round key - add round key 0
+ xxlxor 32+16, 32+16, 0
+ xxlxor 32+17, 32+17, 0
+ xxlxor 32+18, 32+18, 0
+ xxlxor 32+19, 32+19, 0
+ xxlxor 32+20, 32+20, 0
+ xxlxor 32+21, 32+21, 0
+ xxlxor 32+22, 32+22, 0
+.endm
+
+################################################################################
+# Compute AES and ghash one block at a time.
+# r23: AES rounds
+# v30: current IV
+# vs0: roundkey 0
+#
+################################################################################
+.align 4
+aes_gcm_crypt_1x:
+.localentry aes_gcm_crypt_1x,0
+
+ cmpdi 5, 16
+ bge __More_1x
+ blr
+__More_1x:
+ li 10, 16
+ divdu 12, 5, 10
+
+ xxlxor 32+15, 32+30, 0
+
+ # Pre-load 8 AES rounds to scratch vectors.
+ lxv 32+16, 16(6) # round key 1
+ lxv 32+17, 32(6) # round key 2
+ lxv 32+18, 48(6) # round key 3
+ lxv 32+19, 64(6) # round key 4
+ lxv 32+20, 80(6) # round key 5
+ lxv 32+21, 96(6) # round key 6
+ lxv 32+28, 112(6) # round key 7
+ lxv 32+29, 128(6) # round key 8
+
+ lwz 23, 240(6) # n rounds
+ addi 22, 23, -9 # remaining AES rounds
+ cmpdi 12, 0
+ bgt __Loop_1x
+ blr
+
+__Loop_1x:
+ mtctr 22
+ addi 10, 6, 144
+ vcipher 15, 15, 16
+ vcipher 15, 15, 17
+ vcipher 15, 15, 18
+ vcipher 15, 15, 19
+ vcipher 15, 15, 20
+ vcipher 15, 15, 21
+ vcipher 15, 15, 28
+ vcipher 15, 15, 29
+
+__Loop_aes_1state:
+ lxv 32+1, 0(10)
+ vcipher 15, 15, 1
+ addi 10, 10, 16
+ bdnz __Loop_aes_1state
+ lxv 32+1, 0(10) # last round key
+ lxvb16x 11, 0, 14 # load input block
+ vcipherlast 15, 15, 1
- xxlor 32+29, 0, 0
- vxor 15, 30, 29
+ xxlxor 32+15, 32+15, 11
+ stxvb16x 32+15, 0, 9 # store output
+ addi 14, 14, 16
+ addi 9, 9, 16
- cmpdi 9, 10
- beq .Loop_aes_gcm_8x
+ cmpdi 24, 0 # decrypt?
+ bne __Encrypt_1x
+ xxlor 15+32, 11, 11
+__Encrypt_1x:
+ vxor 15, 15, 0
+ PPC_GHASH1x 0, 15
+ addi 5, 5, -16
+ addi 11, 11, 16
- lxv 11, 0xb0(6)
- lxv 12, 0xc0(6)
+ vadduwm 30, 30, 31 # IV + counter
+ xxlxor 32+15, 32+30, 0
+ addi 12, 12, -1
+ cmpdi 12, 0
+ bgt __Loop_1x
+
+ stxvb16x 32+0, 0, 8 # update Xi
+ blr
+.size aes_gcm_crypt_1x,.-aes_gcm_crypt_1x
+
+################################################################################
+# Process a normal partial block when we come here.
+# Compute partial mask, Load and store partial block to stack.
+# Compute AES state.
+# Compute ghash.
+#
+################################################################################
+.align 4
+__Process_partial:
+.localentry __Process_partial,0
+
+ # create partial mask
+ vspltisb 16, -1
+ li 12, 16
+ sub 12, 12, 5
+ sldi 12, 12, 3
+ mtvsrdd 32+17, 0, 12
+ vslo 16, 16, 17 # partial block mask
+
+ lxvb16x 11, 0, 14 # load partial block
+ xxland 11, 11, 32+16
+
+ # AES crypt partial
+ xxlxor 32+15, 32+30, 0
+ lwz 23, 240(6) # n rounds
+ addi 22, 23, -1 # loop - 1
+ mtctr 22
+ addi 10, 6, 16
+
+__Loop_aes_pstate:
+ lxv 32+1, 0(10)
+ vcipher 15, 15, 1
+ addi 10, 10, 16
+ bdnz __Loop_aes_pstate
+ lxv 32+1, 0(10) # last round key
+ vcipherlast 15, 15, 1
- cmpdi 9, 12
- beq .Loop_aes_gcm_8x
+ xxlxor 32+15, 32+15, 11
+ vand 15, 15, 16
+ # AES crypt output v15
+ # Write partial
+ li 10, 224
+ stxvb16x 15+32, 10, 1 # write v15 to stack
+ addi 10, 1, 223
+ addi 12, 9, -1
+ mtctr 5 # partial block len
+__Write_partial:
+ lbzu 22, 1(10)
+ stbu 22, 1(12)
+ bdnz __Write_partial
+
+ cmpdi 24, 0 # decrypt?
+ bne __Encrypt_partial
+ xxlor 32+15, 11, 11 # decrypt using the input block
+__Encrypt_partial:
+ vxor 15, 15, 0 # ^ previous hash
+ PPC_GHASH1x 0, 15
+ li 5, 0 # done last byte
+ stxvb16x 32+0, 0, 8 # Update Xi
+ blr
+.size __Process_partial,.-__Process_partial
+
+################################################################################
+# ppc_aes_gcm_encrypt (const void *inp, void *out, size_t len,
+# const char *rk, unsigned char iv[16], void *Xip);
+#
+# r3 - inp
+# r4 - out
+# r5 - len
+# r6 - AES round keys
+# r7 - iv
+# r8 - Xi, H Poly, hash keys
+#
+# rounds is at offset 240 in rk
+# Xi is at 0 in gcm_table (Xip).
+#
+################################################################################
+.global ppc_aes_gcm_encrypt
+.align 5
+ppc_aes_gcm_encrypt:
+.localentry ppc_aes_gcm_encrypt,0
- lxv 13, 0xd0(6)
- lxv 14, 0xe0(6)
- cmpdi 9, 14
- beq .Loop_aes_gcm_8x
+ SAVE_REGS
+ LOAD_HASH_TABLE
- b aes_gcm_out
+ # initialize ICB: GHASH( IV ), IV - r7
+ lxvb16x 30+32, 0, 7 # load IV - v30
-.align 5
-.Loop_aes_gcm_8x:
mr 14, 3
mr 9, 4
+ # counter 1
+ vxor 31, 31, 31
+ vspltisb 22, 1
+ vsldoi 31, 31, 22,1 # counter 1
+
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxv 10, 0(11) # vs10: vpermxor vector
+ li 11, 0
+ lxv 0, 0(6) # round key 0
+
+ #
+ # Process different blocks
+ #
+ cmpdi 5, 128
+ blt __Process_more_enc
+
+ # load 9 round keys
+ lxv 32+23, 16(6) # round key 1
+ lxv 32+24, 32(6) # round key 2
+ lxv 32+25, 48(6) # round key 3
+ lxv 32+26, 64(6) # round key 4
+ lxv 32+27, 80(6) # round key 5
+ lxv 32+28, 96(6) # round key 6
+ lxv 32+29, 112(6) # round key 7
+ lxv 32+1, 128(6) # round key 8
+
+ # load rounds - 10 (128), 12 (192), 14 (256)
+ lwz 23, 240(6) # n rounds
+
+__Process_encrypt:
+#
+# Process 8x AES/GCM blocks
+#
+__Process_8x_enc:
+ # 8x blocks
li 10, 128
- divdu 10, 5, 10
- cmpdi 10, 0
- beq .Loop_last_block
-
- .long 0x13DEF8C0
- vxor 16, 30, 29
- .long 0x13DEF8C0
- vxor 17, 30, 29
- .long 0x13DEF8C0
- vxor 18, 30, 29
- .long 0x13DEF8C0
- vxor 19, 30, 29
- .long 0x13DEF8C0
- vxor 20, 30, 29
- .long 0x13DEF8C0
- vxor 21, 30, 29
- .long 0x13DEF8C0
- vxor 22, 30, 29
-
- mtctr 10
+ divdu 12, 5, 10 # n 128 bytes-blocks
+
+ addi 12, 12, -1 # loop - 1
+
+ vmr 15, 30 # first state: IV
+ vadduwm 16, 15, 31 # state + counter
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
+
+ # vxor state, state, w # addroundkey
+ xxlxor 32+15, 32+15, 0 # IV + round key - add round key 0
+ xxlxor 32+16, 32+16, 0
+ xxlxor 32+17, 32+17, 0
+ xxlxor 32+18, 32+18, 0
+ xxlxor 32+19, 32+19, 0
+ xxlxor 32+20, 32+20, 0
+ xxlxor 32+21, 32+21, 0
+ xxlxor 32+22, 32+22, 0
li 15, 16
li 16, 32
@@ -535,523 +591,185 @@ _ppc_aes_gcm_encrypt:
li 20, 96
li 21, 112
- lwz 10, 240(6)
-
-.Loop_8x_block:
-
- lxvb16x 15, 0, 14
- lxvb16x 16, 15, 14
- lxvb16x 17, 16, 14
- lxvb16x 18, 17, 14
- lxvb16x 19, 18, 14
- lxvb16x 20, 19, 14
- lxvb16x 21, 20, 14
- lxvb16x 22, 21, 14
- addi 14, 14, 128
-
-.Loop_aes_middle8x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_next_ghash
-
-
- xxlor 24+32, 11, 11
-
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-
- .long 0x11EFC508
- .long 0x1210C508
- .long 0x1231C508
- .long 0x1252C508
- .long 0x1273C508
- .long 0x1294C508
- .long 0x12B5C508
- .long 0x12D6C508
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_next_ghash
-
-
- xxlor 24+32, 13, 13
-
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-
- .long 0x11EFC508
- .long 0x1210C508
- .long 0x1231C508
- .long 0x1252C508
- .long 0x1273C508
- .long 0x1294C508
- .long 0x12B5C508
- .long 0x12D6C508
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_next_ghash
- b aes_gcm_out
-
-Do_next_ghash:
-
-
-
- .long 0x11EFBD09
- .long 0x1210BD09
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9
- xxlxor 48, 48, 16
- stxvb16x 48, 15, 9
-
- .long 0x1231BD09
- .long 0x1252BD09
-
- xxlxor 49, 49, 17
- stxvb16x 49, 16, 9
- xxlxor 50, 50, 18
- stxvb16x 50, 17, 9
-
- .long 0x1273BD09
- .long 0x1294BD09
-
- xxlxor 51, 51, 19
- stxvb16x 51, 18, 9
- xxlxor 52, 52, 20
- stxvb16x 52, 19, 9
-
- .long 0x12B5BD09
- .long 0x12D6BD09
-
- xxlxor 53, 53, 21
- stxvb16x 53, 20, 9
- xxlxor 54, 54, 22
- stxvb16x 54, 21, 9
-
- addi 9, 9, 128
-
-
- ppc_aes_gcm_ghash2_4x
-
- xxlor 27+32, 0, 0
- .long 0x13DEF8C0
- vor 29,30,30
- vxor 15, 30, 27
- .long 0x13DEF8C0
- vxor 16, 30, 27
- .long 0x13DEF8C0
- vxor 17, 30, 27
- .long 0x13DEF8C0
- vxor 18, 30, 27
- .long 0x13DEF8C0
- vxor 19, 30, 27
- .long 0x13DEF8C0
- vxor 20, 30, 27
- .long 0x13DEF8C0
- vxor 21, 30, 27
- .long 0x13DEF8C0
- vxor 22, 30, 27
-
- addi 12, 12, -128
- addi 11, 11, 128
-
- bdnz .Loop_8x_block
-
- vor 30,29,29
-
-.Loop_last_block:
- cmpdi 12, 0
- beq aes_gcm_out
-
-
- li 10, 16
- divdu 10, 12, 10
-
- mtctr 10
-
- lwz 10, 240(6)
-
- cmpdi 12, 16
- blt Final_block
-
-.macro .Loop_aes_middle_1x
- xxlor 19+32, 1, 1
- xxlor 20+32, 2, 2
- xxlor 21+32, 3, 3
- xxlor 22+32, 4, 4
-
- .long 0x11EF9D08
- .long 0x11EFA508
- .long 0x11EFAD08
- .long 0x11EFB508
-
- xxlor 19+32, 5, 5
- xxlor 20+32, 6, 6
- xxlor 21+32, 7, 7
- xxlor 22+32, 8, 8
-
- .long 0x11EF9D08
- .long 0x11EFA508
- .long 0x11EFAD08
- .long 0x11EFB508
-
- xxlor 19+32, 9, 9
- .long 0x11EF9D08
-.endm
-
-Next_rem_block:
- lxvb16x 15, 0, 14
-
-.Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
+ #
+ # Pre-compute first 8 AES state and leave 1/3/5 more rounds
+ # for the loop.
+ #
+ addi 22, 23, -9 # process 8 keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
- cmpdi 10, 10
- beq Do_next_1x
+ LOOP_8AES_STATE # process 8 AES keys
+__PreLoop_aes_state:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x 1
+ addi 10, 10, 16
+ bdnz __PreLoop_aes_state
+ lxv 32+1, 0(10) # last round key (v1)
- xxlor 24+32, 11, 11
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_next_1x
-
-
- xxlor 24+32, 13, 13
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_next_1x
-
-Do_next_1x:
- .long 0x11EFBD09
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9
- addi 14, 14, 16
- addi 9, 9, 16
+ cmpdi 12, 0 # Only one loop (8 block)
+ beq __Finish_ghash
- vor 28,15,15
- ppc_update_hash_1x
+#
+# Loop 8x blocks and compute ghash
+#
+__Loop_8x_block_enc:
+ PROCESS_8X_AES_STATES
- addi 12, 12, -16
- addi 11, 11, 16
- xxlor 19+32, 0, 0
- .long 0x13DEF8C0
- vxor 15, 30, 19
+ # Compute ghash here
+ vxor 15, 15, 0
+ PPC_GFMUL128_8x
+
+ COMPUTE_STATES
+
+ addi 5, 5, -128
+ addi 11, 11, 128
+
+ lxv 32+23, 16(6) # round key 1
+ lxv 32+24, 32(6) # round key 2
+ lxv 32+25, 48(6) # round key 3
+ lxv 32+26, 64(6) # round key 4
+ lxv 32+27, 80(6) # round key 5
+ lxv 32+28, 96(6) # round key 6
+ lxv 32+29, 112(6) # round key 7
+ lxv 32+1, 128(6) # round key 8
+
+ # Compute first 8 AES state and leave 1/3/5 more rounds
+ # for the loop.
+ LOOP_8AES_STATE # process 8 AES keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
+
+__LastLoop_aes_state:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x 1
+ addi 10, 10, 16
+ bdnz __LastLoop_aes_state
- bdnz Next_rem_block
+ lxv 32+1, 0(10) # last round key (v1)
+ addi 12, 12, -1
cmpdi 12, 0
- beq aes_gcm_out
-
-Final_block:
-.Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_final_1x
-
-
- xxlor 24+32, 11, 11
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_final_1x
+ bne __Loop_8x_block_enc
+ #
+ # Remaining blocks
+ #
+__Finish_ghash:
+ PROCESS_8X_AES_STATES
- xxlor 24+32, 13, 13
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_final_1x
-
-Do_final_1x:
- .long 0x11EFBD09
-
- lxvb16x 15, 0, 14
- xxlxor 47, 47, 15
-
-
- li 15, 16
- sub 15, 15, 12
+ # Compute ghash here
+ vxor 15, 15, 0
+ PPC_GFMUL128_8x
- vspltisb 16,-1
- vspltisb 17,0
- li 10, 192
- stvx 16, 10, 1
- addi 10, 10, 16
- stvx 17, 10, 1
+ # Update IV and Xi
+ xxlor 30+32, 9, 9 # last ctr
+ vadduwm 30, 30, 31 # increase ctr
+ stxvb16x 32+0, 0, 8 # update Xi
- addi 10, 1, 192
- lxvb16x 16, 15, 10
- xxland 47, 47, 16
+ addi 5, 5, -128
+ addi 11, 11, 128
- vor 28,15,15
- ppc_update_hash_1x
+ #
+ # Done 8x blocks
+ #
+ cmpdi 5, 0
+ beq aes_gcm_out
- bl Write_partial_block
+__Process_more_enc:
+ li 24, 1 # encrypt
+ bl aes_gcm_crypt_1x
+ cmpdi 5, 0
+ beq aes_gcm_out
+ bl __Process_partial
b aes_gcm_out
+.size ppc_aes_gcm_encrypt,.-ppc_aes_gcm_encrypt
-
-
-
-
-
-Write_partial_block:
- li 10, 192
- stxvb16x 15+32, 10, 1
-
-
- addi 10, 9, -1
- addi 16, 1, 191
-
- mtctr 12
- li 15, 0
-
-Write_last_byte:
- lbzu 14, 1(16)
- stbu 14, 1(10)
- bdnz Write_last_byte
- blr
-
-aes_gcm_out:
-
- stxvb16x 32, 0, 8
- add 3, 11, 12
-
- li 9, 256
- lvx 20, 9, 1
- addi 9, 9, 16
- lvx 21, 9, 1
- addi 9, 9, 16
- lvx 22, 9, 1
- addi 9, 9, 16
- lvx 23, 9, 1
- addi 9, 9, 16
- lvx 24, 9, 1
- addi 9, 9, 16
- lvx 25, 9, 1
- addi 9, 9, 16
- lvx 26, 9, 1
- addi 9, 9, 16
- lvx 27, 9, 1
- addi 9, 9, 16
- lvx 28, 9, 1
- addi 9, 9, 16
- lvx 29, 9, 1
- addi 9, 9, 16
- lvx 30, 9, 1
- addi 9, 9, 16
- lvx 31, 9, 1
-
- ld 0, 528(1)
- ld 14,112(1)
- ld 15,120(1)
- ld 16,128(1)
- ld 17,136(1)
- ld 18,144(1)
- ld 19,152(1)
- ld 20,160(1)
- ld 21,168(1)
-
- mtlr 0
- addi 1, 1, 512
- blr
-
-
-
-
-.global ppc_aes_gcm_decrypt
-.align 5
+################################################################################
+# ppc_aes_gcm_decrypt (const void *inp, void *out, size_t len,
+# const char *rk, unsigned char iv[16], void *Xip);
+# 8x Decrypt
+#
+################################################################################
+.global ppc_aes_gcm_decrypt
+.align 5
ppc_aes_gcm_decrypt:
-_ppc_aes_gcm_decrypt:
-
- stdu 1,-512(1)
- mflr 0
-
- std 14,112(1)
- std 15,120(1)
- std 16,128(1)
- std 17,136(1)
- std 18,144(1)
- std 19,152(1)
- std 20,160(1)
- std 21,168(1)
- li 9, 256
- stvx 20, 9, 1
- addi 9, 9, 16
- stvx 21, 9, 1
- addi 9, 9, 16
- stvx 22, 9, 1
- addi 9, 9, 16
- stvx 23, 9, 1
- addi 9, 9, 16
- stvx 24, 9, 1
- addi 9, 9, 16
- stvx 25, 9, 1
- addi 9, 9, 16
- stvx 26, 9, 1
- addi 9, 9, 16
- stvx 27, 9, 1
- addi 9, 9, 16
- stvx 28, 9, 1
- addi 9, 9, 16
- stvx 29, 9, 1
- addi 9, 9, 16
- stvx 30, 9, 1
- addi 9, 9, 16
- stvx 31, 9, 1
- std 0, 528(1)
-
-
- lxvb16x 32, 0, 8
-
-
- li 10, 32
- lxvd2x 2+32, 10, 8
- li 10, 48
- lxvd2x 3+32, 10, 8
- li 10, 64
- lxvd2x 4+32, 10, 8
- li 10, 80
- lxvd2x 5+32, 10, 8
-
- li 10, 96
- lxvd2x 6+32, 10, 8
- li 10, 112
- lxvd2x 7+32, 10, 8
- li 10, 128
- lxvd2x 8+32, 10, 8
-
- li 10, 144
- lxvd2x 9+32, 10, 8
- li 10, 160
- lxvd2x 10+32, 10, 8
- li 10, 176
- lxvd2x 11+32, 10, 8
-
- li 10, 192
- lxvd2x 12+32, 10, 8
- li 10, 208
- lxvd2x 13+32, 10, 8
- li 10, 224
- lxvd2x 14+32, 10, 8
-
-
- lxvb16x 30+32, 0, 7
-
- mr 12, 5
- li 11, 0
-
+.localentry ppc_aes_gcm_decrypt, 0
- vxor 31, 31, 31
- vspltisb 22,1
- vsldoi 31, 31, 22,1
-
-
- lxv 0, 0(6)
- lxv 1, 0x10(6)
- lxv 2, 0x20(6)
- lxv 3, 0x30(6)
- lxv 4, 0x40(6)
- lxv 5, 0x50(6)
- lxv 6, 0x60(6)
- lxv 7, 0x70(6)
- lxv 8, 0x80(6)
- lxv 9, 0x90(6)
- lxv 10, 0xa0(6)
-
-
- lwz 9,240(6)
-
-
-
- xxlor 32+29, 0, 0
- vxor 15, 30, 29
-
- cmpdi 9, 10
- beq .Loop_aes_gcm_8x_dec
-
-
- lxv 11, 0xb0(6)
- lxv 12, 0xc0(6)
-
- cmpdi 9, 12
- beq .Loop_aes_gcm_8x_dec
+ SAVE_REGS
+ LOAD_HASH_TABLE
+ # initialize ICB: GHASH( IV ), IV - r7
+ lxvb16x 30+32, 0, 7 # load IV - v30
- lxv 13, 0xd0(6)
- lxv 14, 0xe0(6)
- cmpdi 9, 14
- beq .Loop_aes_gcm_8x_dec
-
- b aes_gcm_out
-
-.align 5
-.Loop_aes_gcm_8x_dec:
mr 14, 3
mr 9, 4
+ # counter 1
+ vxor 31, 31, 31
+ vspltisb 22, 1
+ vsldoi 31, 31, 22,1 # counter 1
+
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxv 10, 0(11) # vs10: vpermxor vector
+ li 11, 0
+ lxv 0, 0(6) # round key 0
+
+ #
+ # Process different blocks
+ #
+ cmpdi 5, 128
+ blt __Process_more_dec
+
+ # load 9 round keys
+ lxv 32+23, 16(6) # round key 1
+ lxv 32+24, 32(6) # round key 2
+ lxv 32+25, 48(6) # round key 3
+ lxv 32+26, 64(6) # round key 4
+ lxv 32+27, 80(6) # round key 5
+ lxv 32+28, 96(6) # round key 6
+ lxv 32+29, 112(6) # round key 7
+ lxv 32+1, 128(6) # round key 8
+
+ # load rounds - 10 (128), 12 (192), 14 (256)
+ lwz 23, 240(6) # n rounds
+
+__Process_decrypt:
+#
+# Process 8x AES/GCM blocks
+#
+__Process_8x_dec:
+ # 8x blocks
li 10, 128
- divdu 10, 5, 10
- cmpdi 10, 0
- beq .Loop_last_block_dec
-
- .long 0x13DEF8C0
- vxor 16, 30, 29
- .long 0x13DEF8C0
- vxor 17, 30, 29
- .long 0x13DEF8C0
- vxor 18, 30, 29
- .long 0x13DEF8C0
- vxor 19, 30, 29
- .long 0x13DEF8C0
- vxor 20, 30, 29
- .long 0x13DEF8C0
- vxor 21, 30, 29
- .long 0x13DEF8C0
- vxor 22, 30, 29
-
- mtctr 10
+ divdu 12, 5, 10 # n 128 bytes-blocks
+
+ addi 12, 12, -1 # loop - 1
+
+ vmr 15, 30 # first state: IV
+ vadduwm 16, 15, 31 # state + counter
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
+
+ # vxor state, state, w # addroundkey
+ xxlxor 32+15, 32+15, 0 # IV + round key - add round key 0
+ xxlxor 32+16, 32+16, 0
+ xxlxor 32+17, 32+17, 0
+ xxlxor 32+18, 32+18, 0
+ xxlxor 32+19, 32+19, 0
+ xxlxor 32+20, 32+20, 0
+ xxlxor 32+21, 32+21, 0
+ xxlxor 32+22, 32+22, 0
li 15, 16
li 16, 32
@@ -1061,279 +779,215 @@ _ppc_aes_gcm_decrypt:
li 20, 96
li 21, 112
- lwz 10, 240(6)
+ #
+ # Pre-compute first 8 AES state and leave 1/3/5 more rounds
+ # for the loop.
+ #
+ addi 22, 23, -9 # process 8 keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
-.Loop_8x_block_dec:
+ LOOP_8AES_STATE # process 8 AES keys
- lxvb16x 15, 0, 14
- lxvb16x 16, 15, 14
- lxvb16x 17, 16, 14
- lxvb16x 18, 17, 14
- lxvb16x 19, 18, 14
- lxvb16x 20, 19, 14
- lxvb16x 21, 20, 14
- lxvb16x 22, 21, 14
+__PreLoop_aes_state_dec:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x 1
+ addi 10, 10, 16
+ bdnz __PreLoop_aes_state_dec
+ lxv 32+1, 0(10) # last round key (v1)
+
+ cmpdi 12, 0 # Only one loop (8 block)
+ beq __Finish_ghash_dec
+
+#
+# Loop 8x blocks and compute ghash
+#
+__Loop_8x_block_dec:
+ vcipherlast 15, 15, 1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
+
+ lxvb16x 32+23, 0, 14 # load block
+ lxvb16x 32+24, 15, 14 # load block
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block
addi 14, 14, 128
-.Loop_aes_middle8x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_last_aes_dec
-
-
- xxlor 24+32, 11, 11
-
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-
- .long 0x11EFC508
- .long 0x1210C508
- .long 0x1231C508
- .long 0x1252C508
- .long 0x1273C508
- .long 0x1294C508
- .long 0x12B5C508
- .long 0x12D6C508
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_last_aes_dec
-
-
- xxlor 24+32, 13, 13
-
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-
- .long 0x11EFC508
- .long 0x1210C508
- .long 0x1231C508
- .long 0x1252C508
- .long 0x1273C508
- .long 0x1294C508
- .long 0x12B5C508
- .long 0x12D6C508
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_last_aes_dec
- b aes_gcm_out
-
-Do_last_aes_dec:
-
-
-
- .long 0x11EFBD09
- .long 0x1210BD09
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9
- xxlxor 48, 48, 16
- stxvb16x 48, 15, 9
-
- .long 0x1231BD09
- .long 0x1252BD09
-
- xxlxor 49, 49, 17
- stxvb16x 49, 16, 9
- xxlxor 50, 50, 18
- stxvb16x 50, 17, 9
-
- .long 0x1273BD09
- .long 0x1294BD09
-
- xxlxor 51, 51, 19
- stxvb16x 51, 18, 9
- xxlxor 52, 52, 20
- stxvb16x 52, 19, 9
-
- .long 0x12B5BD09
- .long 0x12D6BD09
-
- xxlxor 53, 53, 21
- stxvb16x 53, 20, 9
- xxlxor 54, 54, 22
- stxvb16x 54, 21, 9
+ vxor 15, 15, 23
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
+
+ stxvb16x 47, 0, 9 # store output
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
addi 9, 9, 128
- xxlor 15+32, 15, 15
- xxlor 16+32, 16, 16
- xxlor 17+32, 17, 17
- xxlor 18+32, 18, 18
- xxlor 19+32, 19, 19
- xxlor 20+32, 20, 20
- xxlor 21+32, 21, 21
- xxlor 22+32, 22, 22
-
-
- ppc_aes_gcm_ghash2_4x
-
- xxlor 27+32, 0, 0
- .long 0x13DEF8C0
- vor 29,30,30
- vxor 15, 30, 27
- .long 0x13DEF8C0
- vxor 16, 30, 27
- .long 0x13DEF8C0
- vxor 17, 30, 27
- .long 0x13DEF8C0
- vxor 18, 30, 27
- .long 0x13DEF8C0
- vxor 19, 30, 27
- .long 0x13DEF8C0
- vxor 20, 30, 27
- .long 0x13DEF8C0
- vxor 21, 30, 27
- .long 0x13DEF8C0
- vxor 22, 30, 27
- addi 12, 12, -128
- addi 11, 11, 128
-
- bdnz .Loop_8x_block_dec
-
- vor 30,29,29
-
-.Loop_last_block_dec:
- cmpdi 12, 0
- beq aes_gcm_out
-
-
- li 10, 16
- divdu 10, 12, 10
-
- mtctr 10
-
- lwz 10,240(6)
-
- cmpdi 12, 16
- blt Final_block_dec
-
-Next_rem_block_dec:
- lxvb16x 15, 0, 14
-
-.Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_next_1x_dec
-
-
- xxlor 24+32, 11, 11
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_next_1x_dec
-
-
- xxlor 24+32, 13, 13
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_next_1x_dec
-
-Do_next_1x_dec:
- .long 0x11EFBD09
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9
- addi 14, 14, 16
- addi 9, 9, 16
-
- xxlor 28+32, 15, 15
- ppc_update_hash_1x
+ vmr 15, 23
+ vmr 16, 24
+ vmr 17, 25
+ vmr 18, 26
+ vmr 19, 27
+ vmr 20, 28
+ vmr 21, 29
+ vmr 22, 30
- addi 12, 12, -16
- addi 11, 11, 16
- xxlor 19+32, 0, 0
- .long 0x13DEF8C0
- vxor 15, 30, 19
+ # ghash here
+ vxor 15, 15, 0
+ PPC_GFMUL128_8x
+
+ xxlor 32+15, 9, 9 # last state
+ vadduwm 15, 15, 31 # state + counter
+ vadduwm 16, 15, 31
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
+
+ xxlor 32+27, 0, 0 # restore roundkey 0
+ vxor 15, 15, 27 # IV + round key - add round key 0
+ vxor 16, 16, 27
+ vxor 17, 17, 27
+ vxor 18, 18, 27
+ vxor 19, 19, 27
+ vxor 20, 20, 27
+ vxor 21, 21, 27
+ vxor 22, 22, 27
- bdnz Next_rem_block_dec
+ addi 5, 5, -128
+ addi 11, 11, 128
+
+ lxv 32+23, 16(6) # round key 1
+ lxv 32+24, 32(6) # round key 2
+ lxv 32+25, 48(6) # round key 3
+ lxv 32+26, 64(6) # round key 4
+ lxv 32+27, 80(6) # round key 5
+ lxv 32+28, 96(6) # round key 6
+ lxv 32+29, 112(6) # round key 7
+ lxv 32+1, 128(6) # round key 8
+
+ LOOP_8AES_STATE # process 8 AES keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
+__LastLoop_aes_state_dec:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x 1
+ addi 10, 10, 16
+ bdnz __LastLoop_aes_state_dec
+ lxv 32+1, 0(10) # last round key (v1)
+ addi 12, 12, -1
cmpdi 12, 0
- beq aes_gcm_out
-
-Final_block_dec:
-.Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_final_1x_dec
-
-
- xxlor 24+32, 11, 11
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_final_1x_dec
-
+ bne __Loop_8x_block_dec
+
+__Finish_ghash_dec:
+ vcipherlast 15, 15, 1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
+
+ lxvb16x 32+23, 0, 14 # load block
+ lxvb16x 32+24, 15, 14 # load block
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block
+ addi 14, 14, 128
- xxlor 24+32, 13, 13
+ vxor 15, 15, 23
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
+
+ stxvb16x 47, 0, 9 # store output
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
+ addi 9, 9, 128
- .long 0x11EFBD08
- .long 0x11EFC508
+ vxor 15, 23, 0
+ vmr 16, 24
+ vmr 17, 25
+ vmr 18, 26
+ vmr 19, 27
+ vmr 20, 28
+ vmr 21, 29
+ vmr 22, 30
- xxlor 23+32, 14, 14
+ #vxor 15, 15, 0
+ PPC_GFMUL128_8x
- cmpdi 10, 14
- beq Do_final_1x_dec
+ xxlor 30+32, 9, 9 # last ctr
+ vadduwm 30, 30, 31 # increase ctr
+ stxvb16x 32+0, 0, 8 # update Xi
-Do_final_1x_dec:
- .long 0x11EFBD09
+ addi 5, 5, -128
+ addi 11, 11, 128
- lxvb16x 15, 0, 14
- xxlxor 47, 47, 15
+ #
+ # Done 8x blocks
+ #
+ cmpdi 5, 0
+ beq aes_gcm_out
- li 15, 16
- sub 15, 15, 12
+__Process_more_dec:
+ li 24, 0 # decrypt
+ bl aes_gcm_crypt_1x
+ cmpdi 5, 0
+ beq aes_gcm_out
- vspltisb 16,-1
- vspltisb 17,0
- li 10, 192
- stvx 16, 10, 1
- addi 10, 10, 16
- stvx 17, 10, 1
-
- addi 10, 1, 192
- lxvb16x 16, 15, 10
- xxland 47, 47, 16
+ bl __Process_partial
+ b aes_gcm_out
+.size ppc_aes_gcm_decrypt,.-ppc_aes_gcm_decrypt
- xxlor 28+32, 15, 15
- ppc_update_hash_1x
+aes_gcm_out:
+.localentry aes_gcm_out,0
+ mr 3, 11 # return count
- bl Write_partial_block
+ RESTORE_REGS
+ blr
+.size aes_gcm_out,.-aes_gcm_out
- b aes_gcm_out
+.rodata
+.align 4
+# for vector permute and xor
+permx:
+.long 0x4c5d6e7f, 0x08192a3b, 0xc4d5e6f7, 0x8091a2b3
diff --git a/sys/crypto/openssl/powerpc64/aes-gcm-ppc.S b/sys/crypto/openssl/powerpc64/aes-gcm-ppc.S
index 2ff143c42ab7..51cfac7e45fc 100644
--- a/sys/crypto/openssl/powerpc64/aes-gcm-ppc.S
+++ b/sys/crypto/openssl/powerpc64/aes-gcm-ppc.S
@@ -1,532 +1,587 @@
/* Do not modify. This file is auto-generated from aes-gcm-ppc.pl. */
-.machine "any"
-.abiversion 2
+.machine "any"
.text
-
-
-
-
-.macro .Loop_aes_middle4x
- xxlor 19+32, 1, 1
- xxlor 20+32, 2, 2
- xxlor 21+32, 3, 3
- xxlor 22+32, 4, 4
-
- .long 0x11EF9D08
- .long 0x12109D08
- .long 0x12319D08
- .long 0x12529D08
-
- .long 0x11EFA508
- .long 0x1210A508
- .long 0x1231A508
- .long 0x1252A508
-
- .long 0x11EFAD08
- .long 0x1210AD08
- .long 0x1231AD08
- .long 0x1252AD08
-
- .long 0x11EFB508
- .long 0x1210B508
- .long 0x1231B508
- .long 0x1252B508
-
- xxlor 19+32, 5, 5
- xxlor 20+32, 6, 6
- xxlor 21+32, 7, 7
- xxlor 22+32, 8, 8
-
- .long 0x11EF9D08
- .long 0x12109D08
- .long 0x12319D08
- .long 0x12529D08
-
- .long 0x11EFA508
- .long 0x1210A508
- .long 0x1231A508
- .long 0x1252A508
-
- .long 0x11EFAD08
- .long 0x1210AD08
- .long 0x1231AD08
- .long 0x1252AD08
-
- .long 0x11EFB508
- .long 0x1210B508
- .long 0x1231B508
- .long 0x1252B508
-
- xxlor 23+32, 9, 9
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
-.endm
-
-
-
-
-
-.macro .Loop_aes_middle8x
- xxlor 23+32, 1, 1
- xxlor 24+32, 2, 2
- xxlor 25+32, 3, 3
- xxlor 26+32, 4, 4
-
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-
- .long 0x11EFC508
- .long 0x1210C508
- .long 0x1231C508
- .long 0x1252C508
- .long 0x1273C508
- .long 0x1294C508
- .long 0x12B5C508
- .long 0x12D6C508
-
- .long 0x11EFCD08
- .long 0x1210CD08
- .long 0x1231CD08
- .long 0x1252CD08
- .long 0x1273CD08
- .long 0x1294CD08
- .long 0x12B5CD08
- .long 0x12D6CD08
-
- .long 0x11EFD508
- .long 0x1210D508
- .long 0x1231D508
- .long 0x1252D508
- .long 0x1273D508
- .long 0x1294D508
- .long 0x12B5D508
- .long 0x12D6D508
-
- xxlor 23+32, 5, 5
- xxlor 24+32, 6, 6
- xxlor 25+32, 7, 7
- xxlor 26+32, 8, 8
-
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-
- .long 0x11EFC508
- .long 0x1210C508
- .long 0x1231C508
- .long 0x1252C508
- .long 0x1273C508
- .long 0x1294C508
- .long 0x12B5C508
- .long 0x12D6C508
-
- .long 0x11EFCD08
- .long 0x1210CD08
- .long 0x1231CD08
- .long 0x1252CD08
- .long 0x1273CD08
- .long 0x1294CD08
- .long 0x12B5CD08
- .long 0x12D6CD08
-
- .long 0x11EFD508
- .long 0x1210D508
- .long 0x1231D508
- .long 0x1252D508
- .long 0x1273D508
- .long 0x1294D508
- .long 0x12B5D508
- .long 0x12D6D508
-
- xxlor 23+32, 9, 9
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-.endm
-
-
-
-
-ppc_aes_gcm_ghash:
- vxor 15, 15, 0
-
- xxlxor 29, 29, 29
-
- .long 0x12EC7CC8
- .long 0x130984C8
- .long 0x13268CC8
- .long 0x134394C8
-
- vxor 23, 23, 24
- vxor 23, 23, 25
- vxor 23, 23, 26
-
- .long 0x130D7CC8
- .long 0x132A84C8
- .long 0x13478CC8
- .long 0x136494C8
-
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27
-
-
- .long 0x139714C8
-
- xxlor 29+32, 29, 29
- vsldoi 26, 24, 29, 8
- vsldoi 29, 29, 24, 8
- vxor 23, 23, 26
-
- vsldoi 23, 23, 23, 8
- vxor 23, 23, 28
-
- .long 0x130E7CC8
- .long 0x132B84C8
- .long 0x13488CC8
- .long 0x136594C8
-
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27
-
- vxor 24, 24, 29
-
-
- vsldoi 27, 23, 23, 8
- .long 0x12F714C8
- vxor 27, 27, 24
- vxor 23, 23, 27
-
- xxlor 32, 23+32, 23+32
-
- blr
-
-
-
-
-
-.macro ppc_aes_gcm_ghash2_4x
-
- vxor 15, 15, 0
-
- xxlxor 29, 29, 29
-
- .long 0x12EC7CC8
- .long 0x130984C8
- .long 0x13268CC8
- .long 0x134394C8
+.macro SAVE_REGS
+ mflr 0
+ std 0, 16(1)
+ stdu 1,-512(1)
+
+ std 14, 112(1)
+ std 15, 120(1)
+ std 16, 128(1)
+ std 17, 136(1)
+ std 18, 144(1)
+ std 19, 152(1)
+ std 20, 160(1)
+ std 21, 168(1)
+ std 22, 176(1)
+ std 23, 184(1)
+ std 24, 192(1)
+
+ stxv 32+20, 256(1)
+ stxv 32+21, 256+16(1)
+ stxv 32+22, 256+32(1)
+ stxv 32+23, 256+48(1)
+ stxv 32+24, 256+64(1)
+ stxv 32+25, 256+80(1)
+ stxv 32+26, 256+96(1)
+ stxv 32+27, 256+112(1)
+ stxv 32+28, 256+128(1)
+ stxv 32+29, 256+144(1)
+ stxv 32+30, 256+160(1)
+ stxv 32+31, 256+176(1)
+.endm # SAVE_REGS
+
+.macro RESTORE_REGS
+ lxv 32+20, 256(1)
+ lxv 32+21, 256+16(1)
+ lxv 32+22, 256+32(1)
+ lxv 32+23, 256+48(1)
+ lxv 32+24, 256+64(1)
+ lxv 32+25, 256+80(1)
+ lxv 32+26, 256+96(1)
+ lxv 32+27, 256+112(1)
+ lxv 32+28, 256+128(1)
+ lxv 32+29, 256+144(1)
+ lxv 32+30, 256+160(1)
+ lxv 32+31, 256+176(1)
+
+ ld 14, 112(1)
+ ld 15, 120(1)
+ ld 16, 128(1)
+ ld 17, 136(1)
+ ld 18, 144(1)
+ ld 19, 152(1)
+ ld 20, 160(1)
+ ld 21, 168(1)
+ ld 22, 176(1)
+ ld 23, 184(1)
+ ld 24, 192(1)
+
+ addi 1, 1, 512
+ ld 0, 16(1)
+ mtlr 0
+.endm # RESTORE_REGS
+
+# 4x loops
+.macro AES_CIPHER_4x r
+ vcipher 15, 15, \r
+ vcipher 16, 16, \r
+ vcipher 17, 17, \r
+ vcipher 18, 18, \r
+.endm
+
+# 8x loops
+.macro AES_CIPHER_8x r
+ vcipher 15, 15, \r
+ vcipher 16, 16, \r
+ vcipher 17, 17, \r
+ vcipher 18, 18, \r
+ vcipher 19, 19, \r
+ vcipher 20, 20, \r
+ vcipher 21, 21, \r
+ vcipher 22, 22, \r
+.endm
+
+.macro LOOP_8AES_STATE
+ AES_CIPHER_8x 23
+ AES_CIPHER_8x 24
+ AES_CIPHER_8x 25
+ AES_CIPHER_8x 26
+ AES_CIPHER_8x 27
+ AES_CIPHER_8x 28
+ AES_CIPHER_8x 29
+ AES_CIPHER_8x 1
+.endm
+
+#
+# PPC_GFMUL128_8x: Compute hash values of 8 blocks based on Karatsuba method.
+#
+# S1 should be XORed with the previous digest
+#
+# Xi = v0
+# H Poly = v2
+# Hash keys = v3 - v14
+# vs10: vpermxor vector
+# Scratch: v1, v23 - v29
+#
+.macro PPC_GFMUL128_8x
+
+ vpmsumd 23, 12, 15 # H4.L * X.L
+ vpmsumd 24, 9, 16
+ vpmsumd 25, 6, 17
+ vpmsumd 26, 3, 18
vxor 23, 23, 24
vxor 23, 23, 25
- vxor 23, 23, 26
-
- .long 0x130D7CC8
- .long 0x132A84C8
- .long 0x13478CC8
- .long 0x136494C8
-
- vxor 24, 24, 25
- vxor 24, 24, 26
-
-
- .long 0x139714C8
-
- xxlor 29+32, 29, 29
-
- vxor 24, 24, 27
- vsldoi 26, 24, 29, 8
- vsldoi 29, 29, 24, 8
- vxor 23, 23, 26
-
- vsldoi 23, 23, 23, 8
- vxor 23, 23, 28
+ vxor 23, 23, 26 # L
- .long 0x130E7CC8
- .long 0x132B84C8
- .long 0x13488CC8
- .long 0x136594C8
+ vpmsumd 27, 13, 15 # H4.L * X.H + H4.H * X.L
+ vpmsumd 28, 10, 16 # H3.L * X1.H + H3.H * X1.L
+ vpmsumd 25, 7, 17
+ vpmsumd 26, 4, 18
+ vxor 24, 27, 28
vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27
-
- vxor 24, 24, 29
-
-
- vsldoi 27, 23, 23, 8
- .long 0x12F714C8
- vxor 27, 27, 24
- vxor 27, 23, 27
-
-
- .long 0x1309A4C8
- .long 0x1326ACC8
- .long 0x1343B4C8
- vxor 19, 19, 27
- .long 0x12EC9CC8
+ vxor 24, 24, 26 # M
+
+ vpmsumd 26, 14, 15 # H4.H * X.H
+ vpmsumd 27, 11, 16
+ vpmsumd 28, 8, 17
+ vpmsumd 29, 5, 18
+
+ vxor 26, 26, 27
+ vxor 26, 26, 28
+ vxor 26, 26, 29
+
+ # sum hash and reduction with H Poly
+ vpmsumd 28, 23, 2 # reduction
+
+ vxor 1, 1, 1
+ vsldoi 25, 24, 1, 8 # mL
+ vsldoi 1, 1, 24, 8 # mH
+ vxor 23, 23, 25 # mL + L
+
+ # This performs swap and xor like,
+ # vsldoi 23, 23, 23, 8 # swap
+ # vxor 23, 23, 28
+ xxlor 32+29, 10, 10
+ vpermxor 23, 23, 28, 29
+
+ vxor 24, 26, 1 # H
+
+ # sum hash and reduction with H Poly
+ #
+ # vsldoi 25, 23, 23, 8 # swap
+ # vpmsumd 23, 23, 2
+ # vxor 27, 25, 24
+ #
+ vpermxor 27, 23, 24, 29
+ vpmsumd 23, 23, 2
+ vxor 0, 23, 27 # Digest of 4 blocks
+
+ vxor 19, 19, 0
+
+ # Compute digest for the next 4 blocks
+ vpmsumd 24, 9, 20
+ vpmsumd 25, 6, 21
+ vpmsumd 26, 3, 22
+ vpmsumd 23, 12, 19 # H4.L * X.L
vxor 23, 23, 24
vxor 23, 23, 25
- vxor 23, 23, 26
+ vxor 23, 23, 26 # L
- .long 0x130D9CC8
- .long 0x132AA4C8
- .long 0x1347ACC8
- .long 0x1364B4C8
+ vpmsumd 27, 13, 19 # H4.L * X.H + H4.H * X.L
+ vpmsumd 28, 10, 20 # H3.L * X1.H + H3.H * X1.L
+ vpmsumd 25, 7, 21
+ vpmsumd 26, 4, 22
+ vxor 24, 27, 28
vxor 24, 24, 25
- vxor 24, 24, 26
-
-
- .long 0x139714C8
-
- xxlor 29+32, 29, 29
-
- vxor 24, 24, 27
- vsldoi 26, 24, 29, 8
- vsldoi 29, 29, 24, 8
- vxor 23, 23, 26
-
- vsldoi 23, 23, 23, 8
- vxor 23, 23, 28
-
- .long 0x130E9CC8
- .long 0x132BA4C8
- .long 0x1348ACC8
- .long 0x1365B4C8
-
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27
-
- vxor 24, 24, 29
-
-
- vsldoi 27, 23, 23, 8
- .long 0x12F714C8
- vxor 27, 27, 24
- vxor 23, 23, 27
-
- xxlor 32, 23+32, 23+32
-
-.endm
-
-
-
-
-.macro ppc_update_hash_1x
- vxor 28, 28, 0
-
- vxor 19, 19, 19
-
- .long 0x12C3E4C8
- .long 0x12E4E4C8
- .long 0x1305E4C8
-
- .long 0x137614C8
-
- vsldoi 25, 23, 19, 8
- vsldoi 26, 19, 23, 8
- vxor 22, 22, 25
- vxor 24, 24, 26
-
- vsldoi 22, 22, 22, 8
- vxor 22, 22, 27
-
- vsldoi 20, 22, 22, 8
- .long 0x12D614C8
- vxor 20, 20, 24
- vxor 22, 22, 20
-
- vor 0,22,22
-
-.endm
-
-
-
-
-
-
-
-
-
-
-
-
-
-.global ppc_aes_gcm_encrypt
-.align 5
-ppc_aes_gcm_encrypt:
-_ppc_aes_gcm_encrypt:
-
- stdu 1,-512(1)
- mflr 0
-
- std 14,112(1)
- std 15,120(1)
- std 16,128(1)
- std 17,136(1)
- std 18,144(1)
- std 19,152(1)
- std 20,160(1)
- std 21,168(1)
- li 9, 256
- stvx 20, 9, 1
- addi 9, 9, 16
- stvx 21, 9, 1
- addi 9, 9, 16
- stvx 22, 9, 1
- addi 9, 9, 16
- stvx 23, 9, 1
- addi 9, 9, 16
- stvx 24, 9, 1
- addi 9, 9, 16
- stvx 25, 9, 1
- addi 9, 9, 16
- stvx 26, 9, 1
- addi 9, 9, 16
- stvx 27, 9, 1
- addi 9, 9, 16
- stvx 28, 9, 1
- addi 9, 9, 16
- stvx 29, 9, 1
- addi 9, 9, 16
- stvx 30, 9, 1
- addi 9, 9, 16
- stvx 31, 9, 1
- std 0, 528(1)
-
-
- lxvb16x 32, 0, 8
-
+ vxor 24, 24, 26 # M
+
+ vpmsumd 26, 14, 19 # H4.H * X.H
+ vpmsumd 27, 11, 20
+ vpmsumd 28, 8, 21
+ vpmsumd 29, 5, 22
+
+ vxor 26, 26, 27
+ vxor 26, 26, 28
+ vxor 26, 26, 29
+
+ # sum hash and reduction with H Poly
+ vpmsumd 28, 23, 2 # reduction
+
+ vxor 1, 1, 1
+ vsldoi 25, 24, 1, 8 # mL
+ vsldoi 1, 1, 24, 8 # mH
+ vxor 23, 23, 25 # mL + L
+
+ # This performs swap and xor like,
+ # vsldoi 23, 23, 23, 8 # swap
+ # vxor 23, 23, 28
+ xxlor 32+29, 10, 10
+ vpermxor 23, 23, 28, 29
+
+ vxor 24, 26, 1 # H
+
+ # sum hash and reduction with H Poly
+ #
+ # vsldoi 25, 23, 23, 8 # swap
+ # vpmsumd 23, 23, 2
+ # vxor 27, 25, 24
+ #
+ vpermxor 27, 23, 24, 29
+ vpmsumd 23, 23, 2
+ vxor 0, 23, 27 # Digest of 8 blocks
+.endm
+
+#
+# Compute and update the GHASH digest for a single block
+# vs10: vpermxor vector
+# scratch: v1, v22..v27
+#
+.macro PPC_GHASH1x H S1
+
+ vxor 1, 1, 1
+
+ vpmsumd 22, 3, \S1 # L
+ vpmsumd 23, 4, \S1 # M
+ vpmsumd 24, 5, \S1 # H
+
+ vpmsumd 27, 22, 2 # reduction
+
+ vsldoi 25, 23, 1, 8 # mL
+ vsldoi 26, 1, 23, 8 # mH
+ vxor 22, 22, 25 # LL + LL
+ vxor 24, 24, 26 # HH + HH
+
+ xxlor 32+25, 10, 10
+ vpermxor 22, 22, 27, 25
+
+ # vsldoi 23, 22, 22, 8 # swap
+ # vpmsumd 22, 22, 2 # reduction
+ # vxor 23, 23, 24
+ vpermxor 23, 22, 24, 25
+ vpmsumd 22, 22, 2 # reduction
+
+ vxor \H, 22, 23
+.endm
+
+#
+# LOAD_HASH_TABLE
+# Xi = v0
+# H Poly = v2
+# Hash keys = v3 - v14
+#
+.macro LOAD_HASH_TABLE
+ # Load Xi
+ lxvb16x 32, 0, 8 # load Xi
+
+ vxor 1, 1, 1
li 10, 32
- lxvd2x 2+32, 10, 8
- li 10, 48
- lxvd2x 3+32, 10, 8
- li 10, 64
- lxvd2x 4+32, 10, 8
- li 10, 80
- lxvd2x 5+32, 10, 8
+ lxvd2x 2+32, 10, 8 # H Poly
- li 10, 96
- lxvd2x 6+32, 10, 8
+ # load Hash - h^4, h^3, h^2, h
+ li 10, 64
+ lxvd2x 4+32, 10, 8 # H
+ vsldoi 3, 1, 4, 8 # l
+ vsldoi 5, 4, 1, 8 # h
li 10, 112
- lxvd2x 7+32, 10, 8
- li 10, 128
- lxvd2x 8+32, 10, 8
-
- li 10, 144
- lxvd2x 9+32, 10, 8
+ lxvd2x 7+32, 10, 8 # H^2
+ vsldoi 6, 1, 7, 8 # l
+ vsldoi 8, 7, 1, 8 # h
li 10, 160
- lxvd2x 10+32, 10, 8
- li 10, 176
- lxvd2x 11+32, 10, 8
-
- li 10, 192
- lxvd2x 12+32, 10, 8
+ lxvd2x 10+32, 10, 8 # H^3
+ vsldoi 9, 1, 10, 8 # l
+ vsldoi 11, 10, 1, 8 # h
li 10, 208
- lxvd2x 13+32, 10, 8
- li 10, 224
- lxvd2x 14+32, 10, 8
-
-
- lxvb16x 30+32, 0, 7
-
- mr 12, 5
- li 11, 0
-
-
- vxor 31, 31, 31
- vspltisb 22,1
- vsldoi 31, 31, 22,1
-
-
- lxv 0, 0(6)
- lxv 1, 0x10(6)
- lxv 2, 0x20(6)
- lxv 3, 0x30(6)
- lxv 4, 0x40(6)
- lxv 5, 0x50(6)
- lxv 6, 0x60(6)
- lxv 7, 0x70(6)
- lxv 8, 0x80(6)
- lxv 9, 0x90(6)
- lxv 10, 0xa0(6)
-
-
- lwz 9,240(6)
+ lxvd2x 13+32, 10, 8 # H^4
+ vsldoi 12, 1, 13, 8 # l
+ vsldoi 14, 13, 1, 8 # h
+.endm
+
+.macro PROCESS_8X_AES_STATES
+ vcipherlast 15, 15, 1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
+
+ lxvb16x 32+23, 0, 14 # load block
+ lxvb16x 32+24, 15, 14 # load block
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block
+ addi 14, 14, 128
+ vxor 15, 15, 23
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
+
+ stxvb16x 47, 0, 9 # store output
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
+ addi 9, 9, 128
+.endm
+# COMPUTE_STATES: derive the next 8 counter states in v15-v22 from the last state saved in vs9, save the new last state back to vs9, then add round key 0 (vs0) to each.
+.macro COMPUTE_STATES
+ xxlor 32+15, 9, 9 # last state
+ vadduwm 15, 15, 31 # state + counter
+ vadduwm 16, 15, 31 # each following state = previous state + v31
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
+
+ xxlxor 32+15, 32+15, 0 # IV + round key - add round key 0
+ xxlxor 32+16, 32+16, 0
+ xxlxor 32+17, 32+17, 0
+ xxlxor 32+18, 32+18, 0
+ xxlxor 32+19, 32+19, 0
+ xxlxor 32+20, 32+20, 0
+ xxlxor 32+21, 32+21, 0
+ xxlxor 32+22, 32+22, 0
+.endm
+
+################################################################################
+# Compute AES and ghash one 16-byte block at a time, for every whole block in r5.
+# r5: remaining length, r6: round keys, r14: input, r9: output, r8: Xi, r11: byte count
+# r24: 0 = decrypt (hash the input block), non-zero = encrypt (hash the output block)
+# v30: current IV, v31: counter increment, vs0: roundkey 0, v0: running hash (Xi)
+# Clobbers r10, r12, r22, r23 (loaded with the round count), ctr, vs11, v1, v15-v21, v28, v29, plus whatever PPC_GHASH1x uses.
+################################################################################
+.align 4
+aes_gcm_crypt_1x:
+.localentry aes_gcm_crypt_1x,0
+
+ cmpdi 5, 16 # at least one whole block?
+ bge __More_1x
+ blr # no: nothing to do here
+__More_1x:
+ li 10, 16
+ divdu 12, 5, 10 # r12 = number of whole 16-byte blocks
+
+ xxlxor 32+15, 32+30, 0 # v15 = IV ^ round key 0
+
+ # Pre-load round keys 1-8 into scratch vectors.
+ lxv 32+16, 16(6) # round key 1
+ lxv 32+17, 32(6) # round key 2
+ lxv 32+18, 48(6) # round key 3
+ lxv 32+19, 64(6) # round key 4
+ lxv 32+20, 80(6) # round key 5
+ lxv 32+21, 96(6) # round key 6
+ lxv 32+28, 112(6) # round key 7
+ lxv 32+29, 128(6) # round key 8
+
+ lwz 23, 240(6) # n rounds
+ addi 22, 23, -9 # rounds left after key 8, excluding the last round
+ cmpdi 12, 0 # r12 is at least 1 here because r5 >= 16
+ bgt __Loop_1x
+ blr
+
+__Loop_1x:
+ mtctr 22 # loop count for the rounds not pre-loaded
+ addi 10, 6, 144 # r10 -> round key 9
+ vcipher 15, 15, 16 # rounds 1-8 with the pre-loaded keys
+ vcipher 15, 15, 17
+ vcipher 15, 15, 18
+ vcipher 15, 15, 19
+ vcipher 15, 15, 20
+ vcipher 15, 15, 21
+ vcipher 15, 15, 28
+ vcipher 15, 15, 29
+
+__Loop_aes_1state:
+ lxv 32+1, 0(10) # next round key
+ vcipher 15, 15, 1
+ addi 10, 10, 16 # advance to the following round key
+ bdnz __Loop_aes_1state
+ lxv 32+1, 0(10) # last round key
+ lxvb16x 11, 0, 14 # load input block
vcipherlast 15, 15, 1
- xxlor 32+29, 0, 0
- vxor 15, 30, 29
+ xxlxor 32+15, 32+15, 11 # AES state ^ input block
+ stxvb16x 32+15, 0, 9 # store output
+ addi 14, 14, 16 # advance input pointer
+ addi 9, 9, 16 # advance output pointer
- cmpdi 9, 10
- beq .Loop_aes_gcm_8x
+ cmpdi 24, 0 # decrypt?
+ bne __Encrypt_1x
+ xxlor 15+32, 11, 11 # decrypt: hash the input block instead of the output
+__Encrypt_1x:
+ vxor 15, 15, 0 # ^ previous hash
+ PPC_GHASH1x 0, 15
+ addi 5, 5, -16 # remaining length
+ addi 11, 11, 16 # bytes processed so far
- lxv 11, 0xb0(6)
- lxv 12, 0xc0(6)
+ vadduwm 30, 30, 31 # IV + counter
+ xxlxor 32+15, 32+30, 0 # next state = IV ^ round key 0
+ addi 12, 12, -1 # one block done
+ cmpdi 12, 0
+ bgt __Loop_1x
+
+ stxvb16x 32+0, 0, 8 # update Xi
+ blr
+.size aes_gcm_crypt_1x,.-aes_gcm_crypt_1x
+
+################################################################################
+# Process the final partial block; r5 = number of bytes left (presumably 1..15 - confirm at call sites).
+# Compute partial mask, encrypt the IV block, copy r5 output bytes via the stack.
+# Hash the masked block, store Xi to r8 and set r5 = 0.  r24: 0 = decrypt.
+# NOTE(review): r11 is not advanced here - confirm callers do not expect these bytes in the count.
+# Clobbers r10, r12, r22, r23, ctr, vs11, v1, v15-v17 and 16 bytes of stack at 224(r1).
+################################################################################
+.align 4
+__Process_partial:
+.localentry __Process_partial,0
+
+ # create partial mask
+ vspltisb 16, -1 # v16 = all ones
+ li 12, 16
+ sub 12, 12, 5 # 16 - len
+ sldi 12, 12, 3 # in bits
+ mtvsrdd 32+17, 0, 12 # shift amount into v17
+ vslo 16, 16, 17 # partial block mask
+
+ lxvb16x 11, 0, 14 # load partial block (NOTE(review): reads a full 16 bytes from r14)
+ xxland 11, 11, 32+16 # keep only the bytes selected by the mask
+
+ # AES crypt partial
+ xxlxor 32+15, 32+30, 0 # v15 = IV ^ round key 0
+ lwz 23, 240(6) # n rounds
+ addi 22, 23, -1 # loop - 1
+ mtctr 22
+ addi 10, 6, 16 # r10 -> round key 1
+
+__Loop_aes_pstate:
+ lxv 32+1, 0(10) # next round key
+ vcipher 15, 15, 1
+ addi 10, 10, 16 # advance to the following round key
+ bdnz __Loop_aes_pstate
+ lxv 32+1, 0(10) # last round key
vcipherlast 15, 15, 1
- cmpdi 9, 12
- beq .Loop_aes_gcm_8x
+ xxlxor 32+15, 32+15, 11 # AES state ^ masked input
+ vand 15, 15, 16 # mask the output as well
+ # AES crypt output v15
+ # Write partial
+ li 10, 224
+ stxvb16x 15+32, 10, 1 # write v15 to stack
+ addi 10, 1, 223 # source - 1 (lbzu pre-increments)
+ addi 12, 9, -1 # destination - 1 (stbu pre-increments)
+ mtctr 5 # partial block len
+__Write_partial:
+ lbzu 22, 1(10) # copy one byte from the stack buffer
+ stbu 22, 1(12) # to the output
+ bdnz __Write_partial
+
+ cmpdi 24, 0 # decrypt?
+ bne __Encrypt_partial
+ xxlor 32+15, 11, 11 # decrypt using the input block
+__Encrypt_partial:
+ vxor 15, 15, 0 # ^ previous hash
+ PPC_GHASH1x 0, 15
+ li 5, 0 # done last byte
+ stxvb16x 32+0, 0, 8 # Update Xi
+ blr
+.size __Process_partial,.-__Process_partial
+
+################################################################################
+# ppc_aes_gcm_encrypt (const void *inp, void *out, size_t len,
+# const char *rk, unsigned char iv[16], void *Xip);
+#
+# r3 - inp
+# r4 - out
+# r5 - len
+# r6 - AES round keys
+# r7 - iv
+# r8 - Xi, HPoli, hash keys
+#
+# rounds is at offset 240 in rk
+# Xi is at 0 in gcm_table (Xip).
+#
+################################################################################
+.global ppc_aes_gcm_encrypt
+.align 5
+ppc_aes_gcm_encrypt:
+.localentry ppc_aes_gcm_encrypt,0
- lxv 13, 0xd0(6)
- lxv 14, 0xe0(6)
- cmpdi 9, 14
- beq .Loop_aes_gcm_8x
+ SAVE_REGS
+ LOAD_HASH_TABLE
- b aes_gcm_out
+ # initialize ICB: GHASH( IV ), IV - r7
+ lxvb16x 30+32, 0, 7 # load IV - v30
-.align 5
-.Loop_aes_gcm_8x:
mr 14, 3
mr 9, 4
+ # counter 1
+ vxor 31, 31, 31
+ vspltisb 22, 1
+ vsldoi 31, 31, 22,1 # counter 1
+
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxv 10, 0(11) # vs10: vpermxor vector
+ li 11, 0
+ lxv 0, 0(6) # round key 0
+
+ #
+ # Process different blocks
+ #
+ cmpdi 5, 128
+ blt __Process_more_enc
+
+ # load 9 round keys
+ lxv 32+23, 16(6) # round key 1
+ lxv 32+24, 32(6) # round key 2
+ lxv 32+25, 48(6) # round key 3
+ lxv 32+26, 64(6) # round key 4
+ lxv 32+27, 80(6) # round key 5
+ lxv 32+28, 96(6) # round key 6
+ lxv 32+29, 112(6) # round key 7
+ lxv 32+1, 128(6) # round key 8
+
+ # load rounds - 10 (128), 12 (192), 14 (256)
+ lwz 23, 240(6) # n rounds
+
+__Process_encrypt:
+#
+# Process 8x AES/GCM blocks
+#
+__Process_8x_enc:
+ # 8x blocks
li 10, 128
- divdu 10, 5, 10
- cmpdi 10, 0
- beq .Loop_last_block
-
- .long 0x13DEF8C0
- vxor 16, 30, 29
- .long 0x13DEF8C0
- vxor 17, 30, 29
- .long 0x13DEF8C0
- vxor 18, 30, 29
- .long 0x13DEF8C0
- vxor 19, 30, 29
- .long 0x13DEF8C0
- vxor 20, 30, 29
- .long 0x13DEF8C0
- vxor 21, 30, 29
- .long 0x13DEF8C0
- vxor 22, 30, 29
-
- mtctr 10
+ divdu 12, 5, 10 # n 128 bytes-blocks
+
+ addi 12, 12, -1 # loop - 1
+
+ vmr 15, 30 # first state: IV
+ vadduwm 16, 15, 31 # state + counter
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
+
+ # vxor state, state, w # addroundkey
+ xxlxor 32+15, 32+15, 0 # IV + round key - add round key 0
+ xxlxor 32+16, 32+16, 0
+ xxlxor 32+17, 32+17, 0
+ xxlxor 32+18, 32+18, 0
+ xxlxor 32+19, 32+19, 0
+ xxlxor 32+20, 32+20, 0
+ xxlxor 32+21, 32+21, 0
+ xxlxor 32+22, 32+22, 0
li 15, 16
li 16, 32
@@ -536,523 +591,185 @@ _ppc_aes_gcm_encrypt:
li 20, 96
li 21, 112
- lwz 10, 240(6)
-
-.Loop_8x_block:
-
- lxvb16x 15, 0, 14
- lxvb16x 16, 15, 14
- lxvb16x 17, 16, 14
- lxvb16x 18, 17, 14
- lxvb16x 19, 18, 14
- lxvb16x 20, 19, 14
- lxvb16x 21, 20, 14
- lxvb16x 22, 21, 14
- addi 14, 14, 128
-
-.Loop_aes_middle8x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_next_ghash
-
-
- xxlor 24+32, 11, 11
-
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-
- .long 0x11EFC508
- .long 0x1210C508
- .long 0x1231C508
- .long 0x1252C508
- .long 0x1273C508
- .long 0x1294C508
- .long 0x12B5C508
- .long 0x12D6C508
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_next_ghash
-
-
- xxlor 24+32, 13, 13
-
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-
- .long 0x11EFC508
- .long 0x1210C508
- .long 0x1231C508
- .long 0x1252C508
- .long 0x1273C508
- .long 0x1294C508
- .long 0x12B5C508
- .long 0x12D6C508
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_next_ghash
- b aes_gcm_out
-
-Do_next_ghash:
-
-
-
- .long 0x11EFBD09
- .long 0x1210BD09
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9
- xxlxor 48, 48, 16
- stxvb16x 48, 15, 9
-
- .long 0x1231BD09
- .long 0x1252BD09
-
- xxlxor 49, 49, 17
- stxvb16x 49, 16, 9
- xxlxor 50, 50, 18
- stxvb16x 50, 17, 9
-
- .long 0x1273BD09
- .long 0x1294BD09
-
- xxlxor 51, 51, 19
- stxvb16x 51, 18, 9
- xxlxor 52, 52, 20
- stxvb16x 52, 19, 9
-
- .long 0x12B5BD09
- .long 0x12D6BD09
-
- xxlxor 53, 53, 21
- stxvb16x 53, 20, 9
- xxlxor 54, 54, 22
- stxvb16x 54, 21, 9
-
- addi 9, 9, 128
-
-
- ppc_aes_gcm_ghash2_4x
-
- xxlor 27+32, 0, 0
- .long 0x13DEF8C0
- vor 29,30,30
- vxor 15, 30, 27
- .long 0x13DEF8C0
- vxor 16, 30, 27
- .long 0x13DEF8C0
- vxor 17, 30, 27
- .long 0x13DEF8C0
- vxor 18, 30, 27
- .long 0x13DEF8C0
- vxor 19, 30, 27
- .long 0x13DEF8C0
- vxor 20, 30, 27
- .long 0x13DEF8C0
- vxor 21, 30, 27
- .long 0x13DEF8C0
- vxor 22, 30, 27
-
- addi 12, 12, -128
- addi 11, 11, 128
-
- bdnz .Loop_8x_block
-
- vor 30,29,29
-
-.Loop_last_block:
- cmpdi 12, 0
- beq aes_gcm_out
-
-
- li 10, 16
- divdu 10, 12, 10
-
- mtctr 10
-
- lwz 10, 240(6)
-
- cmpdi 12, 16
- blt Final_block
-
-.macro .Loop_aes_middle_1x
- xxlor 19+32, 1, 1
- xxlor 20+32, 2, 2
- xxlor 21+32, 3, 3
- xxlor 22+32, 4, 4
-
- .long 0x11EF9D08
- .long 0x11EFA508
- .long 0x11EFAD08
- .long 0x11EFB508
-
- xxlor 19+32, 5, 5
- xxlor 20+32, 6, 6
- xxlor 21+32, 7, 7
- xxlor 22+32, 8, 8
-
- .long 0x11EF9D08
- .long 0x11EFA508
- .long 0x11EFAD08
- .long 0x11EFB508
-
- xxlor 19+32, 9, 9
- .long 0x11EF9D08
-.endm
-
-Next_rem_block:
- lxvb16x 15, 0, 14
-
-.Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
+ #
+ # Pre-compute first 8 AES state and leave 1/3/5 more rounds
+ # for the loop.
+ #
+ addi 22, 23, -9 # process 8 keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
- cmpdi 10, 10
- beq Do_next_1x
+ LOOP_8AES_STATE # process 8 AES keys
+__PreLoop_aes_state:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x 1
+ addi 10, 10, 16
+ bdnz __PreLoop_aes_state
+ lxv 32+1, 0(10) # last round key (v1)
- xxlor 24+32, 11, 11
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_next_1x
-
-
- xxlor 24+32, 13, 13
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_next_1x
-
-Do_next_1x:
- .long 0x11EFBD09
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9
- addi 14, 14, 16
- addi 9, 9, 16
+ cmpdi 12, 0 # Only one loop (8 block)
+ beq __Finish_ghash
- vor 28,15,15
- ppc_update_hash_1x
+#
+# Loop 8x blocks and compute ghash
+#
+__Loop_8x_block_enc:
+ PROCESS_8X_AES_STATES
- addi 12, 12, -16
- addi 11, 11, 16
- xxlor 19+32, 0, 0
- .long 0x13DEF8C0
- vxor 15, 30, 19
+ # Compute ghash here
+ vxor 15, 15, 0
+ PPC_GFMUL128_8x
+
+ COMPUTE_STATES
+
+ addi 5, 5, -128
+ addi 11, 11, 128
+
+ lxv 32+23, 16(6) # round key 1
+ lxv 32+24, 32(6) # round key 2
+ lxv 32+25, 48(6) # round key 3
+ lxv 32+26, 64(6) # round key 4
+ lxv 32+27, 80(6) # round key 5
+ lxv 32+28, 96(6) # round key 6
+ lxv 32+29, 112(6) # round key 7
+ lxv 32+1, 128(6) # round key 8
+
+ # Compute first 8 AES state and leave 1/3/5 more rounds
+ # for the loop.
+ LOOP_8AES_STATE # process 8 AES keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
+
+__LastLoop_aes_state:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x 1
+ addi 10, 10, 16
+ bdnz __LastLoop_aes_state
- bdnz Next_rem_block
+ lxv 32+1, 0(10) # last round key (v1)
+ addi 12, 12, -1
cmpdi 12, 0
- beq aes_gcm_out
-
-Final_block:
-.Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_final_1x
-
-
- xxlor 24+32, 11, 11
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_final_1x
+ bne __Loop_8x_block_enc
+ #
+ # Remaining blocks
+ #
+__Finish_ghash:
+ PROCESS_8X_AES_STATES
- xxlor 24+32, 13, 13
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_final_1x
-
-Do_final_1x:
- .long 0x11EFBD09
-
- lxvb16x 15, 0, 14
- xxlxor 47, 47, 15
-
-
- li 15, 16
- sub 15, 15, 12
+ # Compute ghash here
+ vxor 15, 15, 0
+ PPC_GFMUL128_8x
- vspltisb 16,-1
- vspltisb 17,0
- li 10, 192
- stvx 16, 10, 1
- addi 10, 10, 16
- stvx 17, 10, 1
+ # Update IV and Xi
+ xxlor 30+32, 9, 9 # last ctr
+ vadduwm 30, 30, 31 # increase ctr
+ stxvb16x 32+0, 0, 8 # update Xi
- addi 10, 1, 192
- lxvb16x 16, 15, 10
- xxland 47, 47, 16
+ addi 5, 5, -128
+ addi 11, 11, 128
- vor 28,15,15
- ppc_update_hash_1x
+ #
+ # Done 8x blocks
+ #
+ cmpdi 5, 0
+ beq aes_gcm_out
- bl Write_partial_block
+__Process_more_enc:
+ li 24, 1 # encrypt
+ bl aes_gcm_crypt_1x
+ cmpdi 5, 0
+ beq aes_gcm_out
+ bl __Process_partial
b aes_gcm_out
+.size ppc_aes_gcm_encrypt,.-ppc_aes_gcm_encrypt
-
-
-
-
-
-Write_partial_block:
- li 10, 192
- stxvb16x 15+32, 10, 1
-
-
- addi 10, 9, -1
- addi 16, 1, 191
-
- mtctr 12
- li 15, 0
-
-Write_last_byte:
- lbzu 14, 1(16)
- stbu 14, 1(10)
- bdnz Write_last_byte
- blr
-
-aes_gcm_out:
-
- stxvb16x 32, 0, 8
- add 3, 11, 12
-
- li 9, 256
- lvx 20, 9, 1
- addi 9, 9, 16
- lvx 21, 9, 1
- addi 9, 9, 16
- lvx 22, 9, 1
- addi 9, 9, 16
- lvx 23, 9, 1
- addi 9, 9, 16
- lvx 24, 9, 1
- addi 9, 9, 16
- lvx 25, 9, 1
- addi 9, 9, 16
- lvx 26, 9, 1
- addi 9, 9, 16
- lvx 27, 9, 1
- addi 9, 9, 16
- lvx 28, 9, 1
- addi 9, 9, 16
- lvx 29, 9, 1
- addi 9, 9, 16
- lvx 30, 9, 1
- addi 9, 9, 16
- lvx 31, 9, 1
-
- ld 0, 528(1)
- ld 14,112(1)
- ld 15,120(1)
- ld 16,128(1)
- ld 17,136(1)
- ld 18,144(1)
- ld 19,152(1)
- ld 20,160(1)
- ld 21,168(1)
-
- mtlr 0
- addi 1, 1, 512
- blr
-
-
-
-
-.global ppc_aes_gcm_decrypt
-.align 5
+################################################################################
+# ppc_aes_gcm_decrypt (const void *inp, void *out, size_t len,
+# const char *rk, unsigned char iv[16], void *Xip);
+# 8x Decrypt
+#
+################################################################################
+.global ppc_aes_gcm_decrypt
+.align 5
ppc_aes_gcm_decrypt:
-_ppc_aes_gcm_decrypt:
-
- stdu 1,-512(1)
- mflr 0
-
- std 14,112(1)
- std 15,120(1)
- std 16,128(1)
- std 17,136(1)
- std 18,144(1)
- std 19,152(1)
- std 20,160(1)
- std 21,168(1)
- li 9, 256
- stvx 20, 9, 1
- addi 9, 9, 16
- stvx 21, 9, 1
- addi 9, 9, 16
- stvx 22, 9, 1
- addi 9, 9, 16
- stvx 23, 9, 1
- addi 9, 9, 16
- stvx 24, 9, 1
- addi 9, 9, 16
- stvx 25, 9, 1
- addi 9, 9, 16
- stvx 26, 9, 1
- addi 9, 9, 16
- stvx 27, 9, 1
- addi 9, 9, 16
- stvx 28, 9, 1
- addi 9, 9, 16
- stvx 29, 9, 1
- addi 9, 9, 16
- stvx 30, 9, 1
- addi 9, 9, 16
- stvx 31, 9, 1
- std 0, 528(1)
-
-
- lxvb16x 32, 0, 8
-
-
- li 10, 32
- lxvd2x 2+32, 10, 8
- li 10, 48
- lxvd2x 3+32, 10, 8
- li 10, 64
- lxvd2x 4+32, 10, 8
- li 10, 80
- lxvd2x 5+32, 10, 8
-
- li 10, 96
- lxvd2x 6+32, 10, 8
- li 10, 112
- lxvd2x 7+32, 10, 8
- li 10, 128
- lxvd2x 8+32, 10, 8
-
- li 10, 144
- lxvd2x 9+32, 10, 8
- li 10, 160
- lxvd2x 10+32, 10, 8
- li 10, 176
- lxvd2x 11+32, 10, 8
-
- li 10, 192
- lxvd2x 12+32, 10, 8
- li 10, 208
- lxvd2x 13+32, 10, 8
- li 10, 224
- lxvd2x 14+32, 10, 8
-
-
- lxvb16x 30+32, 0, 7
-
- mr 12, 5
- li 11, 0
-
+.localentry ppc_aes_gcm_decrypt, 0
- vxor 31, 31, 31
- vspltisb 22,1
- vsldoi 31, 31, 22,1
-
-
- lxv 0, 0(6)
- lxv 1, 0x10(6)
- lxv 2, 0x20(6)
- lxv 3, 0x30(6)
- lxv 4, 0x40(6)
- lxv 5, 0x50(6)
- lxv 6, 0x60(6)
- lxv 7, 0x70(6)
- lxv 8, 0x80(6)
- lxv 9, 0x90(6)
- lxv 10, 0xa0(6)
-
-
- lwz 9,240(6)
-
-
-
- xxlor 32+29, 0, 0
- vxor 15, 30, 29
-
- cmpdi 9, 10
- beq .Loop_aes_gcm_8x_dec
-
-
- lxv 11, 0xb0(6)
- lxv 12, 0xc0(6)
-
- cmpdi 9, 12
- beq .Loop_aes_gcm_8x_dec
+ SAVE_REGS
+ LOAD_HASH_TABLE
+ # initialize ICB: GHASH( IV ), IV - r7
+ lxvb16x 30+32, 0, 7 # load IV - v30
- lxv 13, 0xd0(6)
- lxv 14, 0xe0(6)
- cmpdi 9, 14
- beq .Loop_aes_gcm_8x_dec
-
- b aes_gcm_out
-
-.align 5
-.Loop_aes_gcm_8x_dec:
mr 14, 3
mr 9, 4
+ # counter 1
+ vxor 31, 31, 31
+ vspltisb 22, 1
+ vsldoi 31, 31, 22,1 # counter 1
+
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxv 10, 0(11) # vs10: vpermxor vector
+ li 11, 0
+ lxv 0, 0(6) # round key 0
+
+ #
+ # Process different blocks
+ #
+ cmpdi 5, 128
+ blt __Process_more_dec
+
+ # load 9 round keys
+ lxv 32+23, 16(6) # round key 1
+ lxv 32+24, 32(6) # round key 2
+ lxv 32+25, 48(6) # round key 3
+ lxv 32+26, 64(6) # round key 4
+ lxv 32+27, 80(6) # round key 5
+ lxv 32+28, 96(6) # round key 6
+ lxv 32+29, 112(6) # round key 7
+ lxv 32+1, 128(6) # round key 8
+
+ # load rounds - 10 (128), 12 (192), 14 (256)
+ lwz 23, 240(6) # n rounds
+
+__Process_decrypt:
+#
+# Process 8x AES/GCM blocks
+#
+__Process_8x_dec:
+ # 8x blocks
li 10, 128
- divdu 10, 5, 10
- cmpdi 10, 0
- beq .Loop_last_block_dec
-
- .long 0x13DEF8C0
- vxor 16, 30, 29
- .long 0x13DEF8C0
- vxor 17, 30, 29
- .long 0x13DEF8C0
- vxor 18, 30, 29
- .long 0x13DEF8C0
- vxor 19, 30, 29
- .long 0x13DEF8C0
- vxor 20, 30, 29
- .long 0x13DEF8C0
- vxor 21, 30, 29
- .long 0x13DEF8C0
- vxor 22, 30, 29
-
- mtctr 10
+ divdu 12, 5, 10 # n 128 bytes-blocks
+
+ addi 12, 12, -1 # loop - 1
+
+ vmr 15, 30 # first state: IV
+ vadduwm 16, 15, 31 # state + counter
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
+
+ # vxor state, state, w # addroundkey
+ xxlxor 32+15, 32+15, 0 # IV + round key - add round key 0
+ xxlxor 32+16, 32+16, 0
+ xxlxor 32+17, 32+17, 0
+ xxlxor 32+18, 32+18, 0
+ xxlxor 32+19, 32+19, 0
+ xxlxor 32+20, 32+20, 0
+ xxlxor 32+21, 32+21, 0
+ xxlxor 32+22, 32+22, 0
li 15, 16
li 16, 32
@@ -1062,279 +779,215 @@ _ppc_aes_gcm_decrypt:
li 20, 96
li 21, 112
- lwz 10, 240(6)
+ #
+ # Pre-compute first 8 AES state and leave 1/3/5 more rounds
+ # for the loop.
+ #
+ addi 22, 23, -9 # process 8 keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
-.Loop_8x_block_dec:
+ LOOP_8AES_STATE # process 8 AES keys
- lxvb16x 15, 0, 14
- lxvb16x 16, 15, 14
- lxvb16x 17, 16, 14
- lxvb16x 18, 17, 14
- lxvb16x 19, 18, 14
- lxvb16x 20, 19, 14
- lxvb16x 21, 20, 14
- lxvb16x 22, 21, 14
+__PreLoop_aes_state_dec:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x 1
+ addi 10, 10, 16
+ bdnz __PreLoop_aes_state_dec
+ lxv 32+1, 0(10) # last round key (v1)
+
+ cmpdi 12, 0 # Only one loop (8 block)
+ beq __Finish_ghash_dec
+
+#
+# Loop 8x blocks and compute ghash
+#
+__Loop_8x_block_dec:
+ vcipherlast 15, 15, 1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
+
+ lxvb16x 32+23, 0, 14 # load block
+ lxvb16x 32+24, 15, 14 # load block
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block
addi 14, 14, 128
-.Loop_aes_middle8x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_last_aes_dec
-
-
- xxlor 24+32, 11, 11
-
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-
- .long 0x11EFC508
- .long 0x1210C508
- .long 0x1231C508
- .long 0x1252C508
- .long 0x1273C508
- .long 0x1294C508
- .long 0x12B5C508
- .long 0x12D6C508
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_last_aes_dec
-
-
- xxlor 24+32, 13, 13
-
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-
- .long 0x11EFC508
- .long 0x1210C508
- .long 0x1231C508
- .long 0x1252C508
- .long 0x1273C508
- .long 0x1294C508
- .long 0x12B5C508
- .long 0x12D6C508
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_last_aes_dec
- b aes_gcm_out
-
-Do_last_aes_dec:
-
-
-
- .long 0x11EFBD09
- .long 0x1210BD09
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9
- xxlxor 48, 48, 16
- stxvb16x 48, 15, 9
-
- .long 0x1231BD09
- .long 0x1252BD09
-
- xxlxor 49, 49, 17
- stxvb16x 49, 16, 9
- xxlxor 50, 50, 18
- stxvb16x 50, 17, 9
-
- .long 0x1273BD09
- .long 0x1294BD09
-
- xxlxor 51, 51, 19
- stxvb16x 51, 18, 9
- xxlxor 52, 52, 20
- stxvb16x 52, 19, 9
-
- .long 0x12B5BD09
- .long 0x12D6BD09
-
- xxlxor 53, 53, 21
- stxvb16x 53, 20, 9
- xxlxor 54, 54, 22
- stxvb16x 54, 21, 9
+ vxor 15, 15, 23
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
+
+ stxvb16x 47, 0, 9 # store output
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
addi 9, 9, 128
- xxlor 15+32, 15, 15
- xxlor 16+32, 16, 16
- xxlor 17+32, 17, 17
- xxlor 18+32, 18, 18
- xxlor 19+32, 19, 19
- xxlor 20+32, 20, 20
- xxlor 21+32, 21, 21
- xxlor 22+32, 22, 22
-
-
- ppc_aes_gcm_ghash2_4x
-
- xxlor 27+32, 0, 0
- .long 0x13DEF8C0
- vor 29,30,30
- vxor 15, 30, 27
- .long 0x13DEF8C0
- vxor 16, 30, 27
- .long 0x13DEF8C0
- vxor 17, 30, 27
- .long 0x13DEF8C0
- vxor 18, 30, 27
- .long 0x13DEF8C0
- vxor 19, 30, 27
- .long 0x13DEF8C0
- vxor 20, 30, 27
- .long 0x13DEF8C0
- vxor 21, 30, 27
- .long 0x13DEF8C0
- vxor 22, 30, 27
- addi 12, 12, -128
- addi 11, 11, 128
-
- bdnz .Loop_8x_block_dec
-
- vor 30,29,29
-
-.Loop_last_block_dec:
- cmpdi 12, 0
- beq aes_gcm_out
-
-
- li 10, 16
- divdu 10, 12, 10
-
- mtctr 10
-
- lwz 10,240(6)
-
- cmpdi 12, 16
- blt Final_block_dec
-
-Next_rem_block_dec:
- lxvb16x 15, 0, 14
-
-.Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_next_1x_dec
-
-
- xxlor 24+32, 11, 11
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_next_1x_dec
-
-
- xxlor 24+32, 13, 13
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_next_1x_dec
-
-Do_next_1x_dec:
- .long 0x11EFBD09
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9
- addi 14, 14, 16
- addi 9, 9, 16
-
- xxlor 28+32, 15, 15
- ppc_update_hash_1x
+ vmr 15, 23
+ vmr 16, 24
+ vmr 17, 25
+ vmr 18, 26
+ vmr 19, 27
+ vmr 20, 28
+ vmr 21, 29
+ vmr 22, 30
- addi 12, 12, -16
- addi 11, 11, 16
- xxlor 19+32, 0, 0
- .long 0x13DEF8C0
- vxor 15, 30, 19
+ # ghash here
+ vxor 15, 15, 0
+ PPC_GFMUL128_8x
+
+ xxlor 32+15, 9, 9 # last state
+ vadduwm 15, 15, 31 # state + counter
+ vadduwm 16, 15, 31
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
+
+ xxlor 32+27, 0, 0 # restore roundkey 0
+ vxor 15, 15, 27 # IV + round key - add round key 0
+ vxor 16, 16, 27
+ vxor 17, 17, 27
+ vxor 18, 18, 27
+ vxor 19, 19, 27
+ vxor 20, 20, 27
+ vxor 21, 21, 27
+ vxor 22, 22, 27
- bdnz Next_rem_block_dec
+ addi 5, 5, -128
+ addi 11, 11, 128
+
+ lxv 32+23, 16(6) # round key 1
+ lxv 32+24, 32(6) # round key 2
+ lxv 32+25, 48(6) # round key 3
+ lxv 32+26, 64(6) # round key 4
+ lxv 32+27, 80(6) # round key 5
+ lxv 32+28, 96(6) # round key 6
+ lxv 32+29, 112(6) # round key 7
+ lxv 32+1, 128(6) # round key 8
+
+ LOOP_8AES_STATE # process 8 AES keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
+__LastLoop_aes_state_dec:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x 1
+ addi 10, 10, 16
+ bdnz __LastLoop_aes_state_dec
+ lxv 32+1, 0(10) # last round key (v1)
+ addi 12, 12, -1
cmpdi 12, 0
- beq aes_gcm_out
-
-Final_block_dec:
-.Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_final_1x_dec
-
-
- xxlor 24+32, 11, 11
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_final_1x_dec
-
+ bne __Loop_8x_block_dec
+
+__Finish_ghash_dec:
+ vcipherlast 15, 15, 1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
+
+ lxvb16x 32+23, 0, 14 # load block
+ lxvb16x 32+24, 15, 14 # load block
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block
+ addi 14, 14, 128
- xxlor 24+32, 13, 13
+ vxor 15, 15, 23
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
+
+ stxvb16x 47, 0, 9 # store output
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
+ addi 9, 9, 128
- .long 0x11EFBD08
- .long 0x11EFC508
+ vxor 15, 23, 0
+ vmr 16, 24
+ vmr 17, 25
+ vmr 18, 26
+ vmr 19, 27
+ vmr 20, 28
+ vmr 21, 29
+ vmr 22, 30
- xxlor 23+32, 14, 14
+ #vxor 15, 15, 0
+ PPC_GFMUL128_8x
- cmpdi 10, 14
- beq Do_final_1x_dec
+ xxlor 30+32, 9, 9 # last ctr
+ vadduwm 30, 30, 31 # increase ctr
+ stxvb16x 32+0, 0, 8 # update Xi
-Do_final_1x_dec:
- .long 0x11EFBD09
+ addi 5, 5, -128
+ addi 11, 11, 128
- lxvb16x 15, 0, 14
- xxlxor 47, 47, 15
+ #
+ # Done 8x blocks
+ #
+ cmpdi 5, 0
+ beq aes_gcm_out
- li 15, 16
- sub 15, 15, 12
+__Process_more_dec:
+ li 24, 0 # decrypt
+ bl aes_gcm_crypt_1x
+ cmpdi 5, 0
+ beq aes_gcm_out
- vspltisb 16,-1
- vspltisb 17,0
- li 10, 192
- stvx 16, 10, 1
- addi 10, 10, 16
- stvx 17, 10, 1
-
- addi 10, 1, 192
- lxvb16x 16, 15, 10
- xxland 47, 47, 16
+ bl __Process_partial
+ b aes_gcm_out
+.size ppc_aes_gcm_decrypt,.-ppc_aes_gcm_decrypt
- xxlor 28+32, 15, 15
- ppc_update_hash_1x
+aes_gcm_out: # common exit reached by branches from the encrypt and decrypt paths
+.localentry aes_gcm_out,0
+ mr 3, 11 # return count
- bl Write_partial_block
+ RESTORE_REGS # presumably undoes SAVE_REGS (saved registers, stack frame, LR)
+ blr
+.size aes_gcm_out,.-aes_gcm_out
- b aes_gcm_out
+.rodata
+.align 4
+# for vector permute and xor
+permx:
+.long 0x4c5d6e7f, 0x08192a3b, 0xc4d5e6f7, 0x8091a2b3
diff --git a/sys/crypto/openssl/powerpc64le/aes-gcm-ppc.S b/sys/crypto/openssl/powerpc64le/aes-gcm-ppc.S
index 2ff143c42ab7..51cfac7e45fc 100644
--- a/sys/crypto/openssl/powerpc64le/aes-gcm-ppc.S
+++ b/sys/crypto/openssl/powerpc64le/aes-gcm-ppc.S
@@ -1,532 +1,587 @@
/* Do not modify. This file is auto-generated from aes-gcm-ppc.pl. */
-.machine "any"
-.abiversion 2
+.machine "any"
.text
-
-
-
-
-.macro .Loop_aes_middle4x
- xxlor 19+32, 1, 1
- xxlor 20+32, 2, 2
- xxlor 21+32, 3, 3
- xxlor 22+32, 4, 4
-
- .long 0x11EF9D08
- .long 0x12109D08
- .long 0x12319D08
- .long 0x12529D08
-
- .long 0x11EFA508
- .long 0x1210A508
- .long 0x1231A508
- .long 0x1252A508
-
- .long 0x11EFAD08
- .long 0x1210AD08
- .long 0x1231AD08
- .long 0x1252AD08
-
- .long 0x11EFB508
- .long 0x1210B508
- .long 0x1231B508
- .long 0x1252B508
-
- xxlor 19+32, 5, 5
- xxlor 20+32, 6, 6
- xxlor 21+32, 7, 7
- xxlor 22+32, 8, 8
-
- .long 0x11EF9D08
- .long 0x12109D08
- .long 0x12319D08
- .long 0x12529D08
-
- .long 0x11EFA508
- .long 0x1210A508
- .long 0x1231A508
- .long 0x1252A508
-
- .long 0x11EFAD08
- .long 0x1210AD08
- .long 0x1231AD08
- .long 0x1252AD08
-
- .long 0x11EFB508
- .long 0x1210B508
- .long 0x1231B508
- .long 0x1252B508
-
- xxlor 23+32, 9, 9
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
-.endm
-
-
-
-
-
-.macro .Loop_aes_middle8x
- xxlor 23+32, 1, 1
- xxlor 24+32, 2, 2
- xxlor 25+32, 3, 3
- xxlor 26+32, 4, 4
-
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-
- .long 0x11EFC508
- .long 0x1210C508
- .long 0x1231C508
- .long 0x1252C508
- .long 0x1273C508
- .long 0x1294C508
- .long 0x12B5C508
- .long 0x12D6C508
-
- .long 0x11EFCD08
- .long 0x1210CD08
- .long 0x1231CD08
- .long 0x1252CD08
- .long 0x1273CD08
- .long 0x1294CD08
- .long 0x12B5CD08
- .long 0x12D6CD08
-
- .long 0x11EFD508
- .long 0x1210D508
- .long 0x1231D508
- .long 0x1252D508
- .long 0x1273D508
- .long 0x1294D508
- .long 0x12B5D508
- .long 0x12D6D508
-
- xxlor 23+32, 5, 5
- xxlor 24+32, 6, 6
- xxlor 25+32, 7, 7
- xxlor 26+32, 8, 8
-
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-
- .long 0x11EFC508
- .long 0x1210C508
- .long 0x1231C508
- .long 0x1252C508
- .long 0x1273C508
- .long 0x1294C508
- .long 0x12B5C508
- .long 0x12D6C508
-
- .long 0x11EFCD08
- .long 0x1210CD08
- .long 0x1231CD08
- .long 0x1252CD08
- .long 0x1273CD08
- .long 0x1294CD08
- .long 0x12B5CD08
- .long 0x12D6CD08
-
- .long 0x11EFD508
- .long 0x1210D508
- .long 0x1231D508
- .long 0x1252D508
- .long 0x1273D508
- .long 0x1294D508
- .long 0x12B5D508
- .long 0x12D6D508
-
- xxlor 23+32, 9, 9
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-.endm
-
-
-
-
-ppc_aes_gcm_ghash:
- vxor 15, 15, 0
-
- xxlxor 29, 29, 29
-
- .long 0x12EC7CC8
- .long 0x130984C8
- .long 0x13268CC8
- .long 0x134394C8
-
- vxor 23, 23, 24
- vxor 23, 23, 25
- vxor 23, 23, 26
-
- .long 0x130D7CC8
- .long 0x132A84C8
- .long 0x13478CC8
- .long 0x136494C8
-
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27
-
-
- .long 0x139714C8
-
- xxlor 29+32, 29, 29
- vsldoi 26, 24, 29, 8
- vsldoi 29, 29, 24, 8
- vxor 23, 23, 26
-
- vsldoi 23, 23, 23, 8
- vxor 23, 23, 28
-
- .long 0x130E7CC8
- .long 0x132B84C8
- .long 0x13488CC8
- .long 0x136594C8
-
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27
-
- vxor 24, 24, 29
-
-
- vsldoi 27, 23, 23, 8
- .long 0x12F714C8
- vxor 27, 27, 24
- vxor 23, 23, 27
-
- xxlor 32, 23+32, 23+32
-
- blr
-
-
-
-
-
-.macro ppc_aes_gcm_ghash2_4x
-
- vxor 15, 15, 0
-
- xxlxor 29, 29, 29
-
- .long 0x12EC7CC8
- .long 0x130984C8
- .long 0x13268CC8
- .long 0x134394C8
+.macro SAVE_REGS
+ mflr 0 # SAVE_REGS: save LR, push a 512-byte frame, save r14-r24 and v20-v31
+ std 0, 16(1) # LR goes to 16(r1) of the caller's frame, before r1 is moved
+ stdu 1,-512(1) # allocate the frame and store the back chain
+
+ std 14, 112(1) # GPRs r14-r24 at 112..192(r1)
+ std 15, 120(1)
+ std 16, 128(1)
+ std 17, 136(1)
+ std 18, 144(1)
+ std 19, 152(1)
+ std 20, 160(1)
+ std 21, 168(1)
+ std 22, 176(1)
+ std 23, 184(1)
+ std 24, 192(1)
+
+ stxv 32+20, 256(1) # vector registers v20-v31 at 256..432(r1)
+ stxv 32+21, 256+16(1)
+ stxv 32+22, 256+32(1)
+ stxv 32+23, 256+48(1)
+ stxv 32+24, 256+64(1)
+ stxv 32+25, 256+80(1)
+ stxv 32+26, 256+96(1)
+ stxv 32+27, 256+112(1)
+ stxv 32+28, 256+128(1)
+ stxv 32+29, 256+144(1)
+ stxv 32+30, 256+160(1)
+ stxv 32+31, 256+176(1)
+.endm # SAVE_REGS
+
+.macro RESTORE_REGS
+ lxv 32+20, 256(1) # RESTORE_REGS: reload v20-v31 and r14-r24, pop the 512-byte frame, restore LR
+ lxv 32+21, 256+16(1)
+ lxv 32+22, 256+32(1)
+ lxv 32+23, 256+48(1)
+ lxv 32+24, 256+64(1)
+ lxv 32+25, 256+80(1)
+ lxv 32+26, 256+96(1)
+ lxv 32+27, 256+112(1)
+ lxv 32+28, 256+128(1)
+ lxv 32+29, 256+144(1)
+ lxv 32+30, 256+160(1)
+ lxv 32+31, 256+176(1)
+
+ ld 14, 112(1) # GPRs r14-r24 from 112..192(r1)
+ ld 15, 120(1)
+ ld 16, 128(1)
+ ld 17, 136(1)
+ ld 18, 144(1)
+ ld 19, 152(1)
+ ld 20, 160(1)
+ ld 21, 168(1)
+ ld 22, 176(1)
+ ld 23, 184(1)
+ ld 24, 192(1)
+
+ addi 1, 1, 512 # pop the frame first, so 16(r1) below is the slot written before the stdu
+ ld 0, 16(1) # saved LR
+ mtlr 0
+.endm # RESTORE_REGS
+
+# AES_CIPHER_4x r: one vcipher round on the four states v15-v18, using the round key held in vector register r
+.macro AES_CIPHER_4x r
+ vcipher 15, 15, \r
+ vcipher 16, 16, \r
+ vcipher 17, 17, \r
+ vcipher 18, 18, \r
+.endm
+
+# AES_CIPHER_8x r: one vcipher round on the eight states v15-v22, using the round key held in vector register r
+.macro AES_CIPHER_8x r
+ vcipher 15, 15, \r
+ vcipher 16, 16, \r
+ vcipher 17, 17, \r
+ vcipher 18, 18, \r
+ vcipher 19, 19, \r
+ vcipher 20, 20, \r
+ vcipher 21, 21, \r
+ vcipher 22, 22, \r
+.endm
+
+.macro LOOP_8AES_STATE
+ AES_CIPHER_8x 23 # eight consecutive AES rounds on v15-v22, keys taken from v23-v29 and then v1
+ AES_CIPHER_8x 24 # (presumably round keys 1..8, preloaded by the caller - confirm at call sites)
+ AES_CIPHER_8x 25
+ AES_CIPHER_8x 26
+ AES_CIPHER_8x 27
+ AES_CIPHER_8x 28
+ AES_CIPHER_8x 29
+ AES_CIPHER_8x 1 # eighth round uses the key in v1
+.endm
+
+#
+# PPC_GFMUL128_8x: Compute hash values of 8 blocks based on Karatsuba method.
+#
+# S1 should xor with the previous digest
+#
+# Xi = v0
+# H Poly = v2
+# Hash keys = v3 - v14
+# vs10: vpermxor vector
+# Scratch: v23 - v29
+#
+.macro PPC_GFMUL128_8x
+
+ vpmsumd 23, 12, 15 # H4.L * X.L
+ vpmsumd 24, 9, 16
+ vpmsumd 25, 6, 17
+ vpmsumd 26, 3, 18
vxor 23, 23, 24
vxor 23, 23, 25
- vxor 23, 23, 26
-
- .long 0x130D7CC8
- .long 0x132A84C8
- .long 0x13478CC8
- .long 0x136494C8
-
- vxor 24, 24, 25
- vxor 24, 24, 26
-
-
- .long 0x139714C8
-
- xxlor 29+32, 29, 29
-
- vxor 24, 24, 27
- vsldoi 26, 24, 29, 8
- vsldoi 29, 29, 24, 8
- vxor 23, 23, 26
-
- vsldoi 23, 23, 23, 8
- vxor 23, 23, 28
+ vxor 23, 23, 26 # L
- .long 0x130E7CC8
- .long 0x132B84C8
- .long 0x13488CC8
- .long 0x136594C8
+ vpmsumd 27, 13, 15 # H4.L * X.H + H4.H * X.L
+ vpmsumd 28, 10, 16 # H3.L * X1.H + H3.H * X1.L
+ vpmsumd 25, 7, 17
+ vpmsumd 26, 4, 18
+ vxor 24, 27, 28
vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27
-
- vxor 24, 24, 29
-
-
- vsldoi 27, 23, 23, 8
- .long 0x12F714C8
- vxor 27, 27, 24
- vxor 27, 23, 27
-
-
- .long 0x1309A4C8
- .long 0x1326ACC8
- .long 0x1343B4C8
- vxor 19, 19, 27
- .long 0x12EC9CC8
+ vxor 24, 24, 26 # M
+
+ vpmsumd 26, 14, 15 # H4.H * X.H
+ vpmsumd 27, 11, 16
+ vpmsumd 28, 8, 17
+ vpmsumd 29, 5, 18
+
+ vxor 26, 26, 27
+ vxor 26, 26, 28
+ vxor 26, 26, 29
+
+ # sum hash and reduction with H Poly
+ vpmsumd 28, 23, 2 # reduction
+
+ vxor 1, 1, 1
+ vsldoi 25, 24, 1, 8 # mL
+ vsldoi 1, 1, 24, 8 # mH
+ vxor 23, 23, 25 # mL + L
+
+ # This performs swap and xor like,
+ # vsldoi 23, 23, 23, 8 # swap
+ # vxor 23, 23, 28
+ xxlor 32+29, 10, 10
+ vpermxor 23, 23, 28, 29
+
+ vxor 24, 26, 1 # H
+
+ # sum hash and reduction with H Poly
+ #
+ # vsldoi 25, 23, 23, 8 # swap
+ # vpmsumd 23, 23, 2
+ # vxor 27, 25, 24
+ #
+ vpermxor 27, 23, 24, 29
+ vpmsumd 23, 23, 2
+ vxor 0, 23, 27 # Digest of 4 blocks
+
+ vxor 19, 19, 0
+
+ # Compute digest for the next 4 blocks
+ vpmsumd 24, 9, 20
+ vpmsumd 25, 6, 21
+ vpmsumd 26, 3, 22
+ vpmsumd 23, 12, 19 # H4.L * X.L
vxor 23, 23, 24
vxor 23, 23, 25
- vxor 23, 23, 26
+ vxor 23, 23, 26 # L
- .long 0x130D9CC8
- .long 0x132AA4C8
- .long 0x1347ACC8
- .long 0x1364B4C8
+ vpmsumd 27, 13, 19 # H4.L * X.H + H4.H * X.L
+ vpmsumd 28, 10, 20 # H3.L * X1.H + H3.H * X1.L
+ vpmsumd 25, 7, 21
+ vpmsumd 26, 4, 22
+ vxor 24, 27, 28
vxor 24, 24, 25
- vxor 24, 24, 26
-
-
- .long 0x139714C8
-
- xxlor 29+32, 29, 29
-
- vxor 24, 24, 27
- vsldoi 26, 24, 29, 8
- vsldoi 29, 29, 24, 8
- vxor 23, 23, 26
-
- vsldoi 23, 23, 23, 8
- vxor 23, 23, 28
-
- .long 0x130E9CC8
- .long 0x132BA4C8
- .long 0x1348ACC8
- .long 0x1365B4C8
-
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27
-
- vxor 24, 24, 29
-
-
- vsldoi 27, 23, 23, 8
- .long 0x12F714C8
- vxor 27, 27, 24
- vxor 23, 23, 27
-
- xxlor 32, 23+32, 23+32
-
-.endm
-
-
-
-
-.macro ppc_update_hash_1x
- vxor 28, 28, 0
-
- vxor 19, 19, 19
-
- .long 0x12C3E4C8
- .long 0x12E4E4C8
- .long 0x1305E4C8
-
- .long 0x137614C8
-
- vsldoi 25, 23, 19, 8
- vsldoi 26, 19, 23, 8
- vxor 22, 22, 25
- vxor 24, 24, 26
-
- vsldoi 22, 22, 22, 8
- vxor 22, 22, 27
-
- vsldoi 20, 22, 22, 8
- .long 0x12D614C8
- vxor 20, 20, 24
- vxor 22, 22, 20
-
- vor 0,22,22
-
-.endm
-
-
-
-
-
-
-
-
-
-
-
-
-
-.global ppc_aes_gcm_encrypt
-.align 5
-ppc_aes_gcm_encrypt:
-_ppc_aes_gcm_encrypt:
-
- stdu 1,-512(1)
- mflr 0
-
- std 14,112(1)
- std 15,120(1)
- std 16,128(1)
- std 17,136(1)
- std 18,144(1)
- std 19,152(1)
- std 20,160(1)
- std 21,168(1)
- li 9, 256
- stvx 20, 9, 1
- addi 9, 9, 16
- stvx 21, 9, 1
- addi 9, 9, 16
- stvx 22, 9, 1
- addi 9, 9, 16
- stvx 23, 9, 1
- addi 9, 9, 16
- stvx 24, 9, 1
- addi 9, 9, 16
- stvx 25, 9, 1
- addi 9, 9, 16
- stvx 26, 9, 1
- addi 9, 9, 16
- stvx 27, 9, 1
- addi 9, 9, 16
- stvx 28, 9, 1
- addi 9, 9, 16
- stvx 29, 9, 1
- addi 9, 9, 16
- stvx 30, 9, 1
- addi 9, 9, 16
- stvx 31, 9, 1
- std 0, 528(1)
-
-
- lxvb16x 32, 0, 8
-
+ vxor 24, 24, 26 # M
+
+ vpmsumd 26, 14, 19 # H4.H * X.H
+ vpmsumd 27, 11, 20
+ vpmsumd 28, 8, 21
+ vpmsumd 29, 5, 22
+
+ vxor 26, 26, 27
+ vxor 26, 26, 28
+ vxor 26, 26, 29
+
+ # sum hash and reduction with H Poly
+ vpmsumd 28, 23, 2 # reduction
+
+ vxor 1, 1, 1
+ vsldoi 25, 24, 1, 8 # mL
+ vsldoi 1, 1, 24, 8 # mH
+ vxor 23, 23, 25 # mL + L
+
+ # This performs swap and xor like,
+ # vsldoi 23, 23, 23, 8 # swap
+ # vxor 23, 23, 28
+ xxlor 32+29, 10, 10
+ vpermxor 23, 23, 28, 29
+
+ vxor 24, 26, 1 # H
+
+ # sum hash and reduction with H Poly
+ #
+ # vsldoi 25, 23, 23, 8 # swap
+ # vpmsumd 23, 23, 2
+ # vxor 27, 25, 24
+ #
+ vpermxor 27, 23, 24, 29
+ vpmsumd 23, 23, 2
+ vxor 0, 23, 27 # Digest of 8 blocks
+.endm
+
+# PPC_GHASH1x H S1: GHASH update for a single block; the new digest is written to vector H
+# Multiplies block S1 by the hash key held in v3 (l), v4 (H), v5 (h) and reduces with H Poly (v2)
+# vs10: vpermxor vector
+# scratch: v1, v22..v27
+# Callers in this file xor the previous digest (v0) into S1 before invoking
+.macro PPC_GHASH1x H S1
+
+ vxor 1, 1, 1 # v1 = 0, used as zero fill by the vsldoi shifts below
+
+ vpmsumd 22, 3, \S1 # L
+ vpmsumd 23, 4, \S1 # M
+ vpmsumd 24, 5, \S1 # H
+
+ vpmsumd 27, 22, 2 # reduction: v27 = vpmsumd of L with H Poly
+
+ vsldoi 25, 23, 1, 8 # mL
+ vsldoi 26, 1, 23, 8 # mH
+ vxor 22, 22, 25 # L = L ^ mL
+ vxor 24, 24, 26 # H = H ^ mH
+
+ xxlor 32+25, 10, 10 # v25 = copy of the vpermxor vector in vs10
+ vpermxor 22, 22, 27, 25 # swap the halves of v22 and xor with v27 in one instruction
+
+ # vsldoi 23, 22, 22, 8 # swap
+ # vpmsumd 22, 22, 2 # reduction
+ # vxor 23, 23, 24
+ vpermxor 23, 22, 24, 25 # replaces the commented vsldoi + vxor pair above
+ vpmsumd 22, 22, 2 # reduction
+
+ vxor \H, 22, 23 # final digest for this block
+.endm
+
+# LOAD_HASH_TABLE: load Xi, H Poly and the hash key powers from the table pointed to by r8
+# Table offsets read here: Xi 0, H Poly 32, H 64, H^2 112, H^3 160, H^4 208
+# Xi = v0
+# H Poly = v2
+# Hash keys = v3 - v14 (H: v3-v5, H^2: v6-v8, H^3: v9-v11, H^4: v12-v14)
+# Clobbers r10 (table offset) and v1 (zeroed, used as fill for the vsldoi splits)
+.macro LOAD_HASH_TABLE
+ # Load Xi
+ lxvb16x 32, 0, 8 # load Xi
+
+ vxor 1, 1, 1 # v1 = 0
li 10, 32
- lxvd2x 2+32, 10, 8
- li 10, 48
- lxvd2x 3+32, 10, 8
- li 10, 64
- lxvd2x 4+32, 10, 8
- li 10, 80
- lxvd2x 5+32, 10, 8
+ lxvd2x 2+32, 10, 8 # H Poly
- li 10, 96
- lxvd2x 6+32, 10, 8
+ # load Hash - h^4, h^3, h^2, h (each power is split into l and h parts with vsldoi against zero)
+ li 10, 64
+ lxvd2x 4+32, 10, 8 # H
+ vsldoi 3, 1, 4, 8 # l
+ vsldoi 5, 4, 1, 8 # h
li 10, 112
- lxvd2x 7+32, 10, 8
- li 10, 128
- lxvd2x 8+32, 10, 8
-
- li 10, 144
- lxvd2x 9+32, 10, 8
+ lxvd2x 7+32, 10, 8 # H^2
+ vsldoi 6, 1, 7, 8 # l
+ vsldoi 8, 7, 1, 8 # h
li 10, 160
- lxvd2x 10+32, 10, 8
- li 10, 176
- lxvd2x 11+32, 10, 8
-
- li 10, 192
- lxvd2x 12+32, 10, 8
+ lxvd2x 10+32, 10, 8 # H^3
+ vsldoi 9, 1, 10, 8 # l
+ vsldoi 11, 10, 1, 8 # h
li 10, 208
- lxvd2x 13+32, 10, 8
- li 10, 224
- lxvd2x 14+32, 10, 8
-
-
- lxvb16x 30+32, 0, 7
-
- mr 12, 5
- li 11, 0
-
-
- vxor 31, 31, 31
- vspltisb 22,1
- vsldoi 31, 31, 22,1
-
-
- lxv 0, 0(6)
- lxv 1, 0x10(6)
- lxv 2, 0x20(6)
- lxv 3, 0x30(6)
- lxv 4, 0x40(6)
- lxv 5, 0x50(6)
- lxv 6, 0x60(6)
- lxv 7, 0x70(6)
- lxv 8, 0x80(6)
- lxv 9, 0x90(6)
- lxv 10, 0xa0(6)
-
-
- lwz 9,240(6)
+ lxvd2x 13+32, 10, 8 # H^4
+ vsldoi 12, 1, 13, 8 # l
+ vsldoi 14, 13, 1, 8 # h
+.endm
+
+.macro PROCESS_8X_AES_STATES
+ vcipherlast 15, 15, 1 # final AES round on the eight states v15-v22, last round key in v1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
+
+ lxvb16x 32+23, 0, 14 # load block: eight input blocks from r14 into v23-v30
+ lxvb16x 32+24, 15, 14 # load block (r15-r21 are offset registers set up by the callers)
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block; overwrites v30, callers rebuild the counter from vs9
+ addi 14, 14, 128 # advance input pointer by 8 blocks
+ vxor 15, 15, 23 # output block = AES state ^ input block
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
+
+ stxvb16x 47, 0, 9 # store output (vs47-vs54 are v15-v22)
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
+ addi 9, 9, 128 # advance output pointer by 8 blocks
+.endm
+
+.macro COMPUTE_STATES
+ xxlor 32+15, 9, 9 # last state: v15 = newest counter block, saved in vs9
+ vadduwm 15, 15, 31 # state + counter: v15..v22 = next eight counter blocks
+ vadduwm 16, 15, 31 # each one is the previous block plus v31
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state: vs9 = v22 for the next group
+
+ xxlxor 32+15, 32+15, 0 # IV + round key - add round key 0 (vs0) to all eight states
+ xxlxor 32+16, 32+16, 0
+ xxlxor 32+17, 32+17, 0
+ xxlxor 32+18, 32+18, 0
+ xxlxor 32+19, 32+19, 0
+ xxlxor 32+20, 32+20, 0
+ xxlxor 32+21, 32+21, 0
+ xxlxor 32+22, 32+22, 0
+.endm
+
+################################################################################
+# Compute AES and ghash one block at a time, for every full 16-byte block counted in r5.
+# r23: AES rounds (loaded here from 240(r6)); r5: length, r14: input, r9: output, r8: Xi, r24: 0 = decrypt
+# v30: current IV, v31: counter increment, v0: running digest
+# vs0: roundkey 0
+# Clobbers r10, r12, r22, r23, v1, v15-v29, vs11; advances r9, r11, r14 by 16 and lowers r5 by 16 per block
+################################################################################
+.align 4
+aes_gcm_crypt_1x:
+.localentry aes_gcm_crypt_1x,0
+
+ cmpdi 5, 16
+ bge __More_1x
+ blr # fewer than 16 bytes: nothing to do here
+__More_1x:
+ li 10, 16
+ divdu 12, 5, 10 # r12 = number of full 16-byte blocks
+
+ xxlxor 32+15, 32+30, 0 # v15 = IV ^ round key 0
+
+ # Pre-load 8 AES rounds to scratch vectors.
+ lxv 32+16, 16(6) # round key 1
+ lxv 32+17, 32(6) # round key 2
+ lxv 32+18, 48(6) # round key 3
+ lxv 32+19, 64(6) # round key 4
+ lxv 32+20, 80(6) # round key 5
+ lxv 32+21, 96(6) # round key 6
+ lxv 32+28, 112(6) # round key 7
+ lxv 32+29, 128(6) # round key 8
+
+ lwz 23, 240(6) # n rounds
+ addi 22, 23, -9 # remaining AES rounds after the 8 pre-loaded ones, excluding the last
+ cmpdi 12, 0
+ bgt __Loop_1x
+ blr
+
+__Loop_1x:
+ mtctr 22 # ctr = rounds still to run in __Loop_aes_1state
+ addi 10, 6, 144 # r10 -> round key 9
+ vcipher 15, 15, 16 # rounds 1-8 with the pre-loaded keys
+ vcipher 15, 15, 17
+ vcipher 15, 15, 18
+ vcipher 15, 15, 19
+ vcipher 15, 15, 20
+ vcipher 15, 15, 21
+ vcipher 15, 15, 28
+ vcipher 15, 15, 29
+
+__Loop_aes_1state:
+ lxv 32+1, 0(10) # next round key into v1
+ vcipher 15, 15, 1
+ addi 10, 10, 16
+ bdnz __Loop_aes_1state
+ lxv 32+1, 0(10) # last round key
+ lxvb16x 11, 0, 14 # load input block
+ vcipherlast 15, 15, 1
- xxlor 32+29, 0, 0
- vxor 15, 30, 29
+ xxlxor 32+15, 32+15, 11 # output block = AES state ^ input block
+ stxvb16x 32+15, 0, 9 # store output
+ addi 14, 14, 16
+ addi 9, 9, 16
- cmpdi 9, 10
- beq .Loop_aes_gcm_8x
+ cmpdi 24, 0 # decrypt?
+ bne __Encrypt_1x
+ xxlor 15+32, 11, 11 # decrypt: hash the input block instead of the output
+__Encrypt_1x:
+ vxor 15, 15, 0 # xor with the previous digest
+ PPC_GHASH1x 0, 15 # v0 = updated digest
+ addi 5, 5, -16 # remaining length
+ addi 11, 11, 16 # running count of processed bytes
- lxv 11, 0xb0(6)
- lxv 12, 0xc0(6)
+ vadduwm 30, 30, 31 # IV + counter
+ xxlxor 32+15, 32+30, 0 # next state = new IV ^ round key 0
+ addi 12, 12, -1
+ cmpdi 12, 0
+ bgt __Loop_1x
+
+ stxvb16x 32+0, 0, 8 # update Xi
+ blr
+.size aes_gcm_crypt_1x,.-aes_gcm_crypt_1x
+
+################################################################################
+# Process the final partial block (r5 bytes; callers skip this when r5 is 0) when we come here.
+# Compute partial mask, load the block, and stage the AES output on the stack at 224(r1).
+# Compute AES state from v30 and the round keys at r6, then copy r5 output bytes to r9.
+# Compute ghash into v0, store it to Xi (r8) and set r5 to 0; r24 = 0 selects decrypt.
+# NOTE(review): r11, the count later returned through aes_gcm_out, is not advanced in this routine.
+################################################################################
+.align 4
+__Process_partial:
+.localentry __Process_partial,0
+
+ # create partial mask
+ vspltisb 16, -1 # v16 = all ones
+ li 12, 16
+ sub 12, 12, 5 # r12 = 16 - len, the unused bytes of the block
+ sldi 12, 12, 3 # ... converted to a bit count
+ mtvsrdd 32+17, 0, 12 # v17 = shift amount for vslo
+ vslo 16, 16, 17 # partial block mask
+
+ lxvb16x 11, 0, 14 # load partial block; NOTE(review): this reads a full 16 bytes from r14
+ xxland 11, 11, 32+16 # keep only the masked part of the input
+
+ # AES crypt partial
+ xxlxor 32+15, 32+30, 0 # v15 = IV ^ round key 0
+ lwz 23, 240(6) # n rounds
+ addi 22, 23, -1 # loop - 1
+ mtctr 22
+ addi 10, 6, 16 # r10 -> round key 1
+
+__Loop_aes_pstate:
+ lxv 32+1, 0(10) # next round key into v1
+ vcipher 15, 15, 1
+ addi 10, 10, 16
+ bdnz __Loop_aes_pstate
+ lxv 32+1, 0(10) # last round key
+ vcipherlast 15, 15, 1
- cmpdi 9, 12
- beq .Loop_aes_gcm_8x
+ xxlxor 32+15, 32+15, 11 # output = AES state ^ masked input
+ vand 15, 15, 16 # clear the bytes outside the partial block
+ # AES crypt output v15
+ # Write partial
+ li 10, 224
+ stxvb16x 15+32, 10, 1 # write v15 to stack
+ addi 10, 1, 223 # source - 1, for the update-form load below
+ addi 12, 9, -1 # destination - 1, for the update-form store below
+ mtctr 5 # partial block len
+__Write_partial:
+ lbzu 22, 1(10) # copy one byte from the stack copy ...
+ stbu 22, 1(12) # ... to the output buffer
+ bdnz __Write_partial
+
+ cmpdi 24, 0 # decrypt?
+ bne __Encrypt_partial
+ xxlor 32+15, 11, 11 # decrypt using the input block
+__Encrypt_partial:
+ vxor 15, 15, 0 # ^ previous hash
+ PPC_GHASH1x 0, 15 # v0 = updated digest
+ li 5, 0 # done last byte
+ stxvb16x 32+0, 0, 8 # Update Xi
+ blr
+.size __Process_partial,.-__Process_partial
+
+################################################################################
+# ppc_aes_gcm_encrypt (const void *inp, void *out, size_t len,
+# const char *rk, unsigned char iv[16], void *Xip);
+#
+# r3 - inp
+# r4 - out
+# r5 - len
+# r6 - AES round keys
+# r7 - iv
+# r8 - Xi, H Poly, hash keys
+#
+# rounds is at offset 240 in rk
+# Xi is at 0 in gcm_table (Xip).
+#
+################################################################################
+.global ppc_aes_gcm_encrypt
+.align 5
+ppc_aes_gcm_encrypt:
+.localentry ppc_aes_gcm_encrypt,0
- lxv 13, 0xd0(6)
- lxv 14, 0xe0(6)
- cmpdi 9, 14
- beq .Loop_aes_gcm_8x
+ SAVE_REGS
+ LOAD_HASH_TABLE
- b aes_gcm_out
+ # initialize ICB: GHASH( IV ), IV - r7
+ lxvb16x 30+32, 0, 7 # load IV - v30
-.align 5
-.Loop_aes_gcm_8x:
mr 14, 3
mr 9, 4
+ # counter 1
+ vxor 31, 31, 31
+ vspltisb 22, 1
+ vsldoi 31, 31, 22,1 # counter 1
+
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxv 10, 0(11) # vs10: vpermxor vector
+ li 11, 0
+ lxv 0, 0(6) # round key 0
+
+ #
+ # Process different blocks
+ #
+ cmpdi 5, 128
+ blt __Process_more_enc
+
+ # load 9 round keys
+ lxv 32+23, 16(6) # round key 1
+ lxv 32+24, 32(6) # round key 2
+ lxv 32+25, 48(6) # round key 3
+ lxv 32+26, 64(6) # round key 4
+ lxv 32+27, 80(6) # round key 5
+ lxv 32+28, 96(6) # round key 6
+ lxv 32+29, 112(6) # round key 7
+ lxv 32+1, 128(6) # round key 8
+
+ # load rounds - 10 (128), 12 (192), 14 (256)
+ lwz 23, 240(6) # n rounds
+
+__Process_encrypt:
+#
+# Process 8x AES/GCM blocks
+#
+__Process_8x_enc:
+ # 8x blocks
li 10, 128
- divdu 10, 5, 10
- cmpdi 10, 0
- beq .Loop_last_block
-
- .long 0x13DEF8C0
- vxor 16, 30, 29
- .long 0x13DEF8C0
- vxor 17, 30, 29
- .long 0x13DEF8C0
- vxor 18, 30, 29
- .long 0x13DEF8C0
- vxor 19, 30, 29
- .long 0x13DEF8C0
- vxor 20, 30, 29
- .long 0x13DEF8C0
- vxor 21, 30, 29
- .long 0x13DEF8C0
- vxor 22, 30, 29
-
- mtctr 10
+ divdu 12, 5, 10 # n 128 bytes-blocks
+
+ addi 12, 12, -1 # loop - 1
+
+ vmr 15, 30 # first state: IV
+ vadduwm 16, 15, 31 # state + counter
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
+
+ # vxor state, state, w # addroundkey
+ xxlxor 32+15, 32+15, 0 # IV + round key - add round key 0
+ xxlxor 32+16, 32+16, 0
+ xxlxor 32+17, 32+17, 0
+ xxlxor 32+18, 32+18, 0
+ xxlxor 32+19, 32+19, 0
+ xxlxor 32+20, 32+20, 0
+ xxlxor 32+21, 32+21, 0
+ xxlxor 32+22, 32+22, 0
li 15, 16
li 16, 32
@@ -536,523 +591,185 @@ _ppc_aes_gcm_encrypt:
li 20, 96
li 21, 112
- lwz 10, 240(6)
-
-.Loop_8x_block:
-
- lxvb16x 15, 0, 14
- lxvb16x 16, 15, 14
- lxvb16x 17, 16, 14
- lxvb16x 18, 17, 14
- lxvb16x 19, 18, 14
- lxvb16x 20, 19, 14
- lxvb16x 21, 20, 14
- lxvb16x 22, 21, 14
- addi 14, 14, 128
-
-.Loop_aes_middle8x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_next_ghash
-
-
- xxlor 24+32, 11, 11
-
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-
- .long 0x11EFC508
- .long 0x1210C508
- .long 0x1231C508
- .long 0x1252C508
- .long 0x1273C508
- .long 0x1294C508
- .long 0x12B5C508
- .long 0x12D6C508
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_next_ghash
-
-
- xxlor 24+32, 13, 13
-
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-
- .long 0x11EFC508
- .long 0x1210C508
- .long 0x1231C508
- .long 0x1252C508
- .long 0x1273C508
- .long 0x1294C508
- .long 0x12B5C508
- .long 0x12D6C508
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_next_ghash
- b aes_gcm_out
-
-Do_next_ghash:
-
-
-
- .long 0x11EFBD09
- .long 0x1210BD09
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9
- xxlxor 48, 48, 16
- stxvb16x 48, 15, 9
-
- .long 0x1231BD09
- .long 0x1252BD09
-
- xxlxor 49, 49, 17
- stxvb16x 49, 16, 9
- xxlxor 50, 50, 18
- stxvb16x 50, 17, 9
-
- .long 0x1273BD09
- .long 0x1294BD09
-
- xxlxor 51, 51, 19
- stxvb16x 51, 18, 9
- xxlxor 52, 52, 20
- stxvb16x 52, 19, 9
-
- .long 0x12B5BD09
- .long 0x12D6BD09
-
- xxlxor 53, 53, 21
- stxvb16x 53, 20, 9
- xxlxor 54, 54, 22
- stxvb16x 54, 21, 9
-
- addi 9, 9, 128
-
-
- ppc_aes_gcm_ghash2_4x
-
- xxlor 27+32, 0, 0
- .long 0x13DEF8C0
- vor 29,30,30
- vxor 15, 30, 27
- .long 0x13DEF8C0
- vxor 16, 30, 27
- .long 0x13DEF8C0
- vxor 17, 30, 27
- .long 0x13DEF8C0
- vxor 18, 30, 27
- .long 0x13DEF8C0
- vxor 19, 30, 27
- .long 0x13DEF8C0
- vxor 20, 30, 27
- .long 0x13DEF8C0
- vxor 21, 30, 27
- .long 0x13DEF8C0
- vxor 22, 30, 27
-
- addi 12, 12, -128
- addi 11, 11, 128
-
- bdnz .Loop_8x_block
-
- vor 30,29,29
-
-.Loop_last_block:
- cmpdi 12, 0
- beq aes_gcm_out
-
-
- li 10, 16
- divdu 10, 12, 10
-
- mtctr 10
-
- lwz 10, 240(6)
-
- cmpdi 12, 16
- blt Final_block
-
-.macro .Loop_aes_middle_1x
- xxlor 19+32, 1, 1
- xxlor 20+32, 2, 2
- xxlor 21+32, 3, 3
- xxlor 22+32, 4, 4
-
- .long 0x11EF9D08
- .long 0x11EFA508
- .long 0x11EFAD08
- .long 0x11EFB508
-
- xxlor 19+32, 5, 5
- xxlor 20+32, 6, 6
- xxlor 21+32, 7, 7
- xxlor 22+32, 8, 8
-
- .long 0x11EF9D08
- .long 0x11EFA508
- .long 0x11EFAD08
- .long 0x11EFB508
-
- xxlor 19+32, 9, 9
- .long 0x11EF9D08
-.endm
-
-Next_rem_block:
- lxvb16x 15, 0, 14
-
-.Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
+ #
+ # Pre-compute first 8 AES state and leave 1/3/5 more rounds
+ # for the loop.
+ #
+ addi 22, 23, -9 # process 8 keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
- cmpdi 10, 10
- beq Do_next_1x
+ LOOP_8AES_STATE # process 8 AES keys
+__PreLoop_aes_state:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x 1
+ addi 10, 10, 16
+ bdnz __PreLoop_aes_state
+ lxv 32+1, 0(10) # last round key (v1)
- xxlor 24+32, 11, 11
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_next_1x
-
-
- xxlor 24+32, 13, 13
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_next_1x
-
-Do_next_1x:
- .long 0x11EFBD09
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9
- addi 14, 14, 16
- addi 9, 9, 16
+ cmpdi 12, 0 # Only one loop (8 block)
+ beq __Finish_ghash
- vor 28,15,15
- ppc_update_hash_1x
+#
+# Loop 8x blocks and compute ghash
+#
+__Loop_8x_block_enc:
+ PROCESS_8X_AES_STATES
- addi 12, 12, -16
- addi 11, 11, 16
- xxlor 19+32, 0, 0
- .long 0x13DEF8C0
- vxor 15, 30, 19
+ # Compute ghash here
+ vxor 15, 15, 0
+ PPC_GFMUL128_8x
+
+ COMPUTE_STATES
+
+ addi 5, 5, -128
+ addi 11, 11, 128
+
+ lxv 32+23, 16(6) # round key 1
+ lxv 32+24, 32(6) # round key 2
+ lxv 32+25, 48(6) # round key 3
+ lxv 32+26, 64(6) # round key 4
+ lxv 32+27, 80(6) # round key 5
+ lxv 32+28, 96(6) # round key 6
+ lxv 32+29, 112(6) # round key 7
+ lxv 32+1, 128(6) # round key 8
+
+ # Compute first 8 AES state and leave 1/3/5 more rounds
+ # for the loop.
+ LOOP_8AES_STATE # process 8 AES keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
+
+__LastLoop_aes_state:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x 1
+ addi 10, 10, 16
+ bdnz __LastLoop_aes_state
- bdnz Next_rem_block
+ lxv 32+1, 0(10) # last round key (v1)
+ addi 12, 12, -1
cmpdi 12, 0
- beq aes_gcm_out
-
-Final_block:
-.Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_final_1x
-
-
- xxlor 24+32, 11, 11
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_final_1x
+ bne __Loop_8x_block_enc
+ #
+ # Remaining blocks: finish the last group of 8 blocks
+ #
+__Finish_ghash:
+ PROCESS_8X_AES_STATES
- xxlor 24+32, 13, 13
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_final_1x
-
-Do_final_1x:
- .long 0x11EFBD09
-
- lxvb16x 15, 0, 14
- xxlxor 47, 47, 15
-
-
- li 15, 16
- sub 15, 15, 12
+ # Compute ghash here
+ vxor 15, 15, 0
+ PPC_GFMUL128_8x
- vspltisb 16,-1
- vspltisb 17,0
- li 10, 192
- stvx 16, 10, 1
- addi 10, 10, 16
- stvx 17, 10, 1
+ # Update IV and Xi
+ xxlor 30+32, 9, 9 # last ctr
+ vadduwm 30, 30, 31 # increase ctr
+ stxvb16x 32+0, 0, 8 # update Xi
- addi 10, 1, 192
- lxvb16x 16, 15, 10
- xxland 47, 47, 16
+ addi 5, 5, -128
+ addi 11, 11, 128
- vor 28,15,15
- ppc_update_hash_1x
+ #
+ # Done 8x blocks
+ #
+ cmpdi 5, 0
+ beq aes_gcm_out
- bl Write_partial_block
+__Process_more_enc:
+ li 24, 1 # encrypt
+ bl aes_gcm_crypt_1x
+ cmpdi 5, 0
+ beq aes_gcm_out
+ bl __Process_partial
b aes_gcm_out
+.size ppc_aes_gcm_encrypt,.-ppc_aes_gcm_encrypt
-
-
-
-
-
-Write_partial_block:
- li 10, 192
- stxvb16x 15+32, 10, 1
-
-
- addi 10, 9, -1
- addi 16, 1, 191
-
- mtctr 12
- li 15, 0
-
-Write_last_byte:
- lbzu 14, 1(16)
- stbu 14, 1(10)
- bdnz Write_last_byte
- blr
-
-aes_gcm_out:
-
- stxvb16x 32, 0, 8
- add 3, 11, 12
-
- li 9, 256
- lvx 20, 9, 1
- addi 9, 9, 16
- lvx 21, 9, 1
- addi 9, 9, 16
- lvx 22, 9, 1
- addi 9, 9, 16
- lvx 23, 9, 1
- addi 9, 9, 16
- lvx 24, 9, 1
- addi 9, 9, 16
- lvx 25, 9, 1
- addi 9, 9, 16
- lvx 26, 9, 1
- addi 9, 9, 16
- lvx 27, 9, 1
- addi 9, 9, 16
- lvx 28, 9, 1
- addi 9, 9, 16
- lvx 29, 9, 1
- addi 9, 9, 16
- lvx 30, 9, 1
- addi 9, 9, 16
- lvx 31, 9, 1
-
- ld 0, 528(1)
- ld 14,112(1)
- ld 15,120(1)
- ld 16,128(1)
- ld 17,136(1)
- ld 18,144(1)
- ld 19,152(1)
- ld 20,160(1)
- ld 21,168(1)
-
- mtlr 0
- addi 1, 1, 512
- blr
-
-
-
-
-.global ppc_aes_gcm_decrypt
-.align 5
+################################################################################
+# ppc_aes_gcm_decrypt (const void *inp, void *out, size_t len,
+# const char *rk, unsigned char iv[16], void *Xip);
+# 8x Decrypt
+#
+################################################################################
+.global ppc_aes_gcm_decrypt
+.align 5
ppc_aes_gcm_decrypt:
-_ppc_aes_gcm_decrypt:
-
- stdu 1,-512(1)
- mflr 0
-
- std 14,112(1)
- std 15,120(1)
- std 16,128(1)
- std 17,136(1)
- std 18,144(1)
- std 19,152(1)
- std 20,160(1)
- std 21,168(1)
- li 9, 256
- stvx 20, 9, 1
- addi 9, 9, 16
- stvx 21, 9, 1
- addi 9, 9, 16
- stvx 22, 9, 1
- addi 9, 9, 16
- stvx 23, 9, 1
- addi 9, 9, 16
- stvx 24, 9, 1
- addi 9, 9, 16
- stvx 25, 9, 1
- addi 9, 9, 16
- stvx 26, 9, 1
- addi 9, 9, 16
- stvx 27, 9, 1
- addi 9, 9, 16
- stvx 28, 9, 1
- addi 9, 9, 16
- stvx 29, 9, 1
- addi 9, 9, 16
- stvx 30, 9, 1
- addi 9, 9, 16
- stvx 31, 9, 1
- std 0, 528(1)
-
-
- lxvb16x 32, 0, 8
-
-
- li 10, 32
- lxvd2x 2+32, 10, 8
- li 10, 48
- lxvd2x 3+32, 10, 8
- li 10, 64
- lxvd2x 4+32, 10, 8
- li 10, 80
- lxvd2x 5+32, 10, 8
-
- li 10, 96
- lxvd2x 6+32, 10, 8
- li 10, 112
- lxvd2x 7+32, 10, 8
- li 10, 128
- lxvd2x 8+32, 10, 8
-
- li 10, 144
- lxvd2x 9+32, 10, 8
- li 10, 160
- lxvd2x 10+32, 10, 8
- li 10, 176
- lxvd2x 11+32, 10, 8
-
- li 10, 192
- lxvd2x 12+32, 10, 8
- li 10, 208
- lxvd2x 13+32, 10, 8
- li 10, 224
- lxvd2x 14+32, 10, 8
-
-
- lxvb16x 30+32, 0, 7
-
- mr 12, 5
- li 11, 0
-
+.localentry ppc_aes_gcm_decrypt, 0
- vxor 31, 31, 31
- vspltisb 22,1
- vsldoi 31, 31, 22,1
-
-
- lxv 0, 0(6)
- lxv 1, 0x10(6)
- lxv 2, 0x20(6)
- lxv 3, 0x30(6)
- lxv 4, 0x40(6)
- lxv 5, 0x50(6)
- lxv 6, 0x60(6)
- lxv 7, 0x70(6)
- lxv 8, 0x80(6)
- lxv 9, 0x90(6)
- lxv 10, 0xa0(6)
-
-
- lwz 9,240(6)
-
-
-
- xxlor 32+29, 0, 0
- vxor 15, 30, 29
-
- cmpdi 9, 10
- beq .Loop_aes_gcm_8x_dec
-
-
- lxv 11, 0xb0(6)
- lxv 12, 0xc0(6)
-
- cmpdi 9, 12
- beq .Loop_aes_gcm_8x_dec
+ SAVE_REGS
+ LOAD_HASH_TABLE
+ # initialize ICB: GHASH( IV ), IV - r7
+ lxvb16x 30+32, 0, 7 # load IV - v30
- lxv 13, 0xd0(6)
- lxv 14, 0xe0(6)
- cmpdi 9, 14
- beq .Loop_aes_gcm_8x_dec
-
- b aes_gcm_out
-
-.align 5
-.Loop_aes_gcm_8x_dec:
mr 14, 3
mr 9, 4
+ # counter 1
+ vxor 31, 31, 31
+ vspltisb 22, 1
+ vsldoi 31, 31, 22,1 # counter 1
+
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxv 10, 0(11) # vs10: vpermxor vector
+ li 11, 0
+ lxv 0, 0(6) # round key 0
+
+ #
+ # Process different blocks
+ #
+ cmpdi 5, 128
+ blt __Process_more_dec
+
+ # load 9 round keys
+ lxv 32+23, 16(6) # round key 1
+ lxv 32+24, 32(6) # round key 2
+ lxv 32+25, 48(6) # round key 3
+ lxv 32+26, 64(6) # round key 4
+ lxv 32+27, 80(6) # round key 5
+ lxv 32+28, 96(6) # round key 6
+ lxv 32+29, 112(6) # round key 7
+ lxv 32+1, 128(6) # round key 8
+
+ # load rounds - 10 (128), 12 (192), 14 (256)
+ lwz 23, 240(6) # n rounds
+
+__Process_decrypt:
+#
+# Process 8x AES/GCM blocks
+#
+__Process_8x_dec:
+ # 8x blocks
li 10, 128
- divdu 10, 5, 10
- cmpdi 10, 0
- beq .Loop_last_block_dec
-
- .long 0x13DEF8C0
- vxor 16, 30, 29
- .long 0x13DEF8C0
- vxor 17, 30, 29
- .long 0x13DEF8C0
- vxor 18, 30, 29
- .long 0x13DEF8C0
- vxor 19, 30, 29
- .long 0x13DEF8C0
- vxor 20, 30, 29
- .long 0x13DEF8C0
- vxor 21, 30, 29
- .long 0x13DEF8C0
- vxor 22, 30, 29
-
- mtctr 10
+ divdu 12, 5, 10 # n 128 bytes-blocks
+
+ addi 12, 12, -1 # loop - 1
+
+ vmr 15, 30 # first state: IV
+ vadduwm 16, 15, 31 # state + counter
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
+
+ # vxor state, state, w # addroundkey
+ xxlxor 32+15, 32+15, 0 # IV + round key - add round key 0
+ xxlxor 32+16, 32+16, 0
+ xxlxor 32+17, 32+17, 0
+ xxlxor 32+18, 32+18, 0
+ xxlxor 32+19, 32+19, 0
+ xxlxor 32+20, 32+20, 0
+ xxlxor 32+21, 32+21, 0
+ xxlxor 32+22, 32+22, 0
li 15, 16
li 16, 32
@@ -1062,279 +779,215 @@ _ppc_aes_gcm_decrypt:
li 20, 96
li 21, 112
- lwz 10, 240(6)
+ #
+ # Pre-compute first 8 AES state and leave 1/3/5 more rounds
+ # for the loop.
+ #
+ addi 22, 23, -9 # process 8 keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
-.Loop_8x_block_dec:
+ LOOP_8AES_STATE # process 8 AES keys
- lxvb16x 15, 0, 14
- lxvb16x 16, 15, 14
- lxvb16x 17, 16, 14
- lxvb16x 18, 17, 14
- lxvb16x 19, 18, 14
- lxvb16x 20, 19, 14
- lxvb16x 21, 20, 14
- lxvb16x 22, 21, 14
+__PreLoop_aes_state_dec:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x 1
+ addi 10, 10, 16
+ bdnz __PreLoop_aes_state_dec
+ lxv 32+1, 0(10) # last round key (v1)
+
+ cmpdi 12, 0 # Only one loop (8 block)
+ beq __Finish_ghash_dec
+
+#
+# Loop 8x blocks and compute ghash
+#
+__Loop_8x_block_dec:
+ vcipherlast 15, 15, 1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
+
+ lxvb16x 32+23, 0, 14 # load block
+ lxvb16x 32+24, 15, 14 # load block
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block
addi 14, 14, 128
-.Loop_aes_middle8x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_last_aes_dec
-
-
- xxlor 24+32, 11, 11
-
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-
- .long 0x11EFC508
- .long 0x1210C508
- .long 0x1231C508
- .long 0x1252C508
- .long 0x1273C508
- .long 0x1294C508
- .long 0x12B5C508
- .long 0x12D6C508
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_last_aes_dec
-
-
- xxlor 24+32, 13, 13
-
- .long 0x11EFBD08
- .long 0x1210BD08
- .long 0x1231BD08
- .long 0x1252BD08
- .long 0x1273BD08
- .long 0x1294BD08
- .long 0x12B5BD08
- .long 0x12D6BD08
-
- .long 0x11EFC508
- .long 0x1210C508
- .long 0x1231C508
- .long 0x1252C508
- .long 0x1273C508
- .long 0x1294C508
- .long 0x12B5C508
- .long 0x12D6C508
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_last_aes_dec
- b aes_gcm_out
-
-Do_last_aes_dec:
-
-
-
- .long 0x11EFBD09
- .long 0x1210BD09
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9
- xxlxor 48, 48, 16
- stxvb16x 48, 15, 9
-
- .long 0x1231BD09
- .long 0x1252BD09
-
- xxlxor 49, 49, 17
- stxvb16x 49, 16, 9
- xxlxor 50, 50, 18
- stxvb16x 50, 17, 9
-
- .long 0x1273BD09
- .long 0x1294BD09
-
- xxlxor 51, 51, 19
- stxvb16x 51, 18, 9
- xxlxor 52, 52, 20
- stxvb16x 52, 19, 9
-
- .long 0x12B5BD09
- .long 0x12D6BD09
-
- xxlxor 53, 53, 21
- stxvb16x 53, 20, 9
- xxlxor 54, 54, 22
- stxvb16x 54, 21, 9
+ vxor 15, 15, 23
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
+
+ stxvb16x 47, 0, 9 # store output
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
addi 9, 9, 128
- xxlor 15+32, 15, 15
- xxlor 16+32, 16, 16
- xxlor 17+32, 17, 17
- xxlor 18+32, 18, 18
- xxlor 19+32, 19, 19
- xxlor 20+32, 20, 20
- xxlor 21+32, 21, 21
- xxlor 22+32, 22, 22
-
-
- ppc_aes_gcm_ghash2_4x
-
- xxlor 27+32, 0, 0
- .long 0x13DEF8C0
- vor 29,30,30
- vxor 15, 30, 27
- .long 0x13DEF8C0
- vxor 16, 30, 27
- .long 0x13DEF8C0
- vxor 17, 30, 27
- .long 0x13DEF8C0
- vxor 18, 30, 27
- .long 0x13DEF8C0
- vxor 19, 30, 27
- .long 0x13DEF8C0
- vxor 20, 30, 27
- .long 0x13DEF8C0
- vxor 21, 30, 27
- .long 0x13DEF8C0
- vxor 22, 30, 27
- addi 12, 12, -128
- addi 11, 11, 128
-
- bdnz .Loop_8x_block_dec
-
- vor 30,29,29
-
-.Loop_last_block_dec:
- cmpdi 12, 0
- beq aes_gcm_out
-
-
- li 10, 16
- divdu 10, 12, 10
-
- mtctr 10
-
- lwz 10,240(6)
-
- cmpdi 12, 16
- blt Final_block_dec
-
-Next_rem_block_dec:
- lxvb16x 15, 0, 14
-
-.Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_next_1x_dec
-
-
- xxlor 24+32, 11, 11
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_next_1x_dec
-
-
- xxlor 24+32, 13, 13
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_next_1x_dec
-
-Do_next_1x_dec:
- .long 0x11EFBD09
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9
- addi 14, 14, 16
- addi 9, 9, 16
-
- xxlor 28+32, 15, 15
- ppc_update_hash_1x
+ vmr 15, 23
+ vmr 16, 24
+ vmr 17, 25
+ vmr 18, 26
+ vmr 19, 27
+ vmr 20, 28
+ vmr 21, 29
+ vmr 22, 30
- addi 12, 12, -16
- addi 11, 11, 16
- xxlor 19+32, 0, 0
- .long 0x13DEF8C0
- vxor 15, 30, 19
+ # ghash here
+ vxor 15, 15, 0
+ PPC_GFMUL128_8x
+
+ xxlor 32+15, 9, 9 # last state
+ vadduwm 15, 15, 31 # state + counter
+ vadduwm 16, 15, 31
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
+
+ xxlor 32+27, 0, 0 # restore roundkey 0
+ vxor 15, 15, 27 # IV + round key - add round key 0
+ vxor 16, 16, 27
+ vxor 17, 17, 27
+ vxor 18, 18, 27
+ vxor 19, 19, 27
+ vxor 20, 20, 27
+ vxor 21, 21, 27
+ vxor 22, 22, 27
- bdnz Next_rem_block_dec
+ addi 5, 5, -128
+ addi 11, 11, 128
+
+ lxv 32+23, 16(6) # round key 1
+ lxv 32+24, 32(6) # round key 2
+ lxv 32+25, 48(6) # round key 3
+ lxv 32+26, 64(6) # round key 4
+ lxv 32+27, 80(6) # round key 5
+ lxv 32+28, 96(6) # round key 6
+ lxv 32+29, 112(6) # round key 7
+ lxv 32+1, 128(6) # round key 8
+
+ LOOP_8AES_STATE # process 8 AES keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
+__LastLoop_aes_state_dec:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x 1
+ addi 10, 10, 16
+ bdnz __LastLoop_aes_state_dec
+ lxv 32+1, 0(10) # last round key (v1)
+ addi 12, 12, -1
cmpdi 12, 0
- beq aes_gcm_out
-
-Final_block_dec:
-.Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_final_1x_dec
-
-
- xxlor 24+32, 11, 11
-
- .long 0x11EFBD08
- .long 0x11EFC508
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_final_1x_dec
-
+ bne __Loop_8x_block_dec
+
+__Finish_ghash_dec:
+ vcipherlast 15, 15, 1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
+
+ lxvb16x 32+23, 0, 14 # load block
+ lxvb16x 32+24, 15, 14 # load block
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block
+ addi 14, 14, 128
- xxlor 24+32, 13, 13
+ vxor 15, 15, 23
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
+
+ stxvb16x 47, 0, 9 # store output
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
+ addi 9, 9, 128
- .long 0x11EFBD08
- .long 0x11EFC508
+ vxor 15, 23, 0
+ vmr 16, 24
+ vmr 17, 25
+ vmr 18, 26
+ vmr 19, 27
+ vmr 20, 28
+ vmr 21, 29
+ vmr 22, 30
- xxlor 23+32, 14, 14
+ #vxor 15, 15, 0
+ PPC_GFMUL128_8x
- cmpdi 10, 14
- beq Do_final_1x_dec
+ xxlor 30+32, 9, 9 # last ctr
+ vadduwm 30, 30, 31 # increase ctr
+ stxvb16x 32+0, 0, 8 # update Xi
-Do_final_1x_dec:
- .long 0x11EFBD09
+ addi 5, 5, -128
+ addi 11, 11, 128
- lxvb16x 15, 0, 14
- xxlxor 47, 47, 15
+ #
+ # Done 8x blocks
+ #
+ cmpdi 5, 0
+ beq aes_gcm_out
- li 15, 16
- sub 15, 15, 12
+__Process_more_dec:
+ li 24, 0 # decrypt
+ bl aes_gcm_crypt_1x
+ cmpdi 5, 0
+ beq aes_gcm_out
- vspltisb 16,-1
- vspltisb 17,0
- li 10, 192
- stvx 16, 10, 1
- addi 10, 10, 16
- stvx 17, 10, 1
-
- addi 10, 1, 192
- lxvb16x 16, 15, 10
- xxland 47, 47, 16
+ bl __Process_partial
+ b aes_gcm_out
+.size ppc_aes_gcm_decrypt,.-ppc_aes_gcm_decrypt
- xxlor 28+32, 15, 15
- ppc_update_hash_1x
+aes_gcm_out:
+.localentry aes_gcm_out,0
+ mr 3, 11 # return count: r3 = r11, the byte total accumulated by the 8x (+128) and 1x (+16) paths
- bl Write_partial_block
+ RESTORE_REGS # macro defined earlier in the file; presumably undoes SAVE_REGS - body not visible here
+ blr # returns for ppc_aes_gcm_encrypt/decrypt, which branch here with b
+.size aes_gcm_out,.-aes_gcm_out
- b aes_gcm_out
+.rodata
+.align 4
+# Control vector for vpermxor (loaded into vs10 by the encrypt and decrypt entry points)
+permx:
+.long 0x4c5d6e7f, 0x08192a3b, 0xc4d5e6f7, 0x8091a2b3