diff options
| author | Enji Cooper <ngie@FreeBSD.org> | 2026-02-01 17:05:55 +0000 |
|---|---|---|
| committer | Enji Cooper <ngie@FreeBSD.org> | 2026-02-01 17:05:55 +0000 |
| commit | e6c8997a8958c7aaec8e266d2eeefbfaa137e218 (patch) | |
| tree | 568ad37498693f81a2c62c5ec1b53747e5a8c9e9 | |
| parent | a97ed3a39c1044dd1b8056d68a76de74821f2bff (diff) | |
OpenSSL: commit sys/crypto changes for 3.5.5
These files were changed as part of the 3.5.4 -> 3.5.5 upgrade. Please
see the upstream release notes linked in
1731fc70f7344af08db49b06c63c963fa12ee354, et al, for more details.
MFC after: 6 days
MFC with: 1731fc70f7344af08db49b06c63c963fa12ee354
Fixes: 1731fc70f7344af08d ("OpenSSL: update vendor sources to match 3.5.5 content")
| -rw-r--r-- | sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S | 14 | ||||
| -rw-r--r-- | sys/crypto/openssl/arm_arch.h | 369 | ||||
| -rw-r--r-- | sys/crypto/openssl/powerpc/aes-gcm-ppc.S | 2118 | ||||
| -rw-r--r-- | sys/crypto/openssl/powerpc64/aes-gcm-ppc.S | 2119 | ||||
| -rw-r--r-- | sys/crypto/openssl/powerpc64le/aes-gcm-ppc.S | 2119 |
5 files changed, 2850 insertions, 3889 deletions
diff --git a/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S b/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S index 5627d6d1c6b4..b8c728e68683 100644 --- a/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S +++ b/sys/crypto/openssl/aarch64/vpsm4_ex-armv8.S @@ -1,5 +1,5 @@ /* Do not modify. This file is auto-generated from vpsm4_ex-armv8.pl. */ -// Copyright 2022-2025 The OpenSSL Project Authors. All Rights Reserved. +// Copyright 2022-2026 The OpenSSL Project Authors. All Rights Reserved. // // Licensed under the Apache License 2.0 (the "License"). You may not use // this file except in compliance with the License. You can obtain a copy @@ -35,13 +35,25 @@ _vpsm4_ex_consts: .Lshuffles: .quad 0x0B0A090807060504,0x030201000F0E0D0C .Lxts_magic: +#ifndef __AARCH64EB__ .quad 0x0101010101010187,0x0101010101010101 +#else +.quad 0x0101010101010101,0x0101010101010187 +#endif .Lsbox_magic: +#ifndef __AARCH64EB__ .quad 0x0b0e0104070a0d00,0x0306090c0f020508 .quad 0x62185a2042387a00,0x22581a6002783a40 .quad 0x15df62a89e54e923,0xc10bb67c4a803df7 .quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead .quad 0x6404462679195b3b,0xe383c1a1fe9edcbc +#else +.quad 0x0306090c0f020508,0x0b0e0104070a0d00 +.quad 0x22581a6002783a40,0x62185a2042387a00 +.quad 0xc10bb67c4a803df7,0x15df62a89e54e923 +.quad 0x1407c6d56c7fbead,0xb9aa6b78c1d21300 +.quad 0xe383c1a1fe9edcbc,0x6404462679195b3b +#endif .quad 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f .size _vpsm4_ex_consts,.-_vpsm4_ex_consts diff --git a/sys/crypto/openssl/arm_arch.h b/sys/crypto/openssl/arm_arch.h index acd8aee4d519..d570d1eba6c1 100644 --- a/sys/crypto/openssl/arm_arch.h +++ b/sys/crypto/openssl/arm_arch.h @@ -8,87 +8,80 @@ */ #ifndef OSSL_CRYPTO_ARM_ARCH_H -# define OSSL_CRYPTO_ARM_ARCH_H - -# if !defined(__ARM_ARCH__) -# if defined(__CC_ARM) -# define __ARM_ARCH__ __TARGET_ARCH_ARM -# if defined(__BIG_ENDIAN) -# define __ARMEB__ -# else -# define __ARMEL__ -# endif -# elif defined(__GNUC__) -# if defined(__aarch64__) -# define __ARM_ARCH__ 8 - /* - * Why doesn't 
gcc define __ARM_ARCH__? Instead it defines - * bunch of below macros. See all_architectures[] table in - * gcc/config/arm/arm.c. On a side note it defines - * __ARMEL__/__ARMEB__ for little-/big-endian. - */ -# elif defined(__ARM_ARCH) -# define __ARM_ARCH__ __ARM_ARCH -# elif defined(__ARM_ARCH_8A__) -# define __ARM_ARCH__ 8 -# elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ - defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \ - defined(__ARM_ARCH_7EM__) -# define __ARM_ARCH__ 7 -# elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ - defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__) || \ - defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__) || \ - defined(__ARM_ARCH_6T2__) -# define __ARM_ARCH__ 6 -# elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \ - defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__) || \ - defined(__ARM_ARCH_5TEJ__) -# define __ARM_ARCH__ 5 -# elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) -# define __ARM_ARCH__ 4 -# else -# error "unsupported ARM architecture" -# endif -# elif defined(__ARM_ARCH) -# define __ARM_ARCH__ __ARM_ARCH -# endif -# endif - -# if !defined(__ARM_MAX_ARCH__) -# define __ARM_MAX_ARCH__ __ARM_ARCH__ -# endif - -# if __ARM_MAX_ARCH__<__ARM_ARCH__ -# error "__ARM_MAX_ARCH__ can't be less than __ARM_ARCH__" -# elif __ARM_MAX_ARCH__!=__ARM_ARCH__ -# if __ARM_ARCH__<7 && __ARM_MAX_ARCH__>=7 && defined(__ARMEB__) -# error "can't build universal big-endian binary" -# endif -# endif - -# ifndef __ASSEMBLER__ +#define OSSL_CRYPTO_ARM_ARCH_H + +#if !defined(__ARM_ARCH__) +#if defined(__CC_ARM) +#define __ARM_ARCH__ __TARGET_ARCH_ARM +#if defined(__BIG_ENDIAN) +#define __ARMEB__ +#else +#define __ARMEL__ +#endif +#elif defined(__GNUC__) +#if defined(__aarch64__) +#define __ARM_ARCH__ 8 +/* + * Why doesn't gcc define __ARM_ARCH__? Instead it defines + * bunch of below macros. See all_architectures[] table in + * gcc/config/arm/arm.c. 
On a side note it defines + * __ARMEL__/__ARMEB__ for little-/big-endian. + */ +#elif defined(__ARM_ARCH) +#define __ARM_ARCH__ __ARM_ARCH +#elif defined(__ARM_ARCH_8A__) +#define __ARM_ARCH__ 8 +#elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__) +#define __ARM_ARCH__ 7 +#elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) +#define __ARM_ARCH__ 6 +#elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) +#define __ARM_ARCH__ 5 +#elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) +#define __ARM_ARCH__ 4 +#else +#error "unsupported ARM architecture" +#endif +#elif defined(__ARM_ARCH) +#define __ARM_ARCH__ __ARM_ARCH +#endif +#endif + +#if !defined(__ARM_MAX_ARCH__) +#define __ARM_MAX_ARCH__ __ARM_ARCH__ +#endif + +#if __ARM_MAX_ARCH__ < __ARM_ARCH__ +#error "__ARM_MAX_ARCH__ can't be less than __ARM_ARCH__" +#elif __ARM_MAX_ARCH__ != __ARM_ARCH__ +#if __ARM_ARCH__ < 7 && __ARM_MAX_ARCH__ >= 7 && defined(__ARMEB__) +#error "can't build universal big-endian binary" +#endif +#endif + +#ifndef __ASSEMBLER__ extern unsigned int OPENSSL_armcap_P; extern unsigned int OPENSSL_arm_midr; extern unsigned int OPENSSL_armv8_rsa_neonized; -# endif - -# define ARMV7_NEON (1<<0) -# define ARMV7_TICK (1<<1) -# define ARMV8_AES (1<<2) -# define ARMV8_SHA1 (1<<3) -# define ARMV8_SHA256 (1<<4) -# define ARMV8_PMULL (1<<5) -# define ARMV8_SHA512 (1<<6) -# define ARMV8_CPUID (1<<7) -# define ARMV8_RNG (1<<8) -# define ARMV8_SM3 (1<<9) -# define ARMV8_SM4 (1<<10) -# define ARMV8_SHA3 (1<<11) -# define ARMV8_UNROLL8_EOR3 (1<<12) -# define ARMV8_SVE (1<<13) -# define ARMV8_SVE2 (1<<14) -# define ARMV8_HAVE_SHA3_AND_WORTH_USING (1<<15) -# define ARMV8_UNROLL12_EOR3 
(1<<16) +#endif + +#define ARMV7_NEON (1 << 0) +#define ARMV7_TICK (1 << 1) +#define ARMV8_AES (1 << 2) +#define ARMV8_SHA1 (1 << 3) +#define ARMV8_SHA256 (1 << 4) +#define ARMV8_PMULL (1 << 5) +#define ARMV8_SHA512 (1 << 6) +#define ARMV8_CPUID (1 << 7) +#define ARMV8_RNG (1 << 8) +#define ARMV8_SM3 (1 << 9) +#define ARMV8_SM4 (1 << 10) +#define ARMV8_SHA3 (1 << 11) +#define ARMV8_UNROLL8_EOR3 (1 << 12) +#define ARMV8_SVE (1 << 13) +#define ARMV8_SVE2 (1 << 14) +#define ARMV8_HAVE_SHA3_AND_WORTH_USING (1 << 15) +#define ARMV8_UNROLL12_EOR3 (1 << 16) /* * MIDR_EL1 system register @@ -100,120 +93,116 @@ extern unsigned int OPENSSL_armv8_rsa_neonized; * */ -# define ARM_CPU_IMP_ARM 0x41 -# define HISI_CPU_IMP 0x48 -# define ARM_CPU_IMP_APPLE 0x61 -# define ARM_CPU_IMP_MICROSOFT 0x6D -# define ARM_CPU_IMP_AMPERE 0xC0 - -# define ARM_CPU_PART_CORTEX_A72 0xD08 -# define ARM_CPU_PART_N1 0xD0C -# define ARM_CPU_PART_V1 0xD40 -# define ARM_CPU_PART_N2 0xD49 -# define HISI_CPU_PART_KP920 0xD01 -# define ARM_CPU_PART_V2 0xD4F - -# define APPLE_CPU_PART_M1_ICESTORM 0x022 -# define APPLE_CPU_PART_M1_FIRESTORM 0x023 -# define APPLE_CPU_PART_M1_ICESTORM_PRO 0x024 -# define APPLE_CPU_PART_M1_FIRESTORM_PRO 0x025 -# define APPLE_CPU_PART_M1_ICESTORM_MAX 0x028 -# define APPLE_CPU_PART_M1_FIRESTORM_MAX 0x029 -# define APPLE_CPU_PART_M2_BLIZZARD 0x032 -# define APPLE_CPU_PART_M2_AVALANCHE 0x033 -# define APPLE_CPU_PART_M2_BLIZZARD_PRO 0x034 -# define APPLE_CPU_PART_M2_AVALANCHE_PRO 0x035 -# define APPLE_CPU_PART_M2_BLIZZARD_MAX 0x038 -# define APPLE_CPU_PART_M2_AVALANCHE_MAX 0x039 - -# define MICROSOFT_CPU_PART_COBALT_100 0xD49 - -# define MIDR_PARTNUM_SHIFT 4 -# define MIDR_PARTNUM_MASK (0xfffU << MIDR_PARTNUM_SHIFT) -# define MIDR_PARTNUM(midr) \ - (((midr) & MIDR_PARTNUM_MASK) >> MIDR_PARTNUM_SHIFT) - -# define MIDR_IMPLEMENTER_SHIFT 24 -# define MIDR_IMPLEMENTER_MASK (0xffU << MIDR_IMPLEMENTER_SHIFT) -# define MIDR_IMPLEMENTER(midr) \ - (((midr) & MIDR_IMPLEMENTER_MASK) >> 
MIDR_IMPLEMENTER_SHIFT) - -# define MIDR_ARCHITECTURE_SHIFT 16 -# define MIDR_ARCHITECTURE_MASK (0xfU << MIDR_ARCHITECTURE_SHIFT) -# define MIDR_ARCHITECTURE(midr) \ - (((midr) & MIDR_ARCHITECTURE_MASK) >> MIDR_ARCHITECTURE_SHIFT) - -# define MIDR_CPU_MODEL_MASK \ - (MIDR_IMPLEMENTER_MASK | \ - MIDR_PARTNUM_MASK | \ - MIDR_ARCHITECTURE_MASK) - -# define MIDR_CPU_MODEL(imp, partnum) \ - (((imp) << MIDR_IMPLEMENTER_SHIFT) | \ - (0xfU << MIDR_ARCHITECTURE_SHIFT) | \ - ((partnum) << MIDR_PARTNUM_SHIFT)) - -# define MIDR_IS_CPU_MODEL(midr, imp, partnum) \ - (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum)) +#define ARM_CPU_IMP_ARM 0x41 +#define HISI_CPU_IMP 0x48 +#define ARM_CPU_IMP_APPLE 0x61 +#define ARM_CPU_IMP_MICROSOFT 0x6D +#define ARM_CPU_IMP_AMPERE 0xC0 + +#define ARM_CPU_PART_CORTEX_A72 0xD08 +#define ARM_CPU_PART_N1 0xD0C +#define ARM_CPU_PART_V1 0xD40 +#define ARM_CPU_PART_N2 0xD49 +#define HISI_CPU_PART_KP920 0xD01 +#define ARM_CPU_PART_V2 0xD4F + +#define APPLE_CPU_PART_M1_ICESTORM 0x022 +#define APPLE_CPU_PART_M1_FIRESTORM 0x023 +#define APPLE_CPU_PART_M1_ICESTORM_PRO 0x024 +#define APPLE_CPU_PART_M1_FIRESTORM_PRO 0x025 +#define APPLE_CPU_PART_M1_ICESTORM_MAX 0x028 +#define APPLE_CPU_PART_M1_FIRESTORM_MAX 0x029 +#define APPLE_CPU_PART_M2_BLIZZARD 0x032 +#define APPLE_CPU_PART_M2_AVALANCHE 0x033 +#define APPLE_CPU_PART_M2_BLIZZARD_PRO 0x034 +#define APPLE_CPU_PART_M2_AVALANCHE_PRO 0x035 +#define APPLE_CPU_PART_M2_BLIZZARD_MAX 0x038 +#define APPLE_CPU_PART_M2_AVALANCHE_MAX 0x039 + +#define MICROSOFT_CPU_PART_COBALT_100 0xD49 + +#define MIDR_PARTNUM_SHIFT 4 +#define MIDR_PARTNUM_MASK (0xfffU << MIDR_PARTNUM_SHIFT) +#define MIDR_PARTNUM(midr) \ + (((midr) & MIDR_PARTNUM_MASK) >> MIDR_PARTNUM_SHIFT) + +#define MIDR_IMPLEMENTER_SHIFT 24 +#define MIDR_IMPLEMENTER_MASK (0xffU << MIDR_IMPLEMENTER_SHIFT) +#define MIDR_IMPLEMENTER(midr) \ + (((midr) & MIDR_IMPLEMENTER_MASK) >> MIDR_IMPLEMENTER_SHIFT) + +#define MIDR_ARCHITECTURE_SHIFT 16 +#define 
MIDR_ARCHITECTURE_MASK (0xfU << MIDR_ARCHITECTURE_SHIFT) +#define MIDR_ARCHITECTURE(midr) \ + (((midr) & MIDR_ARCHITECTURE_MASK) >> MIDR_ARCHITECTURE_SHIFT) + +#define MIDR_CPU_MODEL_MASK \ + (MIDR_IMPLEMENTER_MASK | MIDR_PARTNUM_MASK | MIDR_ARCHITECTURE_MASK) + +#define MIDR_CPU_MODEL(imp, partnum) \ + (((imp) << MIDR_IMPLEMENTER_SHIFT) | (0xfU << MIDR_ARCHITECTURE_SHIFT) | ((partnum) << MIDR_PARTNUM_SHIFT)) + +#define MIDR_IS_CPU_MODEL(midr, imp, partnum) \ + (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum)) #if defined(__ASSEMBLER__) - /* - * Support macros for - * - Armv8.3-A Pointer Authentication and - * - Armv8.5-A Branch Target Identification - * features which require emitting a .note.gnu.property section with the - * appropriate architecture-dependent feature bits set. - * Read more: "ELF for the Arm® 64-bit Architecture" - */ - -# if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1 -# define GNU_PROPERTY_AARCH64_BTI (1 << 0) /* Has Branch Target Identification */ -# define AARCH64_VALID_CALL_TARGET hint #34 /* BTI 'c' */ -# else -# define GNU_PROPERTY_AARCH64_BTI 0 /* No Branch Target Identification */ -# define AARCH64_VALID_CALL_TARGET -# endif - -# if defined(__ARM_FEATURE_PAC_DEFAULT) && \ - (__ARM_FEATURE_PAC_DEFAULT & 1) == 1 /* Signed with A-key */ -# define GNU_PROPERTY_AARCH64_POINTER_AUTH \ - (1 << 1) /* Has Pointer Authentication */ -# define AARCH64_SIGN_LINK_REGISTER hint #25 /* PACIASP */ -# define AARCH64_VALIDATE_LINK_REGISTER hint #29 /* AUTIASP */ -# elif defined(__ARM_FEATURE_PAC_DEFAULT) && \ - (__ARM_FEATURE_PAC_DEFAULT & 2) == 2 /* Signed with B-key */ -# define GNU_PROPERTY_AARCH64_POINTER_AUTH \ - (1 << 1) /* Has Pointer Authentication */ -# define AARCH64_SIGN_LINK_REGISTER hint #27 /* PACIBSP */ -# define AARCH64_VALIDATE_LINK_REGISTER hint #31 /* AUTIBSP */ -# else -# define GNU_PROPERTY_AARCH64_POINTER_AUTH 0 /* No Pointer Authentication */ -# if GNU_PROPERTY_AARCH64_BTI != 0 -# define
AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET -# else -# define AARCH64_SIGN_LINK_REGISTER -# endif -# define AARCH64_VALIDATE_LINK_REGISTER -# endif - -# if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0 - .pushsection .note.gnu.property, "a"; - .balign 8; - .long 4; - .long 0x10; - .long 0x5; - .asciz "GNU"; - .long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ - .long 4; - .long (GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI); - .long 0; - .popsection; -# endif - -# endif /* defined __ASSEMBLER__ */ - -# define IS_CPU_SUPPORT_UNROLL8_EOR3() \ - (OPENSSL_armcap_P & ARMV8_UNROLL8_EOR3) +/* + * Support macros for + * - Armv8.3-A Pointer Authentication and + * - Armv8.5-A Branch Target Identification + * features which require emitting a .note.gnu.property section with the + * appropriate architecture-dependent feature bits set. + * Read more: "ELF for the Arm® 64-bit Architecture" + */ + +#if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1 +#define GNU_PROPERTY_AARCH64_BTI (1 << 0) /* Has Branch Target Identification */ +#define AARCH64_VALID_CALL_TARGET hint #34 /* BTI 'c' */ +#else +#define GNU_PROPERTY_AARCH64_BTI 0 /* No Branch Target Identification */ +#define AARCH64_VALID_CALL_TARGET +#endif + +#if defined(__ARM_FEATURE_PAC_DEFAULT) && (__ARM_FEATURE_PAC_DEFAULT & 1) == 1 /* Signed with A-key */ +#define GNU_PROPERTY_AARCH64_POINTER_AUTH \ + (1 << 1) /* Has Pointer Authentication */ +#define AARCH64_SIGN_LINK_REGISTER hint #25 /* PACIASP */ +#define AARCH64_VALIDATE_LINK_REGISTER hint #29 /* AUTIASP */ +#elif defined(__ARM_FEATURE_PAC_DEFAULT) && (__ARM_FEATURE_PAC_DEFAULT & 2) == 2 /* Signed with B-key */ +#define GNU_PROPERTY_AARCH64_POINTER_AUTH \ + (1 << 1) /* Has Pointer Authentication */ +#define AARCH64_SIGN_LINK_REGISTER hint #27 /* PACIBSP */ +#define AARCH64_VALIDATE_LINK_REGISTER hint #31 /* AUTIBSP */ +#else +#define GNU_PROPERTY_AARCH64_POINTER_AUTH 0 /* No Pointer
Authentication */ +#if GNU_PROPERTY_AARCH64_BTI != 0 +#define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET +#else +#define AARCH64_SIGN_LINK_REGISTER +#endif +#define AARCH64_VALIDATE_LINK_REGISTER +#endif + +#if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0 +/* clang-format off */ +.pushsection .note.gnu.property, "a"; +/* clang-format on */ +.balign 8; +.long 4; +.long 0x10; +.long 0x5; +.asciz "GNU"; +.long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ +.long 4; +.long(GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI); +.long 0; +.popsection; +#endif + +#endif /* defined __ASSEMBLER__ */ + +#define IS_CPU_SUPPORT_UNROLL8_EOR3() \ + (OPENSSL_armcap_P & ARMV8_UNROLL8_EOR3) #endif diff --git a/sys/crypto/openssl/powerpc/aes-gcm-ppc.S b/sys/crypto/openssl/powerpc/aes-gcm-ppc.S index 23a8feb24745..51cfac7e45fc 100644 --- a/sys/crypto/openssl/powerpc/aes-gcm-ppc.S +++ b/sys/crypto/openssl/powerpc/aes-gcm-ppc.S @@ -1,531 +1,587 @@ /* Do not modify. This file is auto-generated from aes-gcm-ppc.pl. 
*/ -.machine "any" +.machine "any" .text - - - - -.macro .Loop_aes_middle4x - xxlor 19+32, 1, 1 - xxlor 20+32, 2, 2 - xxlor 21+32, 3, 3 - xxlor 22+32, 4, 4 - - .long 0x11EF9D08 - .long 0x12109D08 - .long 0x12319D08 - .long 0x12529D08 - - .long 0x11EFA508 - .long 0x1210A508 - .long 0x1231A508 - .long 0x1252A508 - - .long 0x11EFAD08 - .long 0x1210AD08 - .long 0x1231AD08 - .long 0x1252AD08 - - .long 0x11EFB508 - .long 0x1210B508 - .long 0x1231B508 - .long 0x1252B508 - - xxlor 19+32, 5, 5 - xxlor 20+32, 6, 6 - xxlor 21+32, 7, 7 - xxlor 22+32, 8, 8 - - .long 0x11EF9D08 - .long 0x12109D08 - .long 0x12319D08 - .long 0x12529D08 - - .long 0x11EFA508 - .long 0x1210A508 - .long 0x1231A508 - .long 0x1252A508 - - .long 0x11EFAD08 - .long 0x1210AD08 - .long 0x1231AD08 - .long 0x1252AD08 - - .long 0x11EFB508 - .long 0x1210B508 - .long 0x1231B508 - .long 0x1252B508 - - xxlor 23+32, 9, 9 - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 -.endm - - - - - -.macro .Loop_aes_middle8x - xxlor 23+32, 1, 1 - xxlor 24+32, 2, 2 - xxlor 25+32, 3, 3 - xxlor 26+32, 4, 4 - - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 - - .long 0x11EFC508 - .long 0x1210C508 - .long 0x1231C508 - .long 0x1252C508 - .long 0x1273C508 - .long 0x1294C508 - .long 0x12B5C508 - .long 0x12D6C508 - - .long 0x11EFCD08 - .long 0x1210CD08 - .long 0x1231CD08 - .long 0x1252CD08 - .long 0x1273CD08 - .long 0x1294CD08 - .long 0x12B5CD08 - .long 0x12D6CD08 - - .long 0x11EFD508 - .long 0x1210D508 - .long 0x1231D508 - .long 0x1252D508 - .long 0x1273D508 - .long 0x1294D508 - .long 0x12B5D508 - .long 0x12D6D508 - - xxlor 23+32, 5, 5 - xxlor 24+32, 6, 6 - xxlor 25+32, 7, 7 - xxlor 26+32, 8, 8 - - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 - - .long 0x11EFC508 - .long 0x1210C508 - .long 0x1231C508 - 
.long 0x1252C508 - .long 0x1273C508 - .long 0x1294C508 - .long 0x12B5C508 - .long 0x12D6C508 - - .long 0x11EFCD08 - .long 0x1210CD08 - .long 0x1231CD08 - .long 0x1252CD08 - .long 0x1273CD08 - .long 0x1294CD08 - .long 0x12B5CD08 - .long 0x12D6CD08 - - .long 0x11EFD508 - .long 0x1210D508 - .long 0x1231D508 - .long 0x1252D508 - .long 0x1273D508 - .long 0x1294D508 - .long 0x12B5D508 - .long 0x12D6D508 - - xxlor 23+32, 9, 9 - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 -.endm - - - - -ppc_aes_gcm_ghash: - vxor 15, 15, 0 - - xxlxor 29, 29, 29 - - .long 0x12EC7CC8 - .long 0x130984C8 - .long 0x13268CC8 - .long 0x134394C8 - - vxor 23, 23, 24 - vxor 23, 23, 25 - vxor 23, 23, 26 - - .long 0x130D7CC8 - .long 0x132A84C8 - .long 0x13478CC8 - .long 0x136494C8 - - vxor 24, 24, 25 - vxor 24, 24, 26 - vxor 24, 24, 27 - - - .long 0x139714C8 - - xxlor 29+32, 29, 29 - vsldoi 26, 24, 29, 8 - vsldoi 29, 29, 24, 8 - vxor 23, 23, 26 - - vsldoi 23, 23, 23, 8 - vxor 23, 23, 28 - - .long 0x130E7CC8 - .long 0x132B84C8 - .long 0x13488CC8 - .long 0x136594C8 - - vxor 24, 24, 25 - vxor 24, 24, 26 - vxor 24, 24, 27 - - vxor 24, 24, 29 - - - vsldoi 27, 23, 23, 8 - .long 0x12F714C8 - vxor 27, 27, 24 - vxor 23, 23, 27 - - xxlor 32, 23+32, 23+32 - - blr - - - - - -.macro ppc_aes_gcm_ghash2_4x - - vxor 15, 15, 0 - - xxlxor 29, 29, 29 - - .long 0x12EC7CC8 - .long 0x130984C8 - .long 0x13268CC8 - .long 0x134394C8 +.macro SAVE_REGS + mflr 0 + std 0, 16(1) + stdu 1,-512(1) + + std 14, 112(1) + std 15, 120(1) + std 16, 128(1) + std 17, 136(1) + std 18, 144(1) + std 19, 152(1) + std 20, 160(1) + std 21, 168(1) + std 22, 176(1) + std 23, 184(1) + std 24, 192(1) + + stxv 32+20, 256(1) + stxv 32+21, 256+16(1) + stxv 32+22, 256+32(1) + stxv 32+23, 256+48(1) + stxv 32+24, 256+64(1) + stxv 32+25, 256+80(1) + stxv 32+26, 256+96(1) + stxv 32+27, 256+112(1) + stxv 32+28, 256+128(1) + stxv 32+29, 256+144(1) + stxv 
32+30, 256+160(1) + stxv 32+31, 256+176(1) +.endm # SAVE_REGS + +.macro RESTORE_REGS + lxv 32+20, 256(1) + lxv 32+21, 256+16(1) + lxv 32+22, 256+32(1) + lxv 32+23, 256+48(1) + lxv 32+24, 256+64(1) + lxv 32+25, 256+80(1) + lxv 32+26, 256+96(1) + lxv 32+27, 256+112(1) + lxv 32+28, 256+128(1) + lxv 32+29, 256+144(1) + lxv 32+30, 256+160(1) + lxv 32+31, 256+176(1) + + ld 14, 112(1) + ld 15, 120(1) + ld 16, 128(1) + ld 17, 136(1) + ld 18, 144(1) + ld 19, 152(1) + ld 20, 160(1) + ld 21, 168(1) + ld 22, 176(1) + ld 23, 184(1) + ld 24, 192(1) + + addi 1, 1, 512 + ld 0, 16(1) + mtlr 0 +.endm # RESTORE_REGS + +# 4x loops +.macro AES_CIPHER_4x r + vcipher 15, 15, \r + vcipher 16, 16, \r + vcipher 17, 17, \r + vcipher 18, 18, \r +.endm + +# 8x loops +.macro AES_CIPHER_8x r + vcipher 15, 15, \r + vcipher 16, 16, \r + vcipher 17, 17, \r + vcipher 18, 18, \r + vcipher 19, 19, \r + vcipher 20, 20, \r + vcipher 21, 21, \r + vcipher 22, 22, \r +.endm + +.macro LOOP_8AES_STATE + AES_CIPHER_8x 23 + AES_CIPHER_8x 24 + AES_CIPHER_8x 25 + AES_CIPHER_8x 26 + AES_CIPHER_8x 27 + AES_CIPHER_8x 28 + AES_CIPHER_8x 29 + AES_CIPHER_8x 1 +.endm + +# +# PPC_GFMUL128_8x: Compute hash values of 8 blocks based on Karatsuba method. 
+# +# S1 should xor with the previous digest +# +# Xi = v0 +# H Poly = v2 +# Hash keys = v3 - v14 +# vs10: vpermxor vector +# Scratch: v23 - v29 +# +.macro PPC_GFMUL128_8x + + vpmsumd 23, 12, 15 # H4.L * X.L + vpmsumd 24, 9, 16 + vpmsumd 25, 6, 17 + vpmsumd 26, 3, 18 vxor 23, 23, 24 vxor 23, 23, 25 - vxor 23, 23, 26 - - .long 0x130D7CC8 - .long 0x132A84C8 - .long 0x13478CC8 - .long 0x136494C8 - - vxor 24, 24, 25 - vxor 24, 24, 26 - - - .long 0x139714C8 - - xxlor 29+32, 29, 29 - - vxor 24, 24, 27 - vsldoi 26, 24, 29, 8 - vsldoi 29, 29, 24, 8 - vxor 23, 23, 26 - - vsldoi 23, 23, 23, 8 - vxor 23, 23, 28 + vxor 23, 23, 26 # L - .long 0x130E7CC8 - .long 0x132B84C8 - .long 0x13488CC8 - .long 0x136594C8 + vpmsumd 27, 13, 15 # H4.L * X.H + H4.H * X.L + vpmsumd 28, 10, 16 # H3.L * X1.H + H3.H * X1.L + vpmsumd 25, 7, 17 + vpmsumd 26, 4, 18 + vxor 24, 27, 28 vxor 24, 24, 25 - vxor 24, 24, 26 - vxor 24, 24, 27 - - vxor 24, 24, 29 - - - vsldoi 27, 23, 23, 8 - .long 0x12F714C8 - vxor 27, 27, 24 - vxor 27, 23, 27 - - - .long 0x1309A4C8 - .long 0x1326ACC8 - .long 0x1343B4C8 - vxor 19, 19, 27 - .long 0x12EC9CC8 + vxor 24, 24, 26 # M + + vpmsumd 26, 14, 15 # H4.H * X.H + vpmsumd 27, 11, 16 + vpmsumd 28, 8, 17 + vpmsumd 29, 5, 18 + + vxor 26, 26, 27 + vxor 26, 26, 28 + vxor 26, 26, 29 + + # sum hash and reduction with H Poly + vpmsumd 28, 23, 2 # reduction + + vxor 1, 1, 1 + vsldoi 25, 24, 1, 8 # mL + vsldoi 1, 1, 24, 8 # mH + vxor 23, 23, 25 # mL + L + + # This performs swap and xor like, + # vsldoi 23, 23, 23, 8 # swap + # vxor 23, 23, 28 + xxlor 32+29, 10, 10 + vpermxor 23, 23, 28, 29 + + vxor 24, 26, 1 # H + + # sum hash and reduction with H Poly + # + # vsldoi 25, 23, 23, 8 # swap + # vpmsumd 23, 23, 2 + # vxor 27, 25, 24 + # + vpermxor 27, 23, 24, 29 + vpmsumd 23, 23, 2 + vxor 0, 23, 27 # Digest of 4 blocks + + vxor 19, 19, 0 + + # Compute digest for the next 4 blocks + vpmsumd 24, 9, 20 + vpmsumd 25, 6, 21 + vpmsumd 26, 3, 22 + vpmsumd 23, 12, 19 # H4.L * X.L vxor 23, 23, 24 
vxor 23, 23, 25 - vxor 23, 23, 26 + vxor 23, 23, 26 # L - .long 0x130D9CC8 - .long 0x132AA4C8 - .long 0x1347ACC8 - .long 0x1364B4C8 + vpmsumd 27, 13, 19 # H4.L * X.H + H4.H * X.L + vpmsumd 28, 10, 20 # H3.L * X1.H + H3.H * X1.L + vpmsumd 25, 7, 21 + vpmsumd 26, 4, 22 + vxor 24, 27, 28 vxor 24, 24, 25 - vxor 24, 24, 26 - - - .long 0x139714C8 - - xxlor 29+32, 29, 29 - - vxor 24, 24, 27 - vsldoi 26, 24, 29, 8 - vsldoi 29, 29, 24, 8 - vxor 23, 23, 26 - - vsldoi 23, 23, 23, 8 - vxor 23, 23, 28 - - .long 0x130E9CC8 - .long 0x132BA4C8 - .long 0x1348ACC8 - .long 0x1365B4C8 - - vxor 24, 24, 25 - vxor 24, 24, 26 - vxor 24, 24, 27 - - vxor 24, 24, 29 - - - vsldoi 27, 23, 23, 8 - .long 0x12F714C8 - vxor 27, 27, 24 - vxor 23, 23, 27 - - xxlor 32, 23+32, 23+32 - -.endm - - - - -.macro ppc_update_hash_1x - vxor 28, 28, 0 - - vxor 19, 19, 19 - - .long 0x12C3E4C8 - .long 0x12E4E4C8 - .long 0x1305E4C8 - - .long 0x137614C8 - - vsldoi 25, 23, 19, 8 - vsldoi 26, 19, 23, 8 - vxor 22, 22, 25 - vxor 24, 24, 26 - - vsldoi 22, 22, 22, 8 - vxor 22, 22, 27 - - vsldoi 20, 22, 22, 8 - .long 0x12D614C8 - vxor 20, 20, 24 - vxor 22, 22, 20 - - vor 0,22,22 - -.endm - - - - - - - - - - - - - -.global ppc_aes_gcm_encrypt -.align 5 -ppc_aes_gcm_encrypt: -_ppc_aes_gcm_encrypt: - - stdu 1,-512(1) - mflr 0 - - std 14,112(1) - std 15,120(1) - std 16,128(1) - std 17,136(1) - std 18,144(1) - std 19,152(1) - std 20,160(1) - std 21,168(1) - li 9, 256 - stvx 20, 9, 1 - addi 9, 9, 16 - stvx 21, 9, 1 - addi 9, 9, 16 - stvx 22, 9, 1 - addi 9, 9, 16 - stvx 23, 9, 1 - addi 9, 9, 16 - stvx 24, 9, 1 - addi 9, 9, 16 - stvx 25, 9, 1 - addi 9, 9, 16 - stvx 26, 9, 1 - addi 9, 9, 16 - stvx 27, 9, 1 - addi 9, 9, 16 - stvx 28, 9, 1 - addi 9, 9, 16 - stvx 29, 9, 1 - addi 9, 9, 16 - stvx 30, 9, 1 - addi 9, 9, 16 - stvx 31, 9, 1 - std 0, 528(1) - - - lxvb16x 32, 0, 8 - + vxor 24, 24, 26 # M + + vpmsumd 26, 14, 19 # H4.H * X.H + vpmsumd 27, 11, 20 + vpmsumd 28, 8, 21 + vpmsumd 29, 5, 22 + + vxor 26, 26, 27 + vxor 26, 26, 28 + 
vxor 26, 26, 29 + + # sum hash and reduction with H Poly + vpmsumd 28, 23, 2 # reduction + + vxor 1, 1, 1 + vsldoi 25, 24, 1, 8 # mL + vsldoi 1, 1, 24, 8 # mH + vxor 23, 23, 25 # mL + L + + # This performs swap and xor like, + # vsldoi 23, 23, 23, 8 # swap + # vxor 23, 23, 28 + xxlor 32+29, 10, 10 + vpermxor 23, 23, 28, 29 + + vxor 24, 26, 1 # H + + # sum hash and reduction with H Poly + # + # vsldoi 25, 23, 23, 8 # swap + # vpmsumd 23, 23, 2 + # vxor 27, 25, 24 + # + vpermxor 27, 23, 24, 29 + vpmsumd 23, 23, 2 + vxor 0, 23, 27 # Digest of 8 blocks +.endm + +# +# Compute update single ghash +# vs10: vpermxor vector +# scratch: v1, v22..v27 +# +.macro PPC_GHASH1x H S1 + + vxor 1, 1, 1 + + vpmsumd 22, 3, \S1 # L + vpmsumd 23, 4, \S1 # M + vpmsumd 24, 5, \S1 # H + + vpmsumd 27, 22, 2 # reduction + + vsldoi 25, 23, 1, 8 # mL + vsldoi 26, 1, 23, 8 # mH + vxor 22, 22, 25 # LL + LL + vxor 24, 24, 26 # HH + HH + + xxlor 32+25, 10, 10 + vpermxor 22, 22, 27, 25 + + # vsldoi 23, 22, 22, 8 # swap + # vpmsumd 22, 22, 2 # reduction + # vxor 23, 23, 24 + vpermxor 23, 22, 24, 25 + vpmsumd 22, 22, 2 # reduction + + vxor \H, 22, 23 +.endm + +# +# LOAD_HASH_TABLE +# Xi = v0 +# H Poly = v2 +# Hash keys = v3 - v14 +# +.macro LOAD_HASH_TABLE + # Load Xi + lxvb16x 32, 0, 8 # load Xi + + vxor 1, 1, 1 li 10, 32 - lxvd2x 2+32, 10, 8 - li 10, 48 - lxvd2x 3+32, 10, 8 - li 10, 64 - lxvd2x 4+32, 10, 8 - li 10, 80 - lxvd2x 5+32, 10, 8 + lxvd2x 2+32, 10, 8 # H Poli - li 10, 96 - lxvd2x 6+32, 10, 8 + # load Hash - h^4, h^3, h^2, h + li 10, 64 + lxvd2x 4+32, 10, 8 # H + vsldoi 3, 1, 4, 8 # l + vsldoi 5, 4, 1, 8 # h li 10, 112 - lxvd2x 7+32, 10, 8 - li 10, 128 - lxvd2x 8+32, 10, 8 - - li 10, 144 - lxvd2x 9+32, 10, 8 + lxvd2x 7+32, 10, 8 # H^2 + vsldoi 6, 1, 7, 8 # l + vsldoi 8, 7, 1, 8 # h li 10, 160 - lxvd2x 10+32, 10, 8 - li 10, 176 - lxvd2x 11+32, 10, 8 - - li 10, 192 - lxvd2x 12+32, 10, 8 + lxvd2x 10+32, 10, 8 # H^3 + vsldoi 9, 1, 10, 8 # l + vsldoi 11, 10, 1, 8 # h li 10, 208 - lxvd2x 13+32, 
10, 8 - li 10, 224 - lxvd2x 14+32, 10, 8 - - - lxvb16x 30+32, 0, 7 - - mr 12, 5 - li 11, 0 - - - vxor 31, 31, 31 - vspltisb 22,1 - vsldoi 31, 31, 22,1 - - - lxv 0, 0(6) - lxv 1, 0x10(6) - lxv 2, 0x20(6) - lxv 3, 0x30(6) - lxv 4, 0x40(6) - lxv 5, 0x50(6) - lxv 6, 0x60(6) - lxv 7, 0x70(6) - lxv 8, 0x80(6) - lxv 9, 0x90(6) - lxv 10, 0xa0(6) - - - lwz 9,240(6) + lxvd2x 13+32, 10, 8 # H^4 + vsldoi 12, 1, 13, 8 # l + vsldoi 14, 13, 1, 8 # h +.endm + +.macro PROCESS_8X_AES_STATES + vcipherlast 15, 15, 1 + vcipherlast 16, 16, 1 + vcipherlast 17, 17, 1 + vcipherlast 18, 18, 1 + vcipherlast 19, 19, 1 + vcipherlast 20, 20, 1 + vcipherlast 21, 21, 1 + vcipherlast 22, 22, 1 + + lxvb16x 32+23, 0, 14 # load block + lxvb16x 32+24, 15, 14 # load block + lxvb16x 32+25, 16, 14 # load block + lxvb16x 32+26, 17, 14 # load block + lxvb16x 32+27, 18, 14 # load block + lxvb16x 32+28, 19, 14 # load block + lxvb16x 32+29, 20, 14 # load block + lxvb16x 32+30, 21, 14 # load block + addi 14, 14, 128 + vxor 15, 15, 23 + vxor 16, 16, 24 + vxor 17, 17, 25 + vxor 18, 18, 26 + vxor 19, 19, 27 + vxor 20, 20, 28 + vxor 21, 21, 29 + vxor 22, 22, 30 + + stxvb16x 47, 0, 9 # store output + stxvb16x 48, 15, 9 # store output + stxvb16x 49, 16, 9 # store output + stxvb16x 50, 17, 9 # store output + stxvb16x 51, 18, 9 # store output + stxvb16x 52, 19, 9 # store output + stxvb16x 53, 20, 9 # store output + stxvb16x 54, 21, 9 # store output + addi 9, 9, 128 +.endm + +.macro COMPUTE_STATES + xxlor 32+15, 9, 9 # last state + vadduwm 15, 15, 31 # state + counter + vadduwm 16, 15, 31 + vadduwm 17, 16, 31 + vadduwm 18, 17, 31 + vadduwm 19, 18, 31 + vadduwm 20, 19, 31 + vadduwm 21, 20, 31 + vadduwm 22, 21, 31 + xxlor 9, 32+22, 32+22 # save last state + + xxlxor 32+15, 32+15, 0 # IV + round key - add round key 0 + xxlxor 32+16, 32+16, 0 + xxlxor 32+17, 32+17, 0 + xxlxor 32+18, 32+18, 0 + xxlxor 32+19, 32+19, 0 + xxlxor 32+20, 32+20, 0 + xxlxor 32+21, 32+21, 0 + xxlxor 32+22, 32+22, 0 +.endm + 
+################################################################################ +# Compute AES and ghash one block at a time. +# r23: AES rounds +# v30: current IV +# vs0: roundkey 0 +# +################################################################################ +.align 4 +aes_gcm_crypt_1x: +.localentry aes_gcm_crypt_1x,0 + + cmpdi 5, 16 + bge __More_1x + blr +__More_1x: + li 10, 16 + divdu 12, 5, 10 + + xxlxor 32+15, 32+30, 0 + + # Pre-load 8 AES rounds to scratch vectors. + lxv 32+16, 16(6) # round key 1 + lxv 32+17, 32(6) # round key 2 + lxv 32+18, 48(6) # round key 3 + lxv 32+19, 64(6) # round key 4 + lxv 32+20, 80(6) # round key 5 + lxv 32+21, 96(6) # round key 6 + lxv 32+28, 112(6) # round key 7 + lxv 32+29, 128(6) # round key 8 + + lwz 23, 240(6) # n rounds + addi 22, 23, -9 # remaining AES rounds + cmpdi 12, 0 + bgt __Loop_1x + blr + +__Loop_1x: + mtctr 22 + addi 10, 6, 144 + vcipher 15, 15, 16 + vcipher 15, 15, 17 + vcipher 15, 15, 18 + vcipher 15, 15, 19 + vcipher 15, 15, 20 + vcipher 15, 15, 21 + vcipher 15, 15, 28 + vcipher 15, 15, 29 + +__Loop_aes_1state: + lxv 32+1, 0(10) + vcipher 15, 15, 1 + addi 10, 10, 16 + bdnz __Loop_aes_1state + lxv 32+1, 0(10) # last round key + lxvb16x 11, 0, 14 # load input block + vcipherlast 15, 15, 1 - xxlor 32+29, 0, 0 - vxor 15, 30, 29 + xxlxor 32+15, 32+15, 11 + stxvb16x 32+15, 0, 9 # store output + addi 14, 14, 16 + addi 9, 9, 16 - cmpdi 9, 10 - beq .Loop_aes_gcm_8x + cmpdi 24, 0 # decrypt? + bne __Encrypt_1x + xxlor 15+32, 11, 11 +__Encrypt_1x: + vxor 15, 15, 0 + PPC_GHASH1x 0, 15 + addi 5, 5, -16 + addi 11, 11, 16 - lxv 11, 0xb0(6) - lxv 12, 0xc0(6) + vadduwm 30, 30, 31 # IV + counter + xxlxor 32+15, 32+30, 0 + addi 12, 12, -1 + cmpdi 12, 0 + bgt __Loop_1x + + stxvb16x 32+0, 0, 8 # update Xi + blr +.size aes_gcm_crypt_1x,.-aes_gcm_crypt_1x + +################################################################################ +# Process a normal partial block when we come here. 
+# Compute partial mask, Load and store partial block to stack. +# Compute AES state. +# Compute ghash. +# +################################################################################ +.align 4 +__Process_partial: +.localentry __Process_partial,0 + + # create partial mask + vspltisb 16, -1 + li 12, 16 + sub 12, 12, 5 + sldi 12, 12, 3 + mtvsrdd 32+17, 0, 12 + vslo 16, 16, 17 # partial block mask + + lxvb16x 11, 0, 14 # load partial block + xxland 11, 11, 32+16 + + # AES crypt partial + xxlxor 32+15, 32+30, 0 + lwz 23, 240(6) # n rounds + addi 22, 23, -1 # loop - 1 + mtctr 22 + addi 10, 6, 16 + +__Loop_aes_pstate: + lxv 32+1, 0(10) + vcipher 15, 15, 1 + addi 10, 10, 16 + bdnz __Loop_aes_pstate + lxv 32+1, 0(10) # last round key + vcipherlast 15, 15, 1 - cmpdi 9, 12 - beq .Loop_aes_gcm_8x + xxlxor 32+15, 32+15, 11 + vand 15, 15, 16 + # AES crypt output v15 + # Write partial + li 10, 224 + stxvb16x 15+32, 10, 1 # write v15 to stack + addi 10, 1, 223 + addi 12, 9, -1 + mtctr 5 # partial block len +__Write_partial: + lbzu 22, 1(10) + stbu 22, 1(12) + bdnz __Write_partial + + cmpdi 24, 0 # decrypt? + bne __Encrypt_partial + xxlor 32+15, 11, 11 # decrypt using the input block +__Encrypt_partial: + vxor 15, 15, 0 # ^ previous hash + PPC_GHASH1x 0, 15 + li 5, 0 # done last byte + stxvb16x 32+0, 0, 8 # Update X1 + blr +.size __Process_partial,.-__Process_partial + +################################################################################ +# ppc_aes_gcm_encrypt (const void *inp, void *out, size_t len, +# const char *rk, unsigned char iv[16], void *Xip); +# +# r3 - inp +# r4 - out +# r5 - len +# r6 - AES round keys +# r7 - iv +# r8 - Xi, HPoli, hash keys +# +# rounds is at offset 240 in rk +# Xi is at 0 in gcm_table (Xip). 
+# +################################################################################ +.global ppc_aes_gcm_encrypt +.align 5 +ppc_aes_gcm_encrypt: +.localentry ppc_aes_gcm_encrypt,0 - lxv 13, 0xd0(6) - lxv 14, 0xe0(6) - cmpdi 9, 14 - beq .Loop_aes_gcm_8x + SAVE_REGS + LOAD_HASH_TABLE - b aes_gcm_out + # initialize ICB: GHASH( IV ), IV - r7 + lxvb16x 30+32, 0, 7 # load IV - v30 -.align 5 -.Loop_aes_gcm_8x: mr 14, 3 mr 9, 4 + # counter 1 + vxor 31, 31, 31 + vspltisb 22, 1 + vsldoi 31, 31, 22,1 # counter 1 + + addis 11, 2, permx@toc@ha + addi 11, 11, permx@toc@l + lxv 10, 0(11) # vs10: vpermxor vector + li 11, 0 + lxv 0, 0(6) # round key 0 + + # + # Process different blocks + # + cmpdi 5, 128 + blt __Process_more_enc + + # load 9 round keys + lxv 32+23, 16(6) # round key 1 + lxv 32+24, 32(6) # round key 2 + lxv 32+25, 48(6) # round key 3 + lxv 32+26, 64(6) # round key 4 + lxv 32+27, 80(6) # round key 5 + lxv 32+28, 96(6) # round key 6 + lxv 32+29, 112(6) # round key 7 + lxv 32+1, 128(6) # round key 8 + + # load rounds - 10 (128), 12 (192), 14 (256) + lwz 23, 240(6) # n rounds + +__Process_encrypt: +# +# Process 8x AES/GCM blocks +# +__Process_8x_enc: + # 8x blocks li 10, 128 - divdu 10, 5, 10 - cmpdi 10, 0 - beq .Loop_last_block - - .long 0x13DEF8C0 - vxor 16, 30, 29 - .long 0x13DEF8C0 - vxor 17, 30, 29 - .long 0x13DEF8C0 - vxor 18, 30, 29 - .long 0x13DEF8C0 - vxor 19, 30, 29 - .long 0x13DEF8C0 - vxor 20, 30, 29 - .long 0x13DEF8C0 - vxor 21, 30, 29 - .long 0x13DEF8C0 - vxor 22, 30, 29 - - mtctr 10 + divdu 12, 5, 10 # n 128 bytes-blocks + + addi 12, 12, -1 # loop - 1 + + vmr 15, 30 # first state: IV + vadduwm 16, 15, 31 # state + counter + vadduwm 17, 16, 31 + vadduwm 18, 17, 31 + vadduwm 19, 18, 31 + vadduwm 20, 19, 31 + vadduwm 21, 20, 31 + vadduwm 22, 21, 31 + xxlor 9, 32+22, 32+22 # save last state + + # vxor state, state, w # addroundkey + xxlxor 32+15, 32+15, 0 # IV + round key - add round key 0 + xxlxor 32+16, 32+16, 0 + xxlxor 32+17, 32+17, 0 + xxlxor 32+18, 
32+18, 0 + xxlxor 32+19, 32+19, 0 + xxlxor 32+20, 32+20, 0 + xxlxor 32+21, 32+21, 0 + xxlxor 32+22, 32+22, 0 li 15, 16 li 16, 32 @@ -535,523 +591,185 @@ _ppc_aes_gcm_encrypt: li 20, 96 li 21, 112 - lwz 10, 240(6) - -.Loop_8x_block: - - lxvb16x 15, 0, 14 - lxvb16x 16, 15, 14 - lxvb16x 17, 16, 14 - lxvb16x 18, 17, 14 - lxvb16x 19, 18, 14 - lxvb16x 20, 19, 14 - lxvb16x 21, 20, 14 - lxvb16x 22, 21, 14 - addi 14, 14, 128 - -.Loop_aes_middle8x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_next_ghash - - - xxlor 24+32, 11, 11 - - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 - - .long 0x11EFC508 - .long 0x1210C508 - .long 0x1231C508 - .long 0x1252C508 - .long 0x1273C508 - .long 0x1294C508 - .long 0x12B5C508 - .long 0x12D6C508 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_next_ghash - - - xxlor 24+32, 13, 13 - - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 - - .long 0x11EFC508 - .long 0x1210C508 - .long 0x1231C508 - .long 0x1252C508 - .long 0x1273C508 - .long 0x1294C508 - .long 0x12B5C508 - .long 0x12D6C508 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_next_ghash - b aes_gcm_out - -Do_next_ghash: - - - - .long 0x11EFBD09 - .long 0x1210BD09 - - xxlxor 47, 47, 15 - stxvb16x 47, 0, 9 - xxlxor 48, 48, 16 - stxvb16x 48, 15, 9 - - .long 0x1231BD09 - .long 0x1252BD09 - - xxlxor 49, 49, 17 - stxvb16x 49, 16, 9 - xxlxor 50, 50, 18 - stxvb16x 50, 17, 9 - - .long 0x1273BD09 - .long 0x1294BD09 - - xxlxor 51, 51, 19 - stxvb16x 51, 18, 9 - xxlxor 52, 52, 20 - stxvb16x 52, 19, 9 - - .long 0x12B5BD09 - .long 0x12D6BD09 - - xxlxor 53, 53, 21 - stxvb16x 53, 20, 9 - xxlxor 54, 54, 22 - stxvb16x 54, 21, 9 - - addi 9, 9, 128 - - - ppc_aes_gcm_ghash2_4x - - xxlor 27+32, 0, 0 - .long 0x13DEF8C0 - vor 29,30,30 - vxor 15, 30, 27 - .long 0x13DEF8C0 - vxor 16, 30, 27 - .long 
0x13DEF8C0 - vxor 17, 30, 27 - .long 0x13DEF8C0 - vxor 18, 30, 27 - .long 0x13DEF8C0 - vxor 19, 30, 27 - .long 0x13DEF8C0 - vxor 20, 30, 27 - .long 0x13DEF8C0 - vxor 21, 30, 27 - .long 0x13DEF8C0 - vxor 22, 30, 27 - - addi 12, 12, -128 - addi 11, 11, 128 - - bdnz .Loop_8x_block - - vor 30,29,29 - -.Loop_last_block: - cmpdi 12, 0 - beq aes_gcm_out - - - li 10, 16 - divdu 10, 12, 10 - - mtctr 10 - - lwz 10, 240(6) - - cmpdi 12, 16 - blt Final_block - -.macro .Loop_aes_middle_1x - xxlor 19+32, 1, 1 - xxlor 20+32, 2, 2 - xxlor 21+32, 3, 3 - xxlor 22+32, 4, 4 - - .long 0x11EF9D08 - .long 0x11EFA508 - .long 0x11EFAD08 - .long 0x11EFB508 - - xxlor 19+32, 5, 5 - xxlor 20+32, 6, 6 - xxlor 21+32, 7, 7 - xxlor 22+32, 8, 8 - - .long 0x11EF9D08 - .long 0x11EFA508 - .long 0x11EFAD08 - .long 0x11EFB508 - - xxlor 19+32, 9, 9 - .long 0x11EF9D08 -.endm - -Next_rem_block: - lxvb16x 15, 0, 14 - -.Loop_aes_middle_1x - - xxlor 23+32, 10, 10 + # + # Pre-compute first 8 AES state and leave 1/3/5 more rounds + # for the loop. 
+ # + addi 22, 23, -9 # process 8 keys + mtctr 22 # AES key loop + addi 10, 6, 144 - cmpdi 10, 10 - beq Do_next_1x + LOOP_8AES_STATE # process 8 AES keys +__PreLoop_aes_state: + lxv 32+1, 0(10) # round key + AES_CIPHER_8x 1 + addi 10, 10, 16 + bdnz __PreLoop_aes_state + lxv 32+1, 0(10) # last round key (v1) - xxlor 24+32, 11, 11 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_next_1x - - - xxlor 24+32, 13, 13 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_next_1x - -Do_next_1x: - .long 0x11EFBD09 - - xxlxor 47, 47, 15 - stxvb16x 47, 0, 9 - addi 14, 14, 16 - addi 9, 9, 16 + cmpdi 12, 0 # Only one loop (8 block) + beq __Finish_ghash - vor 28,15,15 - ppc_update_hash_1x +# +# Loop 8x blocks and compute ghash +# +__Loop_8x_block_enc: + PROCESS_8X_AES_STATES - addi 12, 12, -16 - addi 11, 11, 16 - xxlor 19+32, 0, 0 - .long 0x13DEF8C0 - vxor 15, 30, 19 + # Compute ghash here + vxor 15, 15, 0 + PPC_GFMUL128_8x + + COMPUTE_STATES + + addi 5, 5, -128 + addi 11, 11, 128 + + lxv 32+23, 16(6) # round key 1 + lxv 32+24, 32(6) # round key 2 + lxv 32+25, 48(6) # round key 3 + lxv 32+26, 64(6) # round key 4 + lxv 32+27, 80(6) # round key 5 + lxv 32+28, 96(6) # round key 6 + lxv 32+29, 112(6) # round key 7 + lxv 32+1, 128(6) # round key 8 + + # Compute first 8 AES state and leave 1/3/5 more rounds + # for the loop. 
+ LOOP_8AES_STATE # process 8 AES keys + mtctr 22 # AES key loop + addi 10, 6, 144 + +__LastLoop_aes_state: + lxv 32+1, 0(10) # round key + AES_CIPHER_8x 1 + addi 10, 10, 16 + bdnz __LastLoop_aes_state - bdnz Next_rem_block + lxv 32+1, 0(10) # last round key (v1) + addi 12, 12, -1 cmpdi 12, 0 - beq aes_gcm_out - -Final_block: -.Loop_aes_middle_1x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_final_1x - - - xxlor 24+32, 11, 11 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_final_1x + bne __Loop_8x_block_enc + # + # Remainng blocks + # +__Finish_ghash: + PROCESS_8X_AES_STATES - xxlor 24+32, 13, 13 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_final_1x - -Do_final_1x: - .long 0x11EFBD09 - - lxvb16x 15, 0, 14 - xxlxor 47, 47, 15 - - - li 15, 16 - sub 15, 15, 12 + # Compute ghash here + vxor 15, 15, 0 + PPC_GFMUL128_8x - vspltisb 16,-1 - vspltisb 17,0 - li 10, 192 - stvx 16, 10, 1 - addi 10, 10, 16 - stvx 17, 10, 1 + # Update IV and Xi + xxlor 30+32, 9, 9 # last ctr + vadduwm 30, 30, 31 # increase ctr + stxvb16x 32+0, 0, 8 # update Xi - addi 10, 1, 192 - lxvb16x 16, 15, 10 - xxland 47, 47, 16 + addi 5, 5, -128 + addi 11, 11, 128 - vor 28,15,15 - ppc_update_hash_1x + # + # Done 8x blocks + # + cmpdi 5, 0 + beq aes_gcm_out - bl Write_partial_block +__Process_more_enc: + li 24, 1 # encrypt + bl aes_gcm_crypt_1x + cmpdi 5, 0 + beq aes_gcm_out + bl __Process_partial b aes_gcm_out +.size ppc_aes_gcm_encrypt,.-ppc_aes_gcm_encrypt - - - - - -Write_partial_block: - li 10, 192 - stxvb16x 15+32, 10, 1 - - - addi 10, 9, -1 - addi 16, 1, 191 - - mtctr 12 - li 15, 0 - -Write_last_byte: - lbzu 14, 1(16) - stbu 14, 1(10) - bdnz Write_last_byte - blr - -aes_gcm_out: - - stxvb16x 32, 0, 8 - add 3, 11, 12 - - li 9, 256 - lvx 20, 9, 1 - addi 9, 9, 16 - lvx 21, 9, 1 - addi 9, 9, 16 - lvx 22, 9, 1 - addi 9, 9, 16 - lvx 23, 9, 1 - addi 9, 9, 16 - lvx 24, 9, 1 - addi 9, 9, 16 - lvx 25, 9, 1 - addi 9, 9, 
16 - lvx 26, 9, 1 - addi 9, 9, 16 - lvx 27, 9, 1 - addi 9, 9, 16 - lvx 28, 9, 1 - addi 9, 9, 16 - lvx 29, 9, 1 - addi 9, 9, 16 - lvx 30, 9, 1 - addi 9, 9, 16 - lvx 31, 9, 1 - - ld 0, 528(1) - ld 14,112(1) - ld 15,120(1) - ld 16,128(1) - ld 17,136(1) - ld 18,144(1) - ld 19,152(1) - ld 20,160(1) - ld 21,168(1) - - mtlr 0 - addi 1, 1, 512 - blr - - - - -.global ppc_aes_gcm_decrypt -.align 5 +################################################################################ +# ppc_aes_gcm_decrypt (const void *inp, void *out, size_t len, +# const char *rk, unsigned char iv[16], void *Xip); +# 8x Decrypt +# +################################################################################ +.global ppc_aes_gcm_decrypt +.align 5 ppc_aes_gcm_decrypt: -_ppc_aes_gcm_decrypt: - - stdu 1,-512(1) - mflr 0 - - std 14,112(1) - std 15,120(1) - std 16,128(1) - std 17,136(1) - std 18,144(1) - std 19,152(1) - std 20,160(1) - std 21,168(1) - li 9, 256 - stvx 20, 9, 1 - addi 9, 9, 16 - stvx 21, 9, 1 - addi 9, 9, 16 - stvx 22, 9, 1 - addi 9, 9, 16 - stvx 23, 9, 1 - addi 9, 9, 16 - stvx 24, 9, 1 - addi 9, 9, 16 - stvx 25, 9, 1 - addi 9, 9, 16 - stvx 26, 9, 1 - addi 9, 9, 16 - stvx 27, 9, 1 - addi 9, 9, 16 - stvx 28, 9, 1 - addi 9, 9, 16 - stvx 29, 9, 1 - addi 9, 9, 16 - stvx 30, 9, 1 - addi 9, 9, 16 - stvx 31, 9, 1 - std 0, 528(1) - - - lxvb16x 32, 0, 8 - - - li 10, 32 - lxvd2x 2+32, 10, 8 - li 10, 48 - lxvd2x 3+32, 10, 8 - li 10, 64 - lxvd2x 4+32, 10, 8 - li 10, 80 - lxvd2x 5+32, 10, 8 - - li 10, 96 - lxvd2x 6+32, 10, 8 - li 10, 112 - lxvd2x 7+32, 10, 8 - li 10, 128 - lxvd2x 8+32, 10, 8 - - li 10, 144 - lxvd2x 9+32, 10, 8 - li 10, 160 - lxvd2x 10+32, 10, 8 - li 10, 176 - lxvd2x 11+32, 10, 8 - - li 10, 192 - lxvd2x 12+32, 10, 8 - li 10, 208 - lxvd2x 13+32, 10, 8 - li 10, 224 - lxvd2x 14+32, 10, 8 - - - lxvb16x 30+32, 0, 7 - - mr 12, 5 - li 11, 0 - +.localentry ppc_aes_gcm_decrypt, 0 - vxor 31, 31, 31 - vspltisb 22,1 - vsldoi 31, 31, 22,1 - - - lxv 0, 0(6) - lxv 1, 0x10(6) - lxv 2, 0x20(6) - 
lxv 3, 0x30(6) - lxv 4, 0x40(6) - lxv 5, 0x50(6) - lxv 6, 0x60(6) - lxv 7, 0x70(6) - lxv 8, 0x80(6) - lxv 9, 0x90(6) - lxv 10, 0xa0(6) - - - lwz 9,240(6) - - - - xxlor 32+29, 0, 0 - vxor 15, 30, 29 - - cmpdi 9, 10 - beq .Loop_aes_gcm_8x_dec - - - lxv 11, 0xb0(6) - lxv 12, 0xc0(6) - - cmpdi 9, 12 - beq .Loop_aes_gcm_8x_dec + SAVE_REGS + LOAD_HASH_TABLE + # initialize ICB: GHASH( IV ), IV - r7 + lxvb16x 30+32, 0, 7 # load IV - v30 - lxv 13, 0xd0(6) - lxv 14, 0xe0(6) - cmpdi 9, 14 - beq .Loop_aes_gcm_8x_dec - - b aes_gcm_out - -.align 5 -.Loop_aes_gcm_8x_dec: mr 14, 3 mr 9, 4 + # counter 1 + vxor 31, 31, 31 + vspltisb 22, 1 + vsldoi 31, 31, 22,1 # counter 1 + + addis 11, 2, permx@toc@ha + addi 11, 11, permx@toc@l + lxv 10, 0(11) # vs10: vpermxor vector + li 11, 0 + lxv 0, 0(6) # round key 0 + + # + # Process different blocks + # + cmpdi 5, 128 + blt __Process_more_dec + + # load 9 round keys + lxv 32+23, 16(6) # round key 1 + lxv 32+24, 32(6) # round key 2 + lxv 32+25, 48(6) # round key 3 + lxv 32+26, 64(6) # round key 4 + lxv 32+27, 80(6) # round key 5 + lxv 32+28, 96(6) # round key 6 + lxv 32+29, 112(6) # round key 7 + lxv 32+1, 128(6) # round key 8 + + # load rounds - 10 (128), 12 (192), 14 (256) + lwz 23, 240(6) # n rounds + +__Process_decrypt: +# +# Process 8x AES/GCM blocks +# +__Process_8x_dec: + # 8x blocks li 10, 128 - divdu 10, 5, 10 - cmpdi 10, 0 - beq .Loop_last_block_dec - - .long 0x13DEF8C0 - vxor 16, 30, 29 - .long 0x13DEF8C0 - vxor 17, 30, 29 - .long 0x13DEF8C0 - vxor 18, 30, 29 - .long 0x13DEF8C0 - vxor 19, 30, 29 - .long 0x13DEF8C0 - vxor 20, 30, 29 - .long 0x13DEF8C0 - vxor 21, 30, 29 - .long 0x13DEF8C0 - vxor 22, 30, 29 - - mtctr 10 + divdu 12, 5, 10 # n 128 bytes-blocks + + addi 12, 12, -1 # loop - 1 + + vmr 15, 30 # first state: IV + vadduwm 16, 15, 31 # state + counter + vadduwm 17, 16, 31 + vadduwm 18, 17, 31 + vadduwm 19, 18, 31 + vadduwm 20, 19, 31 + vadduwm 21, 20, 31 + vadduwm 22, 21, 31 + xxlor 9, 32+22, 32+22 # save last state + + # vxor 
state, state, w # addroundkey + xxlxor 32+15, 32+15, 0 # IV + round key - add round key 0 + xxlxor 32+16, 32+16, 0 + xxlxor 32+17, 32+17, 0 + xxlxor 32+18, 32+18, 0 + xxlxor 32+19, 32+19, 0 + xxlxor 32+20, 32+20, 0 + xxlxor 32+21, 32+21, 0 + xxlxor 32+22, 32+22, 0 li 15, 16 li 16, 32 @@ -1061,279 +779,215 @@ _ppc_aes_gcm_decrypt: li 20, 96 li 21, 112 - lwz 10, 240(6) + # + # Pre-compute first 8 AES state and leave 1/3/5 more rounds + # for the loop. + # + addi 22, 23, -9 # process 8 keys + mtctr 22 # AES key loop + addi 10, 6, 144 -.Loop_8x_block_dec: + LOOP_8AES_STATE # process 8 AES keys - lxvb16x 15, 0, 14 - lxvb16x 16, 15, 14 - lxvb16x 17, 16, 14 - lxvb16x 18, 17, 14 - lxvb16x 19, 18, 14 - lxvb16x 20, 19, 14 - lxvb16x 21, 20, 14 - lxvb16x 22, 21, 14 +__PreLoop_aes_state_dec: + lxv 32+1, 0(10) # round key + AES_CIPHER_8x 1 + addi 10, 10, 16 + bdnz __PreLoop_aes_state_dec + lxv 32+1, 0(10) # last round key (v1) + + cmpdi 12, 0 # Only one loop (8 block) + beq __Finish_ghash_dec + +# +# Loop 8x blocks and compute ghash +# +__Loop_8x_block_dec: + vcipherlast 15, 15, 1 + vcipherlast 16, 16, 1 + vcipherlast 17, 17, 1 + vcipherlast 18, 18, 1 + vcipherlast 19, 19, 1 + vcipherlast 20, 20, 1 + vcipherlast 21, 21, 1 + vcipherlast 22, 22, 1 + + lxvb16x 32+23, 0, 14 # load block + lxvb16x 32+24, 15, 14 # load block + lxvb16x 32+25, 16, 14 # load block + lxvb16x 32+26, 17, 14 # load block + lxvb16x 32+27, 18, 14 # load block + lxvb16x 32+28, 19, 14 # load block + lxvb16x 32+29, 20, 14 # load block + lxvb16x 32+30, 21, 14 # load block addi 14, 14, 128 -.Loop_aes_middle8x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_last_aes_dec - - - xxlor 24+32, 11, 11 - - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 - - .long 0x11EFC508 - .long 0x1210C508 - .long 0x1231C508 - .long 0x1252C508 - .long 0x1273C508 - .long 0x1294C508 - .long 0x12B5C508 - .long 0x12D6C508 - - xxlor 23+32, 
12, 12 - - cmpdi 10, 12 - beq Do_last_aes_dec - - - xxlor 24+32, 13, 13 - - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 - - .long 0x11EFC508 - .long 0x1210C508 - .long 0x1231C508 - .long 0x1252C508 - .long 0x1273C508 - .long 0x1294C508 - .long 0x12B5C508 - .long 0x12D6C508 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_last_aes_dec - b aes_gcm_out - -Do_last_aes_dec: - - - - .long 0x11EFBD09 - .long 0x1210BD09 - - xxlxor 47, 47, 15 - stxvb16x 47, 0, 9 - xxlxor 48, 48, 16 - stxvb16x 48, 15, 9 - - .long 0x1231BD09 - .long 0x1252BD09 - - xxlxor 49, 49, 17 - stxvb16x 49, 16, 9 - xxlxor 50, 50, 18 - stxvb16x 50, 17, 9 - - .long 0x1273BD09 - .long 0x1294BD09 - - xxlxor 51, 51, 19 - stxvb16x 51, 18, 9 - xxlxor 52, 52, 20 - stxvb16x 52, 19, 9 - - .long 0x12B5BD09 - .long 0x12D6BD09 - - xxlxor 53, 53, 21 - stxvb16x 53, 20, 9 - xxlxor 54, 54, 22 - stxvb16x 54, 21, 9 + vxor 15, 15, 23 + vxor 16, 16, 24 + vxor 17, 17, 25 + vxor 18, 18, 26 + vxor 19, 19, 27 + vxor 20, 20, 28 + vxor 21, 21, 29 + vxor 22, 22, 30 + + stxvb16x 47, 0, 9 # store output + stxvb16x 48, 15, 9 # store output + stxvb16x 49, 16, 9 # store output + stxvb16x 50, 17, 9 # store output + stxvb16x 51, 18, 9 # store output + stxvb16x 52, 19, 9 # store output + stxvb16x 53, 20, 9 # store output + stxvb16x 54, 21, 9 # store output addi 9, 9, 128 - xxlor 15+32, 15, 15 - xxlor 16+32, 16, 16 - xxlor 17+32, 17, 17 - xxlor 18+32, 18, 18 - xxlor 19+32, 19, 19 - xxlor 20+32, 20, 20 - xxlor 21+32, 21, 21 - xxlor 22+32, 22, 22 - - - ppc_aes_gcm_ghash2_4x - - xxlor 27+32, 0, 0 - .long 0x13DEF8C0 - vor 29,30,30 - vxor 15, 30, 27 - .long 0x13DEF8C0 - vxor 16, 30, 27 - .long 0x13DEF8C0 - vxor 17, 30, 27 - .long 0x13DEF8C0 - vxor 18, 30, 27 - .long 0x13DEF8C0 - vxor 19, 30, 27 - .long 0x13DEF8C0 - vxor 20, 30, 27 - .long 0x13DEF8C0 - vxor 21, 30, 27 - .long 0x13DEF8C0 - vxor 22, 30, 27 - addi 12, 12, -128 - addi 11, 11, 
128 - - bdnz .Loop_8x_block_dec - - vor 30,29,29 - -.Loop_last_block_dec: - cmpdi 12, 0 - beq aes_gcm_out - - - li 10, 16 - divdu 10, 12, 10 - - mtctr 10 - - lwz 10,240(6) - - cmpdi 12, 16 - blt Final_block_dec - -Next_rem_block_dec: - lxvb16x 15, 0, 14 - -.Loop_aes_middle_1x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_next_1x_dec - - - xxlor 24+32, 11, 11 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_next_1x_dec - - - xxlor 24+32, 13, 13 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_next_1x_dec - -Do_next_1x_dec: - .long 0x11EFBD09 - - xxlxor 47, 47, 15 - stxvb16x 47, 0, 9 - addi 14, 14, 16 - addi 9, 9, 16 - - xxlor 28+32, 15, 15 - ppc_update_hash_1x + vmr 15, 23 + vmr 16, 24 + vmr 17, 25 + vmr 18, 26 + vmr 19, 27 + vmr 20, 28 + vmr 21, 29 + vmr 22, 30 - addi 12, 12, -16 - addi 11, 11, 16 - xxlor 19+32, 0, 0 - .long 0x13DEF8C0 - vxor 15, 30, 19 + # ghash here + vxor 15, 15, 0 + PPC_GFMUL128_8x + + xxlor 32+15, 9, 9 # last state + vadduwm 15, 15, 31 # state + counter + vadduwm 16, 15, 31 + vadduwm 17, 16, 31 + vadduwm 18, 17, 31 + vadduwm 19, 18, 31 + vadduwm 20, 19, 31 + vadduwm 21, 20, 31 + vadduwm 22, 21, 31 + xxlor 9, 32+22, 32+22 # save last state + + xxlor 32+27, 0, 0 # restore roundkey 0 + vxor 15, 15, 27 # IV + round key - add round key 0 + vxor 16, 16, 27 + vxor 17, 17, 27 + vxor 18, 18, 27 + vxor 19, 19, 27 + vxor 20, 20, 27 + vxor 21, 21, 27 + vxor 22, 22, 27 - bdnz Next_rem_block_dec + addi 5, 5, -128 + addi 11, 11, 128 + + lxv 32+23, 16(6) # round key 1 + lxv 32+24, 32(6) # round key 2 + lxv 32+25, 48(6) # round key 3 + lxv 32+26, 64(6) # round key 4 + lxv 32+27, 80(6) # round key 5 + lxv 32+28, 96(6) # round key 6 + lxv 32+29, 112(6) # round key 7 + lxv 32+1, 128(6) # round key 8 + + LOOP_8AES_STATE # process 8 AES keys + mtctr 22 # AES key loop + addi 10, 6, 144 +__LastLoop_aes_state_dec: + lxv 32+1, 0(10) # round key + AES_CIPHER_8x 1 + addi 10, 10, 16 + 
bdnz __LastLoop_aes_state_dec + lxv 32+1, 0(10) # last round key (v1) + addi 12, 12, -1 cmpdi 12, 0 - beq aes_gcm_out - -Final_block_dec: -.Loop_aes_middle_1x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_final_1x_dec - - - xxlor 24+32, 11, 11 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_final_1x_dec - + bne __Loop_8x_block_dec + +__Finish_ghash_dec: + vcipherlast 15, 15, 1 + vcipherlast 16, 16, 1 + vcipherlast 17, 17, 1 + vcipherlast 18, 18, 1 + vcipherlast 19, 19, 1 + vcipherlast 20, 20, 1 + vcipherlast 21, 21, 1 + vcipherlast 22, 22, 1 + + lxvb16x 32+23, 0, 14 # load block + lxvb16x 32+24, 15, 14 # load block + lxvb16x 32+25, 16, 14 # load block + lxvb16x 32+26, 17, 14 # load block + lxvb16x 32+27, 18, 14 # load block + lxvb16x 32+28, 19, 14 # load block + lxvb16x 32+29, 20, 14 # load block + lxvb16x 32+30, 21, 14 # load block + addi 14, 14, 128 - xxlor 24+32, 13, 13 + vxor 15, 15, 23 + vxor 16, 16, 24 + vxor 17, 17, 25 + vxor 18, 18, 26 + vxor 19, 19, 27 + vxor 20, 20, 28 + vxor 21, 21, 29 + vxor 22, 22, 30 + + stxvb16x 47, 0, 9 # store output + stxvb16x 48, 15, 9 # store output + stxvb16x 49, 16, 9 # store output + stxvb16x 50, 17, 9 # store output + stxvb16x 51, 18, 9 # store output + stxvb16x 52, 19, 9 # store output + stxvb16x 53, 20, 9 # store output + stxvb16x 54, 21, 9 # store output + addi 9, 9, 128 - .long 0x11EFBD08 - .long 0x11EFC508 + vxor 15, 23, 0 + vmr 16, 24 + vmr 17, 25 + vmr 18, 26 + vmr 19, 27 + vmr 20, 28 + vmr 21, 29 + vmr 22, 30 - xxlor 23+32, 14, 14 + #vxor 15, 15, 0 + PPC_GFMUL128_8x - cmpdi 10, 14 - beq Do_final_1x_dec + xxlor 30+32, 9, 9 # last ctr + vadduwm 30, 30, 31 # increase ctr + stxvb16x 32+0, 0, 8 # update Xi -Do_final_1x_dec: - .long 0x11EFBD09 + addi 5, 5, -128 + addi 11, 11, 128 - lxvb16x 15, 0, 14 - xxlxor 47, 47, 15 + # + # Done 8x blocks + # + cmpdi 5, 0 + beq aes_gcm_out - li 15, 16 - sub 15, 15, 12 +__Process_more_dec: + li 24, 0 # decrypt + bl aes_gcm_crypt_1x + cmpdi 5, 
0 + beq aes_gcm_out - vspltisb 16,-1 - vspltisb 17,0 - li 10, 192 - stvx 16, 10, 1 - addi 10, 10, 16 - stvx 17, 10, 1 - - addi 10, 1, 192 - lxvb16x 16, 15, 10 - xxland 47, 47, 16 + bl __Process_partial + b aes_gcm_out +.size ppc_aes_gcm_decrypt,.-ppc_aes_gcm_decrypt - xxlor 28+32, 15, 15 - ppc_update_hash_1x +aes_gcm_out: +.localentry aes_gcm_out,0 + mr 3, 11 # return count - bl Write_partial_block + RESTORE_REGS + blr +.size aes_gcm_out,.-aes_gcm_out - b aes_gcm_out +.rodata +.align 4 +# for vector permute and xor +permx: +.long 0x4c5d6e7f, 0x08192a3b, 0xc4d5e6f7, 0x8091a2b3 diff --git a/sys/crypto/openssl/powerpc64/aes-gcm-ppc.S b/sys/crypto/openssl/powerpc64/aes-gcm-ppc.S index 2ff143c42ab7..51cfac7e45fc 100644 --- a/sys/crypto/openssl/powerpc64/aes-gcm-ppc.S +++ b/sys/crypto/openssl/powerpc64/aes-gcm-ppc.S @@ -1,532 +1,587 @@ /* Do not modify. This file is auto-generated from aes-gcm-ppc.pl. */ -.machine "any" -.abiversion 2 +.machine "any" .text - - - - -.macro .Loop_aes_middle4x - xxlor 19+32, 1, 1 - xxlor 20+32, 2, 2 - xxlor 21+32, 3, 3 - xxlor 22+32, 4, 4 - - .long 0x11EF9D08 - .long 0x12109D08 - .long 0x12319D08 - .long 0x12529D08 - - .long 0x11EFA508 - .long 0x1210A508 - .long 0x1231A508 - .long 0x1252A508 - - .long 0x11EFAD08 - .long 0x1210AD08 - .long 0x1231AD08 - .long 0x1252AD08 - - .long 0x11EFB508 - .long 0x1210B508 - .long 0x1231B508 - .long 0x1252B508 - - xxlor 19+32, 5, 5 - xxlor 20+32, 6, 6 - xxlor 21+32, 7, 7 - xxlor 22+32, 8, 8 - - .long 0x11EF9D08 - .long 0x12109D08 - .long 0x12319D08 - .long 0x12529D08 - - .long 0x11EFA508 - .long 0x1210A508 - .long 0x1231A508 - .long 0x1252A508 - - .long 0x11EFAD08 - .long 0x1210AD08 - .long 0x1231AD08 - .long 0x1252AD08 - - .long 0x11EFB508 - .long 0x1210B508 - .long 0x1231B508 - .long 0x1252B508 - - xxlor 23+32, 9, 9 - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 -.endm - - - - - -.macro .Loop_aes_middle8x - xxlor 23+32, 1, 1 - xxlor 24+32, 2, 2 - xxlor 25+32, 3, 3 - xxlor 
26+32, 4, 4 - - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 - - .long 0x11EFC508 - .long 0x1210C508 - .long 0x1231C508 - .long 0x1252C508 - .long 0x1273C508 - .long 0x1294C508 - .long 0x12B5C508 - .long 0x12D6C508 - - .long 0x11EFCD08 - .long 0x1210CD08 - .long 0x1231CD08 - .long 0x1252CD08 - .long 0x1273CD08 - .long 0x1294CD08 - .long 0x12B5CD08 - .long 0x12D6CD08 - - .long 0x11EFD508 - .long 0x1210D508 - .long 0x1231D508 - .long 0x1252D508 - .long 0x1273D508 - .long 0x1294D508 - .long 0x12B5D508 - .long 0x12D6D508 - - xxlor 23+32, 5, 5 - xxlor 24+32, 6, 6 - xxlor 25+32, 7, 7 - xxlor 26+32, 8, 8 - - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 - - .long 0x11EFC508 - .long 0x1210C508 - .long 0x1231C508 - .long 0x1252C508 - .long 0x1273C508 - .long 0x1294C508 - .long 0x12B5C508 - .long 0x12D6C508 - - .long 0x11EFCD08 - .long 0x1210CD08 - .long 0x1231CD08 - .long 0x1252CD08 - .long 0x1273CD08 - .long 0x1294CD08 - .long 0x12B5CD08 - .long 0x12D6CD08 - - .long 0x11EFD508 - .long 0x1210D508 - .long 0x1231D508 - .long 0x1252D508 - .long 0x1273D508 - .long 0x1294D508 - .long 0x12B5D508 - .long 0x12D6D508 - - xxlor 23+32, 9, 9 - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 -.endm - - - - -ppc_aes_gcm_ghash: - vxor 15, 15, 0 - - xxlxor 29, 29, 29 - - .long 0x12EC7CC8 - .long 0x130984C8 - .long 0x13268CC8 - .long 0x134394C8 - - vxor 23, 23, 24 - vxor 23, 23, 25 - vxor 23, 23, 26 - - .long 0x130D7CC8 - .long 0x132A84C8 - .long 0x13478CC8 - .long 0x136494C8 - - vxor 24, 24, 25 - vxor 24, 24, 26 - vxor 24, 24, 27 - - - .long 0x139714C8 - - xxlor 29+32, 29, 29 - vsldoi 26, 24, 29, 8 - vsldoi 29, 29, 24, 8 - vxor 23, 23, 26 - - vsldoi 23, 23, 23, 8 - vxor 23, 23, 28 - - 
.long 0x130E7CC8 - .long 0x132B84C8 - .long 0x13488CC8 - .long 0x136594C8 - - vxor 24, 24, 25 - vxor 24, 24, 26 - vxor 24, 24, 27 - - vxor 24, 24, 29 - - - vsldoi 27, 23, 23, 8 - .long 0x12F714C8 - vxor 27, 27, 24 - vxor 23, 23, 27 - - xxlor 32, 23+32, 23+32 - - blr - - - - - -.macro ppc_aes_gcm_ghash2_4x - - vxor 15, 15, 0 - - xxlxor 29, 29, 29 - - .long 0x12EC7CC8 - .long 0x130984C8 - .long 0x13268CC8 - .long 0x134394C8 +.macro SAVE_REGS + mflr 0 + std 0, 16(1) + stdu 1,-512(1) + + std 14, 112(1) + std 15, 120(1) + std 16, 128(1) + std 17, 136(1) + std 18, 144(1) + std 19, 152(1) + std 20, 160(1) + std 21, 168(1) + std 22, 176(1) + std 23, 184(1) + std 24, 192(1) + + stxv 32+20, 256(1) + stxv 32+21, 256+16(1) + stxv 32+22, 256+32(1) + stxv 32+23, 256+48(1) + stxv 32+24, 256+64(1) + stxv 32+25, 256+80(1) + stxv 32+26, 256+96(1) + stxv 32+27, 256+112(1) + stxv 32+28, 256+128(1) + stxv 32+29, 256+144(1) + stxv 32+30, 256+160(1) + stxv 32+31, 256+176(1) +.endm # SAVE_REGS + +.macro RESTORE_REGS + lxv 32+20, 256(1) + lxv 32+21, 256+16(1) + lxv 32+22, 256+32(1) + lxv 32+23, 256+48(1) + lxv 32+24, 256+64(1) + lxv 32+25, 256+80(1) + lxv 32+26, 256+96(1) + lxv 32+27, 256+112(1) + lxv 32+28, 256+128(1) + lxv 32+29, 256+144(1) + lxv 32+30, 256+160(1) + lxv 32+31, 256+176(1) + + ld 14, 112(1) + ld 15, 120(1) + ld 16, 128(1) + ld 17, 136(1) + ld 18, 144(1) + ld 19, 152(1) + ld 20, 160(1) + ld 21, 168(1) + ld 22, 176(1) + ld 23, 184(1) + ld 24, 192(1) + + addi 1, 1, 512 + ld 0, 16(1) + mtlr 0 +.endm # RESTORE_REGS + +# 4x loops +.macro AES_CIPHER_4x r + vcipher 15, 15, \r + vcipher 16, 16, \r + vcipher 17, 17, \r + vcipher 18, 18, \r +.endm + +# 8x loops +.macro AES_CIPHER_8x r + vcipher 15, 15, \r + vcipher 16, 16, \r + vcipher 17, 17, \r + vcipher 18, 18, \r + vcipher 19, 19, \r + vcipher 20, 20, \r + vcipher 21, 21, \r + vcipher 22, 22, \r +.endm + +.macro LOOP_8AES_STATE + AES_CIPHER_8x 23 + AES_CIPHER_8x 24 + AES_CIPHER_8x 25 + AES_CIPHER_8x 26 + AES_CIPHER_8x 27 + 
AES_CIPHER_8x 28 + AES_CIPHER_8x 29 + AES_CIPHER_8x 1 +.endm + +# +# PPC_GFMUL128_8x: Compute hash values of 8 blocks based on Karatsuba method. +# +# S1 should xor with the previous digest +# +# Xi = v0 +# H Poly = v2 +# Hash keys = v3 - v14 +# vs10: vpermxor vector +# Scratch: v23 - v29 +# +.macro PPC_GFMUL128_8x + + vpmsumd 23, 12, 15 # H4.L * X.L + vpmsumd 24, 9, 16 + vpmsumd 25, 6, 17 + vpmsumd 26, 3, 18 vxor 23, 23, 24 vxor 23, 23, 25 - vxor 23, 23, 26 - - .long 0x130D7CC8 - .long 0x132A84C8 - .long 0x13478CC8 - .long 0x136494C8 - - vxor 24, 24, 25 - vxor 24, 24, 26 - - - .long 0x139714C8 - - xxlor 29+32, 29, 29 - - vxor 24, 24, 27 - vsldoi 26, 24, 29, 8 - vsldoi 29, 29, 24, 8 - vxor 23, 23, 26 - - vsldoi 23, 23, 23, 8 - vxor 23, 23, 28 + vxor 23, 23, 26 # L - .long 0x130E7CC8 - .long 0x132B84C8 - .long 0x13488CC8 - .long 0x136594C8 + vpmsumd 27, 13, 15 # H4.L * X.H + H4.H * X.L + vpmsumd 28, 10, 16 # H3.L * X1.H + H3.H * X1.L + vpmsumd 25, 7, 17 + vpmsumd 26, 4, 18 + vxor 24, 27, 28 vxor 24, 24, 25 - vxor 24, 24, 26 - vxor 24, 24, 27 - - vxor 24, 24, 29 - - - vsldoi 27, 23, 23, 8 - .long 0x12F714C8 - vxor 27, 27, 24 - vxor 27, 23, 27 - - - .long 0x1309A4C8 - .long 0x1326ACC8 - .long 0x1343B4C8 - vxor 19, 19, 27 - .long 0x12EC9CC8 + vxor 24, 24, 26 # M + + vpmsumd 26, 14, 15 # H4.H * X.H + vpmsumd 27, 11, 16 + vpmsumd 28, 8, 17 + vpmsumd 29, 5, 18 + + vxor 26, 26, 27 + vxor 26, 26, 28 + vxor 26, 26, 29 + + # sum hash and reduction with H Poly + vpmsumd 28, 23, 2 # reduction + + vxor 1, 1, 1 + vsldoi 25, 24, 1, 8 # mL + vsldoi 1, 1, 24, 8 # mH + vxor 23, 23, 25 # mL + L + + # This performs swap and xor like, + # vsldoi 23, 23, 23, 8 # swap + # vxor 23, 23, 28 + xxlor 32+29, 10, 10 + vpermxor 23, 23, 28, 29 + + vxor 24, 26, 1 # H + + # sum hash and reduction with H Poly + # + # vsldoi 25, 23, 23, 8 # swap + # vpmsumd 23, 23, 2 + # vxor 27, 25, 24 + # + vpermxor 27, 23, 24, 29 + vpmsumd 23, 23, 2 + vxor 0, 23, 27 # Digest of 4 blocks + + vxor 19, 19, 0 + + # 
Compute digest for the next 4 blocks + vpmsumd 24, 9, 20 + vpmsumd 25, 6, 21 + vpmsumd 26, 3, 22 + vpmsumd 23, 12, 19 # H4.L * X.L vxor 23, 23, 24 vxor 23, 23, 25 - vxor 23, 23, 26 + vxor 23, 23, 26 # L - .long 0x130D9CC8 - .long 0x132AA4C8 - .long 0x1347ACC8 - .long 0x1364B4C8 + vpmsumd 27, 13, 19 # H4.L * X.H + H4.H * X.L + vpmsumd 28, 10, 20 # H3.L * X1.H + H3.H * X1.L + vpmsumd 25, 7, 21 + vpmsumd 26, 4, 22 + vxor 24, 27, 28 vxor 24, 24, 25 - vxor 24, 24, 26 - - - .long 0x139714C8 - - xxlor 29+32, 29, 29 - - vxor 24, 24, 27 - vsldoi 26, 24, 29, 8 - vsldoi 29, 29, 24, 8 - vxor 23, 23, 26 - - vsldoi 23, 23, 23, 8 - vxor 23, 23, 28 - - .long 0x130E9CC8 - .long 0x132BA4C8 - .long 0x1348ACC8 - .long 0x1365B4C8 - - vxor 24, 24, 25 - vxor 24, 24, 26 - vxor 24, 24, 27 - - vxor 24, 24, 29 - - - vsldoi 27, 23, 23, 8 - .long 0x12F714C8 - vxor 27, 27, 24 - vxor 23, 23, 27 - - xxlor 32, 23+32, 23+32 - -.endm - - - - -.macro ppc_update_hash_1x - vxor 28, 28, 0 - - vxor 19, 19, 19 - - .long 0x12C3E4C8 - .long 0x12E4E4C8 - .long 0x1305E4C8 - - .long 0x137614C8 - - vsldoi 25, 23, 19, 8 - vsldoi 26, 19, 23, 8 - vxor 22, 22, 25 - vxor 24, 24, 26 - - vsldoi 22, 22, 22, 8 - vxor 22, 22, 27 - - vsldoi 20, 22, 22, 8 - .long 0x12D614C8 - vxor 20, 20, 24 - vxor 22, 22, 20 - - vor 0,22,22 - -.endm - - - - - - - - - - - - - -.global ppc_aes_gcm_encrypt -.align 5 -ppc_aes_gcm_encrypt: -_ppc_aes_gcm_encrypt: - - stdu 1,-512(1) - mflr 0 - - std 14,112(1) - std 15,120(1) - std 16,128(1) - std 17,136(1) - std 18,144(1) - std 19,152(1) - std 20,160(1) - std 21,168(1) - li 9, 256 - stvx 20, 9, 1 - addi 9, 9, 16 - stvx 21, 9, 1 - addi 9, 9, 16 - stvx 22, 9, 1 - addi 9, 9, 16 - stvx 23, 9, 1 - addi 9, 9, 16 - stvx 24, 9, 1 - addi 9, 9, 16 - stvx 25, 9, 1 - addi 9, 9, 16 - stvx 26, 9, 1 - addi 9, 9, 16 - stvx 27, 9, 1 - addi 9, 9, 16 - stvx 28, 9, 1 - addi 9, 9, 16 - stvx 29, 9, 1 - addi 9, 9, 16 - stvx 30, 9, 1 - addi 9, 9, 16 - stvx 31, 9, 1 - std 0, 528(1) - - - lxvb16x 32, 0, 8 - + vxor 24, 
24, 26 # M + + vpmsumd 26, 14, 19 # H4.H * X.H + vpmsumd 27, 11, 20 + vpmsumd 28, 8, 21 + vpmsumd 29, 5, 22 + + vxor 26, 26, 27 + vxor 26, 26, 28 + vxor 26, 26, 29 + + # sum hash and reduction with H Poly + vpmsumd 28, 23, 2 # reduction + + vxor 1, 1, 1 + vsldoi 25, 24, 1, 8 # mL + vsldoi 1, 1, 24, 8 # mH + vxor 23, 23, 25 # mL + L + + # This performs swap and xor like, + # vsldoi 23, 23, 23, 8 # swap + # vxor 23, 23, 28 + xxlor 32+29, 10, 10 + vpermxor 23, 23, 28, 29 + + vxor 24, 26, 1 # H + + # sum hash and reduction with H Poly + # + # vsldoi 25, 23, 23, 8 # swap + # vpmsumd 23, 23, 2 + # vxor 27, 25, 24 + # + vpermxor 27, 23, 24, 29 + vpmsumd 23, 23, 2 + vxor 0, 23, 27 # Digest of 8 blocks +.endm + +# +# Compute update single ghash +# vs10: vpermxor vector +# scratch: v1, v22..v27 +# +.macro PPC_GHASH1x H S1 + + vxor 1, 1, 1 + + vpmsumd 22, 3, \S1 # L + vpmsumd 23, 4, \S1 # M + vpmsumd 24, 5, \S1 # H + + vpmsumd 27, 22, 2 # reduction + + vsldoi 25, 23, 1, 8 # mL + vsldoi 26, 1, 23, 8 # mH + vxor 22, 22, 25 # LL + LL + vxor 24, 24, 26 # HH + HH + + xxlor 32+25, 10, 10 + vpermxor 22, 22, 27, 25 + + # vsldoi 23, 22, 22, 8 # swap + # vpmsumd 22, 22, 2 # reduction + # vxor 23, 23, 24 + vpermxor 23, 22, 24, 25 + vpmsumd 22, 22, 2 # reduction + + vxor \H, 22, 23 +.endm + +# +# LOAD_HASH_TABLE +# Xi = v0 +# H Poly = v2 +# Hash keys = v3 - v14 +# +.macro LOAD_HASH_TABLE + # Load Xi + lxvb16x 32, 0, 8 # load Xi + + vxor 1, 1, 1 li 10, 32 - lxvd2x 2+32, 10, 8 - li 10, 48 - lxvd2x 3+32, 10, 8 - li 10, 64 - lxvd2x 4+32, 10, 8 - li 10, 80 - lxvd2x 5+32, 10, 8 + lxvd2x 2+32, 10, 8 # H Poli - li 10, 96 - lxvd2x 6+32, 10, 8 + # load Hash - h^4, h^3, h^2, h + li 10, 64 + lxvd2x 4+32, 10, 8 # H + vsldoi 3, 1, 4, 8 # l + vsldoi 5, 4, 1, 8 # h li 10, 112 - lxvd2x 7+32, 10, 8 - li 10, 128 - lxvd2x 8+32, 10, 8 - - li 10, 144 - lxvd2x 9+32, 10, 8 + lxvd2x 7+32, 10, 8 # H^2 + vsldoi 6, 1, 7, 8 # l + vsldoi 8, 7, 1, 8 # h li 10, 160 - lxvd2x 10+32, 10, 8 - li 10, 176 - lxvd2x 11+32, 10, 
8 - - li 10, 192 - lxvd2x 12+32, 10, 8 + lxvd2x 10+32, 10, 8 # H^3 + vsldoi 9, 1, 10, 8 # l + vsldoi 11, 10, 1, 8 # h li 10, 208 - lxvd2x 13+32, 10, 8 - li 10, 224 - lxvd2x 14+32, 10, 8 - - - lxvb16x 30+32, 0, 7 - - mr 12, 5 - li 11, 0 - - - vxor 31, 31, 31 - vspltisb 22,1 - vsldoi 31, 31, 22,1 - - - lxv 0, 0(6) - lxv 1, 0x10(6) - lxv 2, 0x20(6) - lxv 3, 0x30(6) - lxv 4, 0x40(6) - lxv 5, 0x50(6) - lxv 6, 0x60(6) - lxv 7, 0x70(6) - lxv 8, 0x80(6) - lxv 9, 0x90(6) - lxv 10, 0xa0(6) - - - lwz 9,240(6) + lxvd2x 13+32, 10, 8 # H^4 + vsldoi 12, 1, 13, 8 # l + vsldoi 14, 13, 1, 8 # h +.endm + +.macro PROCESS_8X_AES_STATES + vcipherlast 15, 15, 1 + vcipherlast 16, 16, 1 + vcipherlast 17, 17, 1 + vcipherlast 18, 18, 1 + vcipherlast 19, 19, 1 + vcipherlast 20, 20, 1 + vcipherlast 21, 21, 1 + vcipherlast 22, 22, 1 + + lxvb16x 32+23, 0, 14 # load block + lxvb16x 32+24, 15, 14 # load block + lxvb16x 32+25, 16, 14 # load block + lxvb16x 32+26, 17, 14 # load block + lxvb16x 32+27, 18, 14 # load block + lxvb16x 32+28, 19, 14 # load block + lxvb16x 32+29, 20, 14 # load block + lxvb16x 32+30, 21, 14 # load block + addi 14, 14, 128 + vxor 15, 15, 23 + vxor 16, 16, 24 + vxor 17, 17, 25 + vxor 18, 18, 26 + vxor 19, 19, 27 + vxor 20, 20, 28 + vxor 21, 21, 29 + vxor 22, 22, 30 + + stxvb16x 47, 0, 9 # store output + stxvb16x 48, 15, 9 # store output + stxvb16x 49, 16, 9 # store output + stxvb16x 50, 17, 9 # store output + stxvb16x 51, 18, 9 # store output + stxvb16x 52, 19, 9 # store output + stxvb16x 53, 20, 9 # store output + stxvb16x 54, 21, 9 # store output + addi 9, 9, 128 +.endm + +.macro COMPUTE_STATES + xxlor 32+15, 9, 9 # last state + vadduwm 15, 15, 31 # state + counter + vadduwm 16, 15, 31 + vadduwm 17, 16, 31 + vadduwm 18, 17, 31 + vadduwm 19, 18, 31 + vadduwm 20, 19, 31 + vadduwm 21, 20, 31 + vadduwm 22, 21, 31 + xxlor 9, 32+22, 32+22 # save last state + + xxlxor 32+15, 32+15, 0 # IV + round key - add round key 0 + xxlxor 32+16, 32+16, 0 + xxlxor 32+17, 32+17, 0 + xxlxor 
32+18, 32+18, 0 + xxlxor 32+19, 32+19, 0 + xxlxor 32+20, 32+20, 0 + xxlxor 32+21, 32+21, 0 + xxlxor 32+22, 32+22, 0 +.endm + +################################################################################ +# Compute AES and ghash one block at a time. +# r23: AES rounds +# v30: current IV +# vs0: roundkey 0 +# +################################################################################ +.align 4 +aes_gcm_crypt_1x: +.localentry aes_gcm_crypt_1x,0 + + cmpdi 5, 16 + bge __More_1x + blr +__More_1x: + li 10, 16 + divdu 12, 5, 10 + + xxlxor 32+15, 32+30, 0 + + # Pre-load 8 AES rounds to scratch vectors. + lxv 32+16, 16(6) # round key 1 + lxv 32+17, 32(6) # round key 2 + lxv 32+18, 48(6) # round key 3 + lxv 32+19, 64(6) # round key 4 + lxv 32+20, 80(6) # round key 5 + lxv 32+21, 96(6) # round key 6 + lxv 32+28, 112(6) # round key 7 + lxv 32+29, 128(6) # round key 8 + + lwz 23, 240(6) # n rounds + addi 22, 23, -9 # remaining AES rounds + cmpdi 12, 0 + bgt __Loop_1x + blr + +__Loop_1x: + mtctr 22 + addi 10, 6, 144 + vcipher 15, 15, 16 + vcipher 15, 15, 17 + vcipher 15, 15, 18 + vcipher 15, 15, 19 + vcipher 15, 15, 20 + vcipher 15, 15, 21 + vcipher 15, 15, 28 + vcipher 15, 15, 29 + +__Loop_aes_1state: + lxv 32+1, 0(10) + vcipher 15, 15, 1 + addi 10, 10, 16 + bdnz __Loop_aes_1state + lxv 32+1, 0(10) # last round key + lxvb16x 11, 0, 14 # load input block + vcipherlast 15, 15, 1 - xxlor 32+29, 0, 0 - vxor 15, 30, 29 + xxlxor 32+15, 32+15, 11 + stxvb16x 32+15, 0, 9 # store output + addi 14, 14, 16 + addi 9, 9, 16 - cmpdi 9, 10 - beq .Loop_aes_gcm_8x + cmpdi 24, 0 # decrypt? 
+ bne __Encrypt_1x + xxlor 15+32, 11, 11 +__Encrypt_1x: + vxor 15, 15, 0 + PPC_GHASH1x 0, 15 + addi 5, 5, -16 + addi 11, 11, 16 - lxv 11, 0xb0(6) - lxv 12, 0xc0(6) + vadduwm 30, 30, 31 # IV + counter + xxlxor 32+15, 32+30, 0 + addi 12, 12, -1 + cmpdi 12, 0 + bgt __Loop_1x + + stxvb16x 32+0, 0, 8 # update Xi + blr +.size aes_gcm_crypt_1x,.-aes_gcm_crypt_1x + +################################################################################ +# Process a normal partial block when we come here. +# Compute partial mask, Load and store partial block to stack. +# Compute AES state. +# Compute ghash. +# +################################################################################ +.align 4 +__Process_partial: +.localentry __Process_partial,0 + + # create partial mask + vspltisb 16, -1 + li 12, 16 + sub 12, 12, 5 + sldi 12, 12, 3 + mtvsrdd 32+17, 0, 12 + vslo 16, 16, 17 # partial block mask + + lxvb16x 11, 0, 14 # load partial block + xxland 11, 11, 32+16 + + # AES crypt partial + xxlxor 32+15, 32+30, 0 + lwz 23, 240(6) # n rounds + addi 22, 23, -1 # loop - 1 + mtctr 22 + addi 10, 6, 16 + +__Loop_aes_pstate: + lxv 32+1, 0(10) + vcipher 15, 15, 1 + addi 10, 10, 16 + bdnz __Loop_aes_pstate + lxv 32+1, 0(10) # last round key + vcipherlast 15, 15, 1 - cmpdi 9, 12 - beq .Loop_aes_gcm_8x + xxlxor 32+15, 32+15, 11 + vand 15, 15, 16 + # AES crypt output v15 + # Write partial + li 10, 224 + stxvb16x 15+32, 10, 1 # write v15 to stack + addi 10, 1, 223 + addi 12, 9, -1 + mtctr 5 # partial block len +__Write_partial: + lbzu 22, 1(10) + stbu 22, 1(12) + bdnz __Write_partial + + cmpdi 24, 0 # decrypt? 
+ bne __Encrypt_partial + xxlor 32+15, 11, 11 # decrypt using the input block +__Encrypt_partial: + vxor 15, 15, 0 # ^ previous hash + PPC_GHASH1x 0, 15 + li 5, 0 # done last byte + stxvb16x 32+0, 0, 8 # Update X1 + blr +.size __Process_partial,.-__Process_partial + +################################################################################ +# ppc_aes_gcm_encrypt (const void *inp, void *out, size_t len, +# const char *rk, unsigned char iv[16], void *Xip); +# +# r3 - inp +# r4 - out +# r5 - len +# r6 - AES round keys +# r7 - iv +# r8 - Xi, HPoli, hash keys +# +# rounds is at offset 240 in rk +# Xi is at 0 in gcm_table (Xip). +# +################################################################################ +.global ppc_aes_gcm_encrypt +.align 5 +ppc_aes_gcm_encrypt: +.localentry ppc_aes_gcm_encrypt,0 - lxv 13, 0xd0(6) - lxv 14, 0xe0(6) - cmpdi 9, 14 - beq .Loop_aes_gcm_8x + SAVE_REGS + LOAD_HASH_TABLE - b aes_gcm_out + # initialize ICB: GHASH( IV ), IV - r7 + lxvb16x 30+32, 0, 7 # load IV - v30 -.align 5 -.Loop_aes_gcm_8x: mr 14, 3 mr 9, 4 + # counter 1 + vxor 31, 31, 31 + vspltisb 22, 1 + vsldoi 31, 31, 22,1 # counter 1 + + addis 11, 2, permx@toc@ha + addi 11, 11, permx@toc@l + lxv 10, 0(11) # vs10: vpermxor vector + li 11, 0 + lxv 0, 0(6) # round key 0 + + # + # Process different blocks + # + cmpdi 5, 128 + blt __Process_more_enc + + # load 9 round keys + lxv 32+23, 16(6) # round key 1 + lxv 32+24, 32(6) # round key 2 + lxv 32+25, 48(6) # round key 3 + lxv 32+26, 64(6) # round key 4 + lxv 32+27, 80(6) # round key 5 + lxv 32+28, 96(6) # round key 6 + lxv 32+29, 112(6) # round key 7 + lxv 32+1, 128(6) # round key 8 + + # load rounds - 10 (128), 12 (192), 14 (256) + lwz 23, 240(6) # n rounds + +__Process_encrypt: +# +# Process 8x AES/GCM blocks +# +__Process_8x_enc: + # 8x blocks li 10, 128 - divdu 10, 5, 10 - cmpdi 10, 0 - beq .Loop_last_block - - .long 0x13DEF8C0 - vxor 16, 30, 29 - .long 0x13DEF8C0 - vxor 17, 30, 29 - .long 0x13DEF8C0 - vxor 18, 30, 29 - 
.long 0x13DEF8C0 - vxor 19, 30, 29 - .long 0x13DEF8C0 - vxor 20, 30, 29 - .long 0x13DEF8C0 - vxor 21, 30, 29 - .long 0x13DEF8C0 - vxor 22, 30, 29 - - mtctr 10 + divdu 12, 5, 10 # n 128 bytes-blocks + + addi 12, 12, -1 # loop - 1 + + vmr 15, 30 # first state: IV + vadduwm 16, 15, 31 # state + counter + vadduwm 17, 16, 31 + vadduwm 18, 17, 31 + vadduwm 19, 18, 31 + vadduwm 20, 19, 31 + vadduwm 21, 20, 31 + vadduwm 22, 21, 31 + xxlor 9, 32+22, 32+22 # save last state + + # vxor state, state, w # addroundkey + xxlxor 32+15, 32+15, 0 # IV + round key - add round key 0 + xxlxor 32+16, 32+16, 0 + xxlxor 32+17, 32+17, 0 + xxlxor 32+18, 32+18, 0 + xxlxor 32+19, 32+19, 0 + xxlxor 32+20, 32+20, 0 + xxlxor 32+21, 32+21, 0 + xxlxor 32+22, 32+22, 0 li 15, 16 li 16, 32 @@ -536,523 +591,185 @@ _ppc_aes_gcm_encrypt: li 20, 96 li 21, 112 - lwz 10, 240(6) - -.Loop_8x_block: - - lxvb16x 15, 0, 14 - lxvb16x 16, 15, 14 - lxvb16x 17, 16, 14 - lxvb16x 18, 17, 14 - lxvb16x 19, 18, 14 - lxvb16x 20, 19, 14 - lxvb16x 21, 20, 14 - lxvb16x 22, 21, 14 - addi 14, 14, 128 - -.Loop_aes_middle8x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_next_ghash - - - xxlor 24+32, 11, 11 - - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 - - .long 0x11EFC508 - .long 0x1210C508 - .long 0x1231C508 - .long 0x1252C508 - .long 0x1273C508 - .long 0x1294C508 - .long 0x12B5C508 - .long 0x12D6C508 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_next_ghash - - - xxlor 24+32, 13, 13 - - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 - - .long 0x11EFC508 - .long 0x1210C508 - .long 0x1231C508 - .long 0x1252C508 - .long 0x1273C508 - .long 0x1294C508 - .long 0x12B5C508 - .long 0x12D6C508 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_next_ghash - b aes_gcm_out - -Do_next_ghash: - - - - .long 0x11EFBD09 - .long 
0x1210BD09 - - xxlxor 47, 47, 15 - stxvb16x 47, 0, 9 - xxlxor 48, 48, 16 - stxvb16x 48, 15, 9 - - .long 0x1231BD09 - .long 0x1252BD09 - - xxlxor 49, 49, 17 - stxvb16x 49, 16, 9 - xxlxor 50, 50, 18 - stxvb16x 50, 17, 9 - - .long 0x1273BD09 - .long 0x1294BD09 - - xxlxor 51, 51, 19 - stxvb16x 51, 18, 9 - xxlxor 52, 52, 20 - stxvb16x 52, 19, 9 - - .long 0x12B5BD09 - .long 0x12D6BD09 - - xxlxor 53, 53, 21 - stxvb16x 53, 20, 9 - xxlxor 54, 54, 22 - stxvb16x 54, 21, 9 - - addi 9, 9, 128 - - - ppc_aes_gcm_ghash2_4x - - xxlor 27+32, 0, 0 - .long 0x13DEF8C0 - vor 29,30,30 - vxor 15, 30, 27 - .long 0x13DEF8C0 - vxor 16, 30, 27 - .long 0x13DEF8C0 - vxor 17, 30, 27 - .long 0x13DEF8C0 - vxor 18, 30, 27 - .long 0x13DEF8C0 - vxor 19, 30, 27 - .long 0x13DEF8C0 - vxor 20, 30, 27 - .long 0x13DEF8C0 - vxor 21, 30, 27 - .long 0x13DEF8C0 - vxor 22, 30, 27 - - addi 12, 12, -128 - addi 11, 11, 128 - - bdnz .Loop_8x_block - - vor 30,29,29 - -.Loop_last_block: - cmpdi 12, 0 - beq aes_gcm_out - - - li 10, 16 - divdu 10, 12, 10 - - mtctr 10 - - lwz 10, 240(6) - - cmpdi 12, 16 - blt Final_block - -.macro .Loop_aes_middle_1x - xxlor 19+32, 1, 1 - xxlor 20+32, 2, 2 - xxlor 21+32, 3, 3 - xxlor 22+32, 4, 4 - - .long 0x11EF9D08 - .long 0x11EFA508 - .long 0x11EFAD08 - .long 0x11EFB508 - - xxlor 19+32, 5, 5 - xxlor 20+32, 6, 6 - xxlor 21+32, 7, 7 - xxlor 22+32, 8, 8 - - .long 0x11EF9D08 - .long 0x11EFA508 - .long 0x11EFAD08 - .long 0x11EFB508 - - xxlor 19+32, 9, 9 - .long 0x11EF9D08 -.endm - -Next_rem_block: - lxvb16x 15, 0, 14 - -.Loop_aes_middle_1x - - xxlor 23+32, 10, 10 + # + # Pre-compute first 8 AES state and leave 1/3/5 more rounds + # for the loop. 
+ # + addi 22, 23, -9 # process 8 keys + mtctr 22 # AES key loop + addi 10, 6, 144 - cmpdi 10, 10 - beq Do_next_1x + LOOP_8AES_STATE # process 8 AES keys +__PreLoop_aes_state: + lxv 32+1, 0(10) # round key + AES_CIPHER_8x 1 + addi 10, 10, 16 + bdnz __PreLoop_aes_state + lxv 32+1, 0(10) # last round key (v1) - xxlor 24+32, 11, 11 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_next_1x - - - xxlor 24+32, 13, 13 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_next_1x - -Do_next_1x: - .long 0x11EFBD09 - - xxlxor 47, 47, 15 - stxvb16x 47, 0, 9 - addi 14, 14, 16 - addi 9, 9, 16 + cmpdi 12, 0 # Only one loop (8 block) + beq __Finish_ghash - vor 28,15,15 - ppc_update_hash_1x +# +# Loop 8x blocks and compute ghash +# +__Loop_8x_block_enc: + PROCESS_8X_AES_STATES - addi 12, 12, -16 - addi 11, 11, 16 - xxlor 19+32, 0, 0 - .long 0x13DEF8C0 - vxor 15, 30, 19 + # Compute ghash here + vxor 15, 15, 0 + PPC_GFMUL128_8x + + COMPUTE_STATES + + addi 5, 5, -128 + addi 11, 11, 128 + + lxv 32+23, 16(6) # round key 1 + lxv 32+24, 32(6) # round key 2 + lxv 32+25, 48(6) # round key 3 + lxv 32+26, 64(6) # round key 4 + lxv 32+27, 80(6) # round key 5 + lxv 32+28, 96(6) # round key 6 + lxv 32+29, 112(6) # round key 7 + lxv 32+1, 128(6) # round key 8 + + # Compute first 8 AES state and leave 1/3/5 more rounds + # for the loop. 
+ LOOP_8AES_STATE # process 8 AES keys + mtctr 22 # AES key loop + addi 10, 6, 144 + +__LastLoop_aes_state: + lxv 32+1, 0(10) # round key + AES_CIPHER_8x 1 + addi 10, 10, 16 + bdnz __LastLoop_aes_state - bdnz Next_rem_block + lxv 32+1, 0(10) # last round key (v1) + addi 12, 12, -1 cmpdi 12, 0 - beq aes_gcm_out - -Final_block: -.Loop_aes_middle_1x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_final_1x - - - xxlor 24+32, 11, 11 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_final_1x + bne __Loop_8x_block_enc + # + # Remainng blocks + # +__Finish_ghash: + PROCESS_8X_AES_STATES - xxlor 24+32, 13, 13 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_final_1x - -Do_final_1x: - .long 0x11EFBD09 - - lxvb16x 15, 0, 14 - xxlxor 47, 47, 15 - - - li 15, 16 - sub 15, 15, 12 + # Compute ghash here + vxor 15, 15, 0 + PPC_GFMUL128_8x - vspltisb 16,-1 - vspltisb 17,0 - li 10, 192 - stvx 16, 10, 1 - addi 10, 10, 16 - stvx 17, 10, 1 + # Update IV and Xi + xxlor 30+32, 9, 9 # last ctr + vadduwm 30, 30, 31 # increase ctr + stxvb16x 32+0, 0, 8 # update Xi - addi 10, 1, 192 - lxvb16x 16, 15, 10 - xxland 47, 47, 16 + addi 5, 5, -128 + addi 11, 11, 128 - vor 28,15,15 - ppc_update_hash_1x + # + # Done 8x blocks + # + cmpdi 5, 0 + beq aes_gcm_out - bl Write_partial_block +__Process_more_enc: + li 24, 1 # encrypt + bl aes_gcm_crypt_1x + cmpdi 5, 0 + beq aes_gcm_out + bl __Process_partial b aes_gcm_out +.size ppc_aes_gcm_encrypt,.-ppc_aes_gcm_encrypt - - - - - -Write_partial_block: - li 10, 192 - stxvb16x 15+32, 10, 1 - - - addi 10, 9, -1 - addi 16, 1, 191 - - mtctr 12 - li 15, 0 - -Write_last_byte: - lbzu 14, 1(16) - stbu 14, 1(10) - bdnz Write_last_byte - blr - -aes_gcm_out: - - stxvb16x 32, 0, 8 - add 3, 11, 12 - - li 9, 256 - lvx 20, 9, 1 - addi 9, 9, 16 - lvx 21, 9, 1 - addi 9, 9, 16 - lvx 22, 9, 1 - addi 9, 9, 16 - lvx 23, 9, 1 - addi 9, 9, 16 - lvx 24, 9, 1 - addi 9, 9, 16 - lvx 25, 9, 1 - addi 9, 9, 
16 - lvx 26, 9, 1 - addi 9, 9, 16 - lvx 27, 9, 1 - addi 9, 9, 16 - lvx 28, 9, 1 - addi 9, 9, 16 - lvx 29, 9, 1 - addi 9, 9, 16 - lvx 30, 9, 1 - addi 9, 9, 16 - lvx 31, 9, 1 - - ld 0, 528(1) - ld 14,112(1) - ld 15,120(1) - ld 16,128(1) - ld 17,136(1) - ld 18,144(1) - ld 19,152(1) - ld 20,160(1) - ld 21,168(1) - - mtlr 0 - addi 1, 1, 512 - blr - - - - -.global ppc_aes_gcm_decrypt -.align 5 +################################################################################ +# ppc_aes_gcm_decrypt (const void *inp, void *out, size_t len, +# const char *rk, unsigned char iv[16], void *Xip); +# 8x Decrypt +# +################################################################################ +.global ppc_aes_gcm_decrypt +.align 5 ppc_aes_gcm_decrypt: -_ppc_aes_gcm_decrypt: - - stdu 1,-512(1) - mflr 0 - - std 14,112(1) - std 15,120(1) - std 16,128(1) - std 17,136(1) - std 18,144(1) - std 19,152(1) - std 20,160(1) - std 21,168(1) - li 9, 256 - stvx 20, 9, 1 - addi 9, 9, 16 - stvx 21, 9, 1 - addi 9, 9, 16 - stvx 22, 9, 1 - addi 9, 9, 16 - stvx 23, 9, 1 - addi 9, 9, 16 - stvx 24, 9, 1 - addi 9, 9, 16 - stvx 25, 9, 1 - addi 9, 9, 16 - stvx 26, 9, 1 - addi 9, 9, 16 - stvx 27, 9, 1 - addi 9, 9, 16 - stvx 28, 9, 1 - addi 9, 9, 16 - stvx 29, 9, 1 - addi 9, 9, 16 - stvx 30, 9, 1 - addi 9, 9, 16 - stvx 31, 9, 1 - std 0, 528(1) - - - lxvb16x 32, 0, 8 - - - li 10, 32 - lxvd2x 2+32, 10, 8 - li 10, 48 - lxvd2x 3+32, 10, 8 - li 10, 64 - lxvd2x 4+32, 10, 8 - li 10, 80 - lxvd2x 5+32, 10, 8 - - li 10, 96 - lxvd2x 6+32, 10, 8 - li 10, 112 - lxvd2x 7+32, 10, 8 - li 10, 128 - lxvd2x 8+32, 10, 8 - - li 10, 144 - lxvd2x 9+32, 10, 8 - li 10, 160 - lxvd2x 10+32, 10, 8 - li 10, 176 - lxvd2x 11+32, 10, 8 - - li 10, 192 - lxvd2x 12+32, 10, 8 - li 10, 208 - lxvd2x 13+32, 10, 8 - li 10, 224 - lxvd2x 14+32, 10, 8 - - - lxvb16x 30+32, 0, 7 - - mr 12, 5 - li 11, 0 - +.localentry ppc_aes_gcm_decrypt, 0 - vxor 31, 31, 31 - vspltisb 22,1 - vsldoi 31, 31, 22,1 - - - lxv 0, 0(6) - lxv 1, 0x10(6) - lxv 2, 0x20(6) - 
lxv 3, 0x30(6) - lxv 4, 0x40(6) - lxv 5, 0x50(6) - lxv 6, 0x60(6) - lxv 7, 0x70(6) - lxv 8, 0x80(6) - lxv 9, 0x90(6) - lxv 10, 0xa0(6) - - - lwz 9,240(6) - - - - xxlor 32+29, 0, 0 - vxor 15, 30, 29 - - cmpdi 9, 10 - beq .Loop_aes_gcm_8x_dec - - - lxv 11, 0xb0(6) - lxv 12, 0xc0(6) - - cmpdi 9, 12 - beq .Loop_aes_gcm_8x_dec + SAVE_REGS + LOAD_HASH_TABLE + # initialize ICB: GHASH( IV ), IV - r7 + lxvb16x 30+32, 0, 7 # load IV - v30 - lxv 13, 0xd0(6) - lxv 14, 0xe0(6) - cmpdi 9, 14 - beq .Loop_aes_gcm_8x_dec - - b aes_gcm_out - -.align 5 -.Loop_aes_gcm_8x_dec: mr 14, 3 mr 9, 4 + # counter 1 + vxor 31, 31, 31 + vspltisb 22, 1 + vsldoi 31, 31, 22,1 # counter 1 + + addis 11, 2, permx@toc@ha + addi 11, 11, permx@toc@l + lxv 10, 0(11) # vs10: vpermxor vector + li 11, 0 + lxv 0, 0(6) # round key 0 + + # + # Process different blocks + # + cmpdi 5, 128 + blt __Process_more_dec + + # load 9 round keys + lxv 32+23, 16(6) # round key 1 + lxv 32+24, 32(6) # round key 2 + lxv 32+25, 48(6) # round key 3 + lxv 32+26, 64(6) # round key 4 + lxv 32+27, 80(6) # round key 5 + lxv 32+28, 96(6) # round key 6 + lxv 32+29, 112(6) # round key 7 + lxv 32+1, 128(6) # round key 8 + + # load rounds - 10 (128), 12 (192), 14 (256) + lwz 23, 240(6) # n rounds + +__Process_decrypt: +# +# Process 8x AES/GCM blocks +# +__Process_8x_dec: + # 8x blocks li 10, 128 - divdu 10, 5, 10 - cmpdi 10, 0 - beq .Loop_last_block_dec - - .long 0x13DEF8C0 - vxor 16, 30, 29 - .long 0x13DEF8C0 - vxor 17, 30, 29 - .long 0x13DEF8C0 - vxor 18, 30, 29 - .long 0x13DEF8C0 - vxor 19, 30, 29 - .long 0x13DEF8C0 - vxor 20, 30, 29 - .long 0x13DEF8C0 - vxor 21, 30, 29 - .long 0x13DEF8C0 - vxor 22, 30, 29 - - mtctr 10 + divdu 12, 5, 10 # n 128 bytes-blocks + + addi 12, 12, -1 # loop - 1 + + vmr 15, 30 # first state: IV + vadduwm 16, 15, 31 # state + counter + vadduwm 17, 16, 31 + vadduwm 18, 17, 31 + vadduwm 19, 18, 31 + vadduwm 20, 19, 31 + vadduwm 21, 20, 31 + vadduwm 22, 21, 31 + xxlor 9, 32+22, 32+22 # save last state + + # vxor 
state, state, w # addroundkey + xxlxor 32+15, 32+15, 0 # IV + round key - add round key 0 + xxlxor 32+16, 32+16, 0 + xxlxor 32+17, 32+17, 0 + xxlxor 32+18, 32+18, 0 + xxlxor 32+19, 32+19, 0 + xxlxor 32+20, 32+20, 0 + xxlxor 32+21, 32+21, 0 + xxlxor 32+22, 32+22, 0 li 15, 16 li 16, 32 @@ -1062,279 +779,215 @@ _ppc_aes_gcm_decrypt: li 20, 96 li 21, 112 - lwz 10, 240(6) + # + # Pre-compute first 8 AES state and leave 1/3/5 more rounds + # for the loop. + # + addi 22, 23, -9 # process 8 keys + mtctr 22 # AES key loop + addi 10, 6, 144 -.Loop_8x_block_dec: + LOOP_8AES_STATE # process 8 AES keys - lxvb16x 15, 0, 14 - lxvb16x 16, 15, 14 - lxvb16x 17, 16, 14 - lxvb16x 18, 17, 14 - lxvb16x 19, 18, 14 - lxvb16x 20, 19, 14 - lxvb16x 21, 20, 14 - lxvb16x 22, 21, 14 +__PreLoop_aes_state_dec: + lxv 32+1, 0(10) # round key + AES_CIPHER_8x 1 + addi 10, 10, 16 + bdnz __PreLoop_aes_state_dec + lxv 32+1, 0(10) # last round key (v1) + + cmpdi 12, 0 # Only one loop (8 block) + beq __Finish_ghash_dec + +# +# Loop 8x blocks and compute ghash +# +__Loop_8x_block_dec: + vcipherlast 15, 15, 1 + vcipherlast 16, 16, 1 + vcipherlast 17, 17, 1 + vcipherlast 18, 18, 1 + vcipherlast 19, 19, 1 + vcipherlast 20, 20, 1 + vcipherlast 21, 21, 1 + vcipherlast 22, 22, 1 + + lxvb16x 32+23, 0, 14 # load block + lxvb16x 32+24, 15, 14 # load block + lxvb16x 32+25, 16, 14 # load block + lxvb16x 32+26, 17, 14 # load block + lxvb16x 32+27, 18, 14 # load block + lxvb16x 32+28, 19, 14 # load block + lxvb16x 32+29, 20, 14 # load block + lxvb16x 32+30, 21, 14 # load block addi 14, 14, 128 -.Loop_aes_middle8x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_last_aes_dec - - - xxlor 24+32, 11, 11 - - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 - - .long 0x11EFC508 - .long 0x1210C508 - .long 0x1231C508 - .long 0x1252C508 - .long 0x1273C508 - .long 0x1294C508 - .long 0x12B5C508 - .long 0x12D6C508 - - xxlor 23+32, 
12, 12 - - cmpdi 10, 12 - beq Do_last_aes_dec - - - xxlor 24+32, 13, 13 - - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 - - .long 0x11EFC508 - .long 0x1210C508 - .long 0x1231C508 - .long 0x1252C508 - .long 0x1273C508 - .long 0x1294C508 - .long 0x12B5C508 - .long 0x12D6C508 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_last_aes_dec - b aes_gcm_out - -Do_last_aes_dec: - - - - .long 0x11EFBD09 - .long 0x1210BD09 - - xxlxor 47, 47, 15 - stxvb16x 47, 0, 9 - xxlxor 48, 48, 16 - stxvb16x 48, 15, 9 - - .long 0x1231BD09 - .long 0x1252BD09 - - xxlxor 49, 49, 17 - stxvb16x 49, 16, 9 - xxlxor 50, 50, 18 - stxvb16x 50, 17, 9 - - .long 0x1273BD09 - .long 0x1294BD09 - - xxlxor 51, 51, 19 - stxvb16x 51, 18, 9 - xxlxor 52, 52, 20 - stxvb16x 52, 19, 9 - - .long 0x12B5BD09 - .long 0x12D6BD09 - - xxlxor 53, 53, 21 - stxvb16x 53, 20, 9 - xxlxor 54, 54, 22 - stxvb16x 54, 21, 9 + vxor 15, 15, 23 + vxor 16, 16, 24 + vxor 17, 17, 25 + vxor 18, 18, 26 + vxor 19, 19, 27 + vxor 20, 20, 28 + vxor 21, 21, 29 + vxor 22, 22, 30 + + stxvb16x 47, 0, 9 # store output + stxvb16x 48, 15, 9 # store output + stxvb16x 49, 16, 9 # store output + stxvb16x 50, 17, 9 # store output + stxvb16x 51, 18, 9 # store output + stxvb16x 52, 19, 9 # store output + stxvb16x 53, 20, 9 # store output + stxvb16x 54, 21, 9 # store output addi 9, 9, 128 - xxlor 15+32, 15, 15 - xxlor 16+32, 16, 16 - xxlor 17+32, 17, 17 - xxlor 18+32, 18, 18 - xxlor 19+32, 19, 19 - xxlor 20+32, 20, 20 - xxlor 21+32, 21, 21 - xxlor 22+32, 22, 22 - - - ppc_aes_gcm_ghash2_4x - - xxlor 27+32, 0, 0 - .long 0x13DEF8C0 - vor 29,30,30 - vxor 15, 30, 27 - .long 0x13DEF8C0 - vxor 16, 30, 27 - .long 0x13DEF8C0 - vxor 17, 30, 27 - .long 0x13DEF8C0 - vxor 18, 30, 27 - .long 0x13DEF8C0 - vxor 19, 30, 27 - .long 0x13DEF8C0 - vxor 20, 30, 27 - .long 0x13DEF8C0 - vxor 21, 30, 27 - .long 0x13DEF8C0 - vxor 22, 30, 27 - addi 12, 12, -128 - addi 11, 11, 
128 - - bdnz .Loop_8x_block_dec - - vor 30,29,29 - -.Loop_last_block_dec: - cmpdi 12, 0 - beq aes_gcm_out - - - li 10, 16 - divdu 10, 12, 10 - - mtctr 10 - - lwz 10,240(6) - - cmpdi 12, 16 - blt Final_block_dec - -Next_rem_block_dec: - lxvb16x 15, 0, 14 - -.Loop_aes_middle_1x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_next_1x_dec - - - xxlor 24+32, 11, 11 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_next_1x_dec - - - xxlor 24+32, 13, 13 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_next_1x_dec - -Do_next_1x_dec: - .long 0x11EFBD09 - - xxlxor 47, 47, 15 - stxvb16x 47, 0, 9 - addi 14, 14, 16 - addi 9, 9, 16 - - xxlor 28+32, 15, 15 - ppc_update_hash_1x + vmr 15, 23 + vmr 16, 24 + vmr 17, 25 + vmr 18, 26 + vmr 19, 27 + vmr 20, 28 + vmr 21, 29 + vmr 22, 30 - addi 12, 12, -16 - addi 11, 11, 16 - xxlor 19+32, 0, 0 - .long 0x13DEF8C0 - vxor 15, 30, 19 + # ghash here + vxor 15, 15, 0 + PPC_GFMUL128_8x + + xxlor 32+15, 9, 9 # last state + vadduwm 15, 15, 31 # state + counter + vadduwm 16, 15, 31 + vadduwm 17, 16, 31 + vadduwm 18, 17, 31 + vadduwm 19, 18, 31 + vadduwm 20, 19, 31 + vadduwm 21, 20, 31 + vadduwm 22, 21, 31 + xxlor 9, 32+22, 32+22 # save last state + + xxlor 32+27, 0, 0 # restore roundkey 0 + vxor 15, 15, 27 # IV + round key - add round key 0 + vxor 16, 16, 27 + vxor 17, 17, 27 + vxor 18, 18, 27 + vxor 19, 19, 27 + vxor 20, 20, 27 + vxor 21, 21, 27 + vxor 22, 22, 27 - bdnz Next_rem_block_dec + addi 5, 5, -128 + addi 11, 11, 128 + + lxv 32+23, 16(6) # round key 1 + lxv 32+24, 32(6) # round key 2 + lxv 32+25, 48(6) # round key 3 + lxv 32+26, 64(6) # round key 4 + lxv 32+27, 80(6) # round key 5 + lxv 32+28, 96(6) # round key 6 + lxv 32+29, 112(6) # round key 7 + lxv 32+1, 128(6) # round key 8 + + LOOP_8AES_STATE # process 8 AES keys + mtctr 22 # AES key loop + addi 10, 6, 144 +__LastLoop_aes_state_dec: + lxv 32+1, 0(10) # round key + AES_CIPHER_8x 1 + addi 10, 10, 16 + 
bdnz __LastLoop_aes_state_dec + lxv 32+1, 0(10) # last round key (v1) + addi 12, 12, -1 cmpdi 12, 0 - beq aes_gcm_out - -Final_block_dec: -.Loop_aes_middle_1x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_final_1x_dec - - - xxlor 24+32, 11, 11 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_final_1x_dec - + bne __Loop_8x_block_dec + +__Finish_ghash_dec: + vcipherlast 15, 15, 1 + vcipherlast 16, 16, 1 + vcipherlast 17, 17, 1 + vcipherlast 18, 18, 1 + vcipherlast 19, 19, 1 + vcipherlast 20, 20, 1 + vcipherlast 21, 21, 1 + vcipherlast 22, 22, 1 + + lxvb16x 32+23, 0, 14 # load block + lxvb16x 32+24, 15, 14 # load block + lxvb16x 32+25, 16, 14 # load block + lxvb16x 32+26, 17, 14 # load block + lxvb16x 32+27, 18, 14 # load block + lxvb16x 32+28, 19, 14 # load block + lxvb16x 32+29, 20, 14 # load block + lxvb16x 32+30, 21, 14 # load block + addi 14, 14, 128 - xxlor 24+32, 13, 13 + vxor 15, 15, 23 + vxor 16, 16, 24 + vxor 17, 17, 25 + vxor 18, 18, 26 + vxor 19, 19, 27 + vxor 20, 20, 28 + vxor 21, 21, 29 + vxor 22, 22, 30 + + stxvb16x 47, 0, 9 # store output + stxvb16x 48, 15, 9 # store output + stxvb16x 49, 16, 9 # store output + stxvb16x 50, 17, 9 # store output + stxvb16x 51, 18, 9 # store output + stxvb16x 52, 19, 9 # store output + stxvb16x 53, 20, 9 # store output + stxvb16x 54, 21, 9 # store output + addi 9, 9, 128 - .long 0x11EFBD08 - .long 0x11EFC508 + vxor 15, 23, 0 + vmr 16, 24 + vmr 17, 25 + vmr 18, 26 + vmr 19, 27 + vmr 20, 28 + vmr 21, 29 + vmr 22, 30 - xxlor 23+32, 14, 14 + #vxor 15, 15, 0 + PPC_GFMUL128_8x - cmpdi 10, 14 - beq Do_final_1x_dec + xxlor 30+32, 9, 9 # last ctr + vadduwm 30, 30, 31 # increase ctr + stxvb16x 32+0, 0, 8 # update Xi -Do_final_1x_dec: - .long 0x11EFBD09 + addi 5, 5, -128 + addi 11, 11, 128 - lxvb16x 15, 0, 14 - xxlxor 47, 47, 15 + # + # Done 8x blocks + # + cmpdi 5, 0 + beq aes_gcm_out - li 15, 16 - sub 15, 15, 12 +__Process_more_dec: + li 24, 0 # decrypt + bl aes_gcm_crypt_1x + cmpdi 5, 
0 + beq aes_gcm_out - vspltisb 16,-1 - vspltisb 17,0 - li 10, 192 - stvx 16, 10, 1 - addi 10, 10, 16 - stvx 17, 10, 1 - - addi 10, 1, 192 - lxvb16x 16, 15, 10 - xxland 47, 47, 16 + bl __Process_partial + b aes_gcm_out +.size ppc_aes_gcm_decrypt,.-ppc_aes_gcm_decrypt - xxlor 28+32, 15, 15 - ppc_update_hash_1x +aes_gcm_out: +.localentry aes_gcm_out,0 + mr 3, 11 # return count - bl Write_partial_block + RESTORE_REGS + blr +.size aes_gcm_out,.-aes_gcm_out - b aes_gcm_out +.rodata +.align 4 +# for vector permute and xor +permx: +.long 0x4c5d6e7f, 0x08192a3b, 0xc4d5e6f7, 0x8091a2b3 diff --git a/sys/crypto/openssl/powerpc64le/aes-gcm-ppc.S b/sys/crypto/openssl/powerpc64le/aes-gcm-ppc.S index 2ff143c42ab7..51cfac7e45fc 100644 --- a/sys/crypto/openssl/powerpc64le/aes-gcm-ppc.S +++ b/sys/crypto/openssl/powerpc64le/aes-gcm-ppc.S @@ -1,532 +1,587 @@ /* Do not modify. This file is auto-generated from aes-gcm-ppc.pl. */ -.machine "any" -.abiversion 2 +.machine "any" .text - - - - -.macro .Loop_aes_middle4x - xxlor 19+32, 1, 1 - xxlor 20+32, 2, 2 - xxlor 21+32, 3, 3 - xxlor 22+32, 4, 4 - - .long 0x11EF9D08 - .long 0x12109D08 - .long 0x12319D08 - .long 0x12529D08 - - .long 0x11EFA508 - .long 0x1210A508 - .long 0x1231A508 - .long 0x1252A508 - - .long 0x11EFAD08 - .long 0x1210AD08 - .long 0x1231AD08 - .long 0x1252AD08 - - .long 0x11EFB508 - .long 0x1210B508 - .long 0x1231B508 - .long 0x1252B508 - - xxlor 19+32, 5, 5 - xxlor 20+32, 6, 6 - xxlor 21+32, 7, 7 - xxlor 22+32, 8, 8 - - .long 0x11EF9D08 - .long 0x12109D08 - .long 0x12319D08 - .long 0x12529D08 - - .long 0x11EFA508 - .long 0x1210A508 - .long 0x1231A508 - .long 0x1252A508 - - .long 0x11EFAD08 - .long 0x1210AD08 - .long 0x1231AD08 - .long 0x1252AD08 - - .long 0x11EFB508 - .long 0x1210B508 - .long 0x1231B508 - .long 0x1252B508 - - xxlor 23+32, 9, 9 - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 -.endm - - - - - -.macro .Loop_aes_middle8x - xxlor 23+32, 1, 1 - xxlor 24+32, 2, 2 - xxlor 25+32, 3, 3 - 
xxlor 26+32, 4, 4 - - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 - - .long 0x11EFC508 - .long 0x1210C508 - .long 0x1231C508 - .long 0x1252C508 - .long 0x1273C508 - .long 0x1294C508 - .long 0x12B5C508 - .long 0x12D6C508 - - .long 0x11EFCD08 - .long 0x1210CD08 - .long 0x1231CD08 - .long 0x1252CD08 - .long 0x1273CD08 - .long 0x1294CD08 - .long 0x12B5CD08 - .long 0x12D6CD08 - - .long 0x11EFD508 - .long 0x1210D508 - .long 0x1231D508 - .long 0x1252D508 - .long 0x1273D508 - .long 0x1294D508 - .long 0x12B5D508 - .long 0x12D6D508 - - xxlor 23+32, 5, 5 - xxlor 24+32, 6, 6 - xxlor 25+32, 7, 7 - xxlor 26+32, 8, 8 - - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 - - .long 0x11EFC508 - .long 0x1210C508 - .long 0x1231C508 - .long 0x1252C508 - .long 0x1273C508 - .long 0x1294C508 - .long 0x12B5C508 - .long 0x12D6C508 - - .long 0x11EFCD08 - .long 0x1210CD08 - .long 0x1231CD08 - .long 0x1252CD08 - .long 0x1273CD08 - .long 0x1294CD08 - .long 0x12B5CD08 - .long 0x12D6CD08 - - .long 0x11EFD508 - .long 0x1210D508 - .long 0x1231D508 - .long 0x1252D508 - .long 0x1273D508 - .long 0x1294D508 - .long 0x12B5D508 - .long 0x12D6D508 - - xxlor 23+32, 9, 9 - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 -.endm - - - - -ppc_aes_gcm_ghash: - vxor 15, 15, 0 - - xxlxor 29, 29, 29 - - .long 0x12EC7CC8 - .long 0x130984C8 - .long 0x13268CC8 - .long 0x134394C8 - - vxor 23, 23, 24 - vxor 23, 23, 25 - vxor 23, 23, 26 - - .long 0x130D7CC8 - .long 0x132A84C8 - .long 0x13478CC8 - .long 0x136494C8 - - vxor 24, 24, 25 - vxor 24, 24, 26 - vxor 24, 24, 27 - - - .long 0x139714C8 - - xxlor 29+32, 29, 29 - vsldoi 26, 24, 29, 8 - vsldoi 29, 29, 24, 8 - vxor 23, 23, 26 - - vsldoi 23, 23, 23, 8 - vxor 23, 23, 
28 - - .long 0x130E7CC8 - .long 0x132B84C8 - .long 0x13488CC8 - .long 0x136594C8 - - vxor 24, 24, 25 - vxor 24, 24, 26 - vxor 24, 24, 27 - - vxor 24, 24, 29 - - - vsldoi 27, 23, 23, 8 - .long 0x12F714C8 - vxor 27, 27, 24 - vxor 23, 23, 27 - - xxlor 32, 23+32, 23+32 - - blr - - - - - -.macro ppc_aes_gcm_ghash2_4x - - vxor 15, 15, 0 - - xxlxor 29, 29, 29 - - .long 0x12EC7CC8 - .long 0x130984C8 - .long 0x13268CC8 - .long 0x134394C8 +.macro SAVE_REGS + mflr 0 + std 0, 16(1) + stdu 1,-512(1) + + std 14, 112(1) + std 15, 120(1) + std 16, 128(1) + std 17, 136(1) + std 18, 144(1) + std 19, 152(1) + std 20, 160(1) + std 21, 168(1) + std 22, 176(1) + std 23, 184(1) + std 24, 192(1) + + stxv 32+20, 256(1) + stxv 32+21, 256+16(1) + stxv 32+22, 256+32(1) + stxv 32+23, 256+48(1) + stxv 32+24, 256+64(1) + stxv 32+25, 256+80(1) + stxv 32+26, 256+96(1) + stxv 32+27, 256+112(1) + stxv 32+28, 256+128(1) + stxv 32+29, 256+144(1) + stxv 32+30, 256+160(1) + stxv 32+31, 256+176(1) +.endm # SAVE_REGS + +.macro RESTORE_REGS + lxv 32+20, 256(1) + lxv 32+21, 256+16(1) + lxv 32+22, 256+32(1) + lxv 32+23, 256+48(1) + lxv 32+24, 256+64(1) + lxv 32+25, 256+80(1) + lxv 32+26, 256+96(1) + lxv 32+27, 256+112(1) + lxv 32+28, 256+128(1) + lxv 32+29, 256+144(1) + lxv 32+30, 256+160(1) + lxv 32+31, 256+176(1) + + ld 14, 112(1) + ld 15, 120(1) + ld 16, 128(1) + ld 17, 136(1) + ld 18, 144(1) + ld 19, 152(1) + ld 20, 160(1) + ld 21, 168(1) + ld 22, 176(1) + ld 23, 184(1) + ld 24, 192(1) + + addi 1, 1, 512 + ld 0, 16(1) + mtlr 0 +.endm # RESTORE_REGS + +# 4x loops +.macro AES_CIPHER_4x r + vcipher 15, 15, \r + vcipher 16, 16, \r + vcipher 17, 17, \r + vcipher 18, 18, \r +.endm + +# 8x loops +.macro AES_CIPHER_8x r + vcipher 15, 15, \r + vcipher 16, 16, \r + vcipher 17, 17, \r + vcipher 18, 18, \r + vcipher 19, 19, \r + vcipher 20, 20, \r + vcipher 21, 21, \r + vcipher 22, 22, \r +.endm + +.macro LOOP_8AES_STATE + AES_CIPHER_8x 23 + AES_CIPHER_8x 24 + AES_CIPHER_8x 25 + AES_CIPHER_8x 26 + AES_CIPHER_8x 27 + 
AES_CIPHER_8x 28 + AES_CIPHER_8x 29 + AES_CIPHER_8x 1 +.endm + +# +# PPC_GFMUL128_8x: Compute hash values of 8 blocks based on Karatsuba method. +# +# S1 should xor with the previous digest +# +# Xi = v0 +# H Poly = v2 +# Hash keys = v3 - v14 +# vs10: vpermxor vector +# Scratch: v23 - v29 +# +.macro PPC_GFMUL128_8x + + vpmsumd 23, 12, 15 # H4.L * X.L + vpmsumd 24, 9, 16 + vpmsumd 25, 6, 17 + vpmsumd 26, 3, 18 vxor 23, 23, 24 vxor 23, 23, 25 - vxor 23, 23, 26 - - .long 0x130D7CC8 - .long 0x132A84C8 - .long 0x13478CC8 - .long 0x136494C8 - - vxor 24, 24, 25 - vxor 24, 24, 26 - - - .long 0x139714C8 - - xxlor 29+32, 29, 29 - - vxor 24, 24, 27 - vsldoi 26, 24, 29, 8 - vsldoi 29, 29, 24, 8 - vxor 23, 23, 26 - - vsldoi 23, 23, 23, 8 - vxor 23, 23, 28 + vxor 23, 23, 26 # L - .long 0x130E7CC8 - .long 0x132B84C8 - .long 0x13488CC8 - .long 0x136594C8 + vpmsumd 27, 13, 15 # H4.L * X.H + H4.H * X.L + vpmsumd 28, 10, 16 # H3.L * X1.H + H3.H * X1.L + vpmsumd 25, 7, 17 + vpmsumd 26, 4, 18 + vxor 24, 27, 28 vxor 24, 24, 25 - vxor 24, 24, 26 - vxor 24, 24, 27 - - vxor 24, 24, 29 - - - vsldoi 27, 23, 23, 8 - .long 0x12F714C8 - vxor 27, 27, 24 - vxor 27, 23, 27 - - - .long 0x1309A4C8 - .long 0x1326ACC8 - .long 0x1343B4C8 - vxor 19, 19, 27 - .long 0x12EC9CC8 + vxor 24, 24, 26 # M + + vpmsumd 26, 14, 15 # H4.H * X.H + vpmsumd 27, 11, 16 + vpmsumd 28, 8, 17 + vpmsumd 29, 5, 18 + + vxor 26, 26, 27 + vxor 26, 26, 28 + vxor 26, 26, 29 + + # sum hash and reduction with H Poly + vpmsumd 28, 23, 2 # reduction + + vxor 1, 1, 1 + vsldoi 25, 24, 1, 8 # mL + vsldoi 1, 1, 24, 8 # mH + vxor 23, 23, 25 # mL + L + + # This performs swap and xor like, + # vsldoi 23, 23, 23, 8 # swap + # vxor 23, 23, 28 + xxlor 32+29, 10, 10 + vpermxor 23, 23, 28, 29 + + vxor 24, 26, 1 # H + + # sum hash and reduction with H Poly + # + # vsldoi 25, 23, 23, 8 # swap + # vpmsumd 23, 23, 2 + # vxor 27, 25, 24 + # + vpermxor 27, 23, 24, 29 + vpmsumd 23, 23, 2 + vxor 0, 23, 27 # Digest of 4 blocks + + vxor 19, 19, 0 + + # 
Compute digest for the next 4 blocks + vpmsumd 24, 9, 20 + vpmsumd 25, 6, 21 + vpmsumd 26, 3, 22 + vpmsumd 23, 12, 19 # H4.L * X.L vxor 23, 23, 24 vxor 23, 23, 25 - vxor 23, 23, 26 + vxor 23, 23, 26 # L - .long 0x130D9CC8 - .long 0x132AA4C8 - .long 0x1347ACC8 - .long 0x1364B4C8 + vpmsumd 27, 13, 19 # H4.L * X.H + H4.H * X.L + vpmsumd 28, 10, 20 # H3.L * X1.H + H3.H * X1.L + vpmsumd 25, 7, 21 + vpmsumd 26, 4, 22 + vxor 24, 27, 28 vxor 24, 24, 25 - vxor 24, 24, 26 - - - .long 0x139714C8 - - xxlor 29+32, 29, 29 - - vxor 24, 24, 27 - vsldoi 26, 24, 29, 8 - vsldoi 29, 29, 24, 8 - vxor 23, 23, 26 - - vsldoi 23, 23, 23, 8 - vxor 23, 23, 28 - - .long 0x130E9CC8 - .long 0x132BA4C8 - .long 0x1348ACC8 - .long 0x1365B4C8 - - vxor 24, 24, 25 - vxor 24, 24, 26 - vxor 24, 24, 27 - - vxor 24, 24, 29 - - - vsldoi 27, 23, 23, 8 - .long 0x12F714C8 - vxor 27, 27, 24 - vxor 23, 23, 27 - - xxlor 32, 23+32, 23+32 - -.endm - - - - -.macro ppc_update_hash_1x - vxor 28, 28, 0 - - vxor 19, 19, 19 - - .long 0x12C3E4C8 - .long 0x12E4E4C8 - .long 0x1305E4C8 - - .long 0x137614C8 - - vsldoi 25, 23, 19, 8 - vsldoi 26, 19, 23, 8 - vxor 22, 22, 25 - vxor 24, 24, 26 - - vsldoi 22, 22, 22, 8 - vxor 22, 22, 27 - - vsldoi 20, 22, 22, 8 - .long 0x12D614C8 - vxor 20, 20, 24 - vxor 22, 22, 20 - - vor 0,22,22 - -.endm - - - - - - - - - - - - - -.global ppc_aes_gcm_encrypt -.align 5 -ppc_aes_gcm_encrypt: -_ppc_aes_gcm_encrypt: - - stdu 1,-512(1) - mflr 0 - - std 14,112(1) - std 15,120(1) - std 16,128(1) - std 17,136(1) - std 18,144(1) - std 19,152(1) - std 20,160(1) - std 21,168(1) - li 9, 256 - stvx 20, 9, 1 - addi 9, 9, 16 - stvx 21, 9, 1 - addi 9, 9, 16 - stvx 22, 9, 1 - addi 9, 9, 16 - stvx 23, 9, 1 - addi 9, 9, 16 - stvx 24, 9, 1 - addi 9, 9, 16 - stvx 25, 9, 1 - addi 9, 9, 16 - stvx 26, 9, 1 - addi 9, 9, 16 - stvx 27, 9, 1 - addi 9, 9, 16 - stvx 28, 9, 1 - addi 9, 9, 16 - stvx 29, 9, 1 - addi 9, 9, 16 - stvx 30, 9, 1 - addi 9, 9, 16 - stvx 31, 9, 1 - std 0, 528(1) - - - lxvb16x 32, 0, 8 - + vxor 24, 
24, 26 # M + + vpmsumd 26, 14, 19 # H4.H * X.H + vpmsumd 27, 11, 20 + vpmsumd 28, 8, 21 + vpmsumd 29, 5, 22 + + vxor 26, 26, 27 + vxor 26, 26, 28 + vxor 26, 26, 29 + + # sum hash and reduction with H Poly + vpmsumd 28, 23, 2 # reduction + + vxor 1, 1, 1 + vsldoi 25, 24, 1, 8 # mL + vsldoi 1, 1, 24, 8 # mH + vxor 23, 23, 25 # mL + L + + # This performs swap and xor like, + # vsldoi 23, 23, 23, 8 # swap + # vxor 23, 23, 28 + xxlor 32+29, 10, 10 + vpermxor 23, 23, 28, 29 + + vxor 24, 26, 1 # H + + # sum hash and reduction with H Poly + # + # vsldoi 25, 23, 23, 8 # swap + # vpmsumd 23, 23, 2 + # vxor 27, 25, 24 + # + vpermxor 27, 23, 24, 29 + vpmsumd 23, 23, 2 + vxor 0, 23, 27 # Digest of 8 blocks +.endm + +# +# Compute update single ghash +# vs10: vpermxor vector +# scratch: v1, v22..v27 +# +.macro PPC_GHASH1x H S1 + + vxor 1, 1, 1 + + vpmsumd 22, 3, \S1 # L + vpmsumd 23, 4, \S1 # M + vpmsumd 24, 5, \S1 # H + + vpmsumd 27, 22, 2 # reduction + + vsldoi 25, 23, 1, 8 # mL + vsldoi 26, 1, 23, 8 # mH + vxor 22, 22, 25 # LL + LL + vxor 24, 24, 26 # HH + HH + + xxlor 32+25, 10, 10 + vpermxor 22, 22, 27, 25 + + # vsldoi 23, 22, 22, 8 # swap + # vpmsumd 22, 22, 2 # reduction + # vxor 23, 23, 24 + vpermxor 23, 22, 24, 25 + vpmsumd 22, 22, 2 # reduction + + vxor \H, 22, 23 +.endm + +# +# LOAD_HASH_TABLE +# Xi = v0 +# H Poly = v2 +# Hash keys = v3 - v14 +# +.macro LOAD_HASH_TABLE + # Load Xi + lxvb16x 32, 0, 8 # load Xi + + vxor 1, 1, 1 li 10, 32 - lxvd2x 2+32, 10, 8 - li 10, 48 - lxvd2x 3+32, 10, 8 - li 10, 64 - lxvd2x 4+32, 10, 8 - li 10, 80 - lxvd2x 5+32, 10, 8 + lxvd2x 2+32, 10, 8 # H Poli - li 10, 96 - lxvd2x 6+32, 10, 8 + # load Hash - h^4, h^3, h^2, h + li 10, 64 + lxvd2x 4+32, 10, 8 # H + vsldoi 3, 1, 4, 8 # l + vsldoi 5, 4, 1, 8 # h li 10, 112 - lxvd2x 7+32, 10, 8 - li 10, 128 - lxvd2x 8+32, 10, 8 - - li 10, 144 - lxvd2x 9+32, 10, 8 + lxvd2x 7+32, 10, 8 # H^2 + vsldoi 6, 1, 7, 8 # l + vsldoi 8, 7, 1, 8 # h li 10, 160 - lxvd2x 10+32, 10, 8 - li 10, 176 - lxvd2x 11+32, 10, 
8 - - li 10, 192 - lxvd2x 12+32, 10, 8 + lxvd2x 10+32, 10, 8 # H^3 + vsldoi 9, 1, 10, 8 # l + vsldoi 11, 10, 1, 8 # h li 10, 208 - lxvd2x 13+32, 10, 8 - li 10, 224 - lxvd2x 14+32, 10, 8 - - - lxvb16x 30+32, 0, 7 - - mr 12, 5 - li 11, 0 - - - vxor 31, 31, 31 - vspltisb 22,1 - vsldoi 31, 31, 22,1 - - - lxv 0, 0(6) - lxv 1, 0x10(6) - lxv 2, 0x20(6) - lxv 3, 0x30(6) - lxv 4, 0x40(6) - lxv 5, 0x50(6) - lxv 6, 0x60(6) - lxv 7, 0x70(6) - lxv 8, 0x80(6) - lxv 9, 0x90(6) - lxv 10, 0xa0(6) - - - lwz 9,240(6) + lxvd2x 13+32, 10, 8 # H^4 + vsldoi 12, 1, 13, 8 # l + vsldoi 14, 13, 1, 8 # h +.endm + +.macro PROCESS_8X_AES_STATES + vcipherlast 15, 15, 1 + vcipherlast 16, 16, 1 + vcipherlast 17, 17, 1 + vcipherlast 18, 18, 1 + vcipherlast 19, 19, 1 + vcipherlast 20, 20, 1 + vcipherlast 21, 21, 1 + vcipherlast 22, 22, 1 + + lxvb16x 32+23, 0, 14 # load block + lxvb16x 32+24, 15, 14 # load block + lxvb16x 32+25, 16, 14 # load block + lxvb16x 32+26, 17, 14 # load block + lxvb16x 32+27, 18, 14 # load block + lxvb16x 32+28, 19, 14 # load block + lxvb16x 32+29, 20, 14 # load block + lxvb16x 32+30, 21, 14 # load block + addi 14, 14, 128 + vxor 15, 15, 23 + vxor 16, 16, 24 + vxor 17, 17, 25 + vxor 18, 18, 26 + vxor 19, 19, 27 + vxor 20, 20, 28 + vxor 21, 21, 29 + vxor 22, 22, 30 + + stxvb16x 47, 0, 9 # store output + stxvb16x 48, 15, 9 # store output + stxvb16x 49, 16, 9 # store output + stxvb16x 50, 17, 9 # store output + stxvb16x 51, 18, 9 # store output + stxvb16x 52, 19, 9 # store output + stxvb16x 53, 20, 9 # store output + stxvb16x 54, 21, 9 # store output + addi 9, 9, 128 +.endm + +.macro COMPUTE_STATES + xxlor 32+15, 9, 9 # last state + vadduwm 15, 15, 31 # state + counter + vadduwm 16, 15, 31 + vadduwm 17, 16, 31 + vadduwm 18, 17, 31 + vadduwm 19, 18, 31 + vadduwm 20, 19, 31 + vadduwm 21, 20, 31 + vadduwm 22, 21, 31 + xxlor 9, 32+22, 32+22 # save last state + + xxlxor 32+15, 32+15, 0 # IV + round key - add round key 0 + xxlxor 32+16, 32+16, 0 + xxlxor 32+17, 32+17, 0 + xxlxor 
32+18, 32+18, 0 + xxlxor 32+19, 32+19, 0 + xxlxor 32+20, 32+20, 0 + xxlxor 32+21, 32+21, 0 + xxlxor 32+22, 32+22, 0 +.endm + +################################################################################ +# Compute AES and ghash one block at a time. +# r23: AES rounds +# v30: current IV +# vs0: roundkey 0 +# +################################################################################ +.align 4 +aes_gcm_crypt_1x: +.localentry aes_gcm_crypt_1x,0 + + cmpdi 5, 16 + bge __More_1x + blr +__More_1x: + li 10, 16 + divdu 12, 5, 10 + + xxlxor 32+15, 32+30, 0 + + # Pre-load 8 AES rounds to scratch vectors. + lxv 32+16, 16(6) # round key 1 + lxv 32+17, 32(6) # round key 2 + lxv 32+18, 48(6) # round key 3 + lxv 32+19, 64(6) # round key 4 + lxv 32+20, 80(6) # round key 5 + lxv 32+21, 96(6) # round key 6 + lxv 32+28, 112(6) # round key 7 + lxv 32+29, 128(6) # round key 8 + + lwz 23, 240(6) # n rounds + addi 22, 23, -9 # remaining AES rounds + cmpdi 12, 0 + bgt __Loop_1x + blr + +__Loop_1x: + mtctr 22 + addi 10, 6, 144 + vcipher 15, 15, 16 + vcipher 15, 15, 17 + vcipher 15, 15, 18 + vcipher 15, 15, 19 + vcipher 15, 15, 20 + vcipher 15, 15, 21 + vcipher 15, 15, 28 + vcipher 15, 15, 29 + +__Loop_aes_1state: + lxv 32+1, 0(10) + vcipher 15, 15, 1 + addi 10, 10, 16 + bdnz __Loop_aes_1state + lxv 32+1, 0(10) # last round key + lxvb16x 11, 0, 14 # load input block + vcipherlast 15, 15, 1 - xxlor 32+29, 0, 0 - vxor 15, 30, 29 + xxlxor 32+15, 32+15, 11 + stxvb16x 32+15, 0, 9 # store output + addi 14, 14, 16 + addi 9, 9, 16 - cmpdi 9, 10 - beq .Loop_aes_gcm_8x + cmpdi 24, 0 # decrypt? 
+ bne __Encrypt_1x + xxlor 15+32, 11, 11 +__Encrypt_1x: + vxor 15, 15, 0 + PPC_GHASH1x 0, 15 + addi 5, 5, -16 + addi 11, 11, 16 - lxv 11, 0xb0(6) - lxv 12, 0xc0(6) + vadduwm 30, 30, 31 # IV + counter + xxlxor 32+15, 32+30, 0 + addi 12, 12, -1 + cmpdi 12, 0 + bgt __Loop_1x + + stxvb16x 32+0, 0, 8 # update Xi + blr +.size aes_gcm_crypt_1x,.-aes_gcm_crypt_1x + +################################################################################ +# Process a normal partial block when we come here. +# Compute partial mask, Load and store partial block to stack. +# Compute AES state. +# Compute ghash. +# +################################################################################ +.align 4 +__Process_partial: +.localentry __Process_partial,0 + + # create partial mask + vspltisb 16, -1 + li 12, 16 + sub 12, 12, 5 + sldi 12, 12, 3 + mtvsrdd 32+17, 0, 12 + vslo 16, 16, 17 # partial block mask + + lxvb16x 11, 0, 14 # load partial block + xxland 11, 11, 32+16 + + # AES crypt partial + xxlxor 32+15, 32+30, 0 + lwz 23, 240(6) # n rounds + addi 22, 23, -1 # loop - 1 + mtctr 22 + addi 10, 6, 16 + +__Loop_aes_pstate: + lxv 32+1, 0(10) + vcipher 15, 15, 1 + addi 10, 10, 16 + bdnz __Loop_aes_pstate + lxv 32+1, 0(10) # last round key + vcipherlast 15, 15, 1 - cmpdi 9, 12 - beq .Loop_aes_gcm_8x + xxlxor 32+15, 32+15, 11 + vand 15, 15, 16 + # AES crypt output v15 + # Write partial + li 10, 224 + stxvb16x 15+32, 10, 1 # write v15 to stack + addi 10, 1, 223 + addi 12, 9, -1 + mtctr 5 # partial block len +__Write_partial: + lbzu 22, 1(10) + stbu 22, 1(12) + bdnz __Write_partial + + cmpdi 24, 0 # decrypt? 
+ bne __Encrypt_partial + xxlor 32+15, 11, 11 # decrypt using the input block +__Encrypt_partial: + vxor 15, 15, 0 # ^ previous hash + PPC_GHASH1x 0, 15 + li 5, 0 # done last byte + stxvb16x 32+0, 0, 8 # Update X1 + blr +.size __Process_partial,.-__Process_partial + +################################################################################ +# ppc_aes_gcm_encrypt (const void *inp, void *out, size_t len, +# const char *rk, unsigned char iv[16], void *Xip); +# +# r3 - inp +# r4 - out +# r5 - len +# r6 - AES round keys +# r7 - iv +# r8 - Xi, HPoli, hash keys +# +# rounds is at offset 240 in rk +# Xi is at 0 in gcm_table (Xip). +# +################################################################################ +.global ppc_aes_gcm_encrypt +.align 5 +ppc_aes_gcm_encrypt: +.localentry ppc_aes_gcm_encrypt,0 - lxv 13, 0xd0(6) - lxv 14, 0xe0(6) - cmpdi 9, 14 - beq .Loop_aes_gcm_8x + SAVE_REGS + LOAD_HASH_TABLE - b aes_gcm_out + # initialize ICB: GHASH( IV ), IV - r7 + lxvb16x 30+32, 0, 7 # load IV - v30 -.align 5 -.Loop_aes_gcm_8x: mr 14, 3 mr 9, 4 + # counter 1 + vxor 31, 31, 31 + vspltisb 22, 1 + vsldoi 31, 31, 22,1 # counter 1 + + addis 11, 2, permx@toc@ha + addi 11, 11, permx@toc@l + lxv 10, 0(11) # vs10: vpermxor vector + li 11, 0 + lxv 0, 0(6) # round key 0 + + # + # Process different blocks + # + cmpdi 5, 128 + blt __Process_more_enc + + # load 9 round keys + lxv 32+23, 16(6) # round key 1 + lxv 32+24, 32(6) # round key 2 + lxv 32+25, 48(6) # round key 3 + lxv 32+26, 64(6) # round key 4 + lxv 32+27, 80(6) # round key 5 + lxv 32+28, 96(6) # round key 6 + lxv 32+29, 112(6) # round key 7 + lxv 32+1, 128(6) # round key 8 + + # load rounds - 10 (128), 12 (192), 14 (256) + lwz 23, 240(6) # n rounds + +__Process_encrypt: +# +# Process 8x AES/GCM blocks +# +__Process_8x_enc: + # 8x blocks li 10, 128 - divdu 10, 5, 10 - cmpdi 10, 0 - beq .Loop_last_block - - .long 0x13DEF8C0 - vxor 16, 30, 29 - .long 0x13DEF8C0 - vxor 17, 30, 29 - .long 0x13DEF8C0 - vxor 18, 30, 29 - 
.long 0x13DEF8C0 - vxor 19, 30, 29 - .long 0x13DEF8C0 - vxor 20, 30, 29 - .long 0x13DEF8C0 - vxor 21, 30, 29 - .long 0x13DEF8C0 - vxor 22, 30, 29 - - mtctr 10 + divdu 12, 5, 10 # n 128 bytes-blocks + + addi 12, 12, -1 # loop - 1 + + vmr 15, 30 # first state: IV + vadduwm 16, 15, 31 # state + counter + vadduwm 17, 16, 31 + vadduwm 18, 17, 31 + vadduwm 19, 18, 31 + vadduwm 20, 19, 31 + vadduwm 21, 20, 31 + vadduwm 22, 21, 31 + xxlor 9, 32+22, 32+22 # save last state + + # vxor state, state, w # addroundkey + xxlxor 32+15, 32+15, 0 # IV + round key - add round key 0 + xxlxor 32+16, 32+16, 0 + xxlxor 32+17, 32+17, 0 + xxlxor 32+18, 32+18, 0 + xxlxor 32+19, 32+19, 0 + xxlxor 32+20, 32+20, 0 + xxlxor 32+21, 32+21, 0 + xxlxor 32+22, 32+22, 0 li 15, 16 li 16, 32 @@ -536,523 +591,185 @@ _ppc_aes_gcm_encrypt: li 20, 96 li 21, 112 - lwz 10, 240(6) - -.Loop_8x_block: - - lxvb16x 15, 0, 14 - lxvb16x 16, 15, 14 - lxvb16x 17, 16, 14 - lxvb16x 18, 17, 14 - lxvb16x 19, 18, 14 - lxvb16x 20, 19, 14 - lxvb16x 21, 20, 14 - lxvb16x 22, 21, 14 - addi 14, 14, 128 - -.Loop_aes_middle8x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_next_ghash - - - xxlor 24+32, 11, 11 - - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 - - .long 0x11EFC508 - .long 0x1210C508 - .long 0x1231C508 - .long 0x1252C508 - .long 0x1273C508 - .long 0x1294C508 - .long 0x12B5C508 - .long 0x12D6C508 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_next_ghash - - - xxlor 24+32, 13, 13 - - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 - - .long 0x11EFC508 - .long 0x1210C508 - .long 0x1231C508 - .long 0x1252C508 - .long 0x1273C508 - .long 0x1294C508 - .long 0x12B5C508 - .long 0x12D6C508 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_next_ghash - b aes_gcm_out - -Do_next_ghash: - - - - .long 0x11EFBD09 - .long 
0x1210BD09 - - xxlxor 47, 47, 15 - stxvb16x 47, 0, 9 - xxlxor 48, 48, 16 - stxvb16x 48, 15, 9 - - .long 0x1231BD09 - .long 0x1252BD09 - - xxlxor 49, 49, 17 - stxvb16x 49, 16, 9 - xxlxor 50, 50, 18 - stxvb16x 50, 17, 9 - - .long 0x1273BD09 - .long 0x1294BD09 - - xxlxor 51, 51, 19 - stxvb16x 51, 18, 9 - xxlxor 52, 52, 20 - stxvb16x 52, 19, 9 - - .long 0x12B5BD09 - .long 0x12D6BD09 - - xxlxor 53, 53, 21 - stxvb16x 53, 20, 9 - xxlxor 54, 54, 22 - stxvb16x 54, 21, 9 - - addi 9, 9, 128 - - - ppc_aes_gcm_ghash2_4x - - xxlor 27+32, 0, 0 - .long 0x13DEF8C0 - vor 29,30,30 - vxor 15, 30, 27 - .long 0x13DEF8C0 - vxor 16, 30, 27 - .long 0x13DEF8C0 - vxor 17, 30, 27 - .long 0x13DEF8C0 - vxor 18, 30, 27 - .long 0x13DEF8C0 - vxor 19, 30, 27 - .long 0x13DEF8C0 - vxor 20, 30, 27 - .long 0x13DEF8C0 - vxor 21, 30, 27 - .long 0x13DEF8C0 - vxor 22, 30, 27 - - addi 12, 12, -128 - addi 11, 11, 128 - - bdnz .Loop_8x_block - - vor 30,29,29 - -.Loop_last_block: - cmpdi 12, 0 - beq aes_gcm_out - - - li 10, 16 - divdu 10, 12, 10 - - mtctr 10 - - lwz 10, 240(6) - - cmpdi 12, 16 - blt Final_block - -.macro .Loop_aes_middle_1x - xxlor 19+32, 1, 1 - xxlor 20+32, 2, 2 - xxlor 21+32, 3, 3 - xxlor 22+32, 4, 4 - - .long 0x11EF9D08 - .long 0x11EFA508 - .long 0x11EFAD08 - .long 0x11EFB508 - - xxlor 19+32, 5, 5 - xxlor 20+32, 6, 6 - xxlor 21+32, 7, 7 - xxlor 22+32, 8, 8 - - .long 0x11EF9D08 - .long 0x11EFA508 - .long 0x11EFAD08 - .long 0x11EFB508 - - xxlor 19+32, 9, 9 - .long 0x11EF9D08 -.endm - -Next_rem_block: - lxvb16x 15, 0, 14 - -.Loop_aes_middle_1x - - xxlor 23+32, 10, 10 + # + # Pre-compute first 8 AES state and leave 1/3/5 more rounds + # for the loop. 
+ # + addi 22, 23, -9 # process 8 keys + mtctr 22 # AES key loop + addi 10, 6, 144 - cmpdi 10, 10 - beq Do_next_1x + LOOP_8AES_STATE # process 8 AES keys +__PreLoop_aes_state: + lxv 32+1, 0(10) # round key + AES_CIPHER_8x 1 + addi 10, 10, 16 + bdnz __PreLoop_aes_state + lxv 32+1, 0(10) # last round key (v1) - xxlor 24+32, 11, 11 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_next_1x - - - xxlor 24+32, 13, 13 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_next_1x - -Do_next_1x: - .long 0x11EFBD09 - - xxlxor 47, 47, 15 - stxvb16x 47, 0, 9 - addi 14, 14, 16 - addi 9, 9, 16 + cmpdi 12, 0 # Only one loop (8 block) + beq __Finish_ghash - vor 28,15,15 - ppc_update_hash_1x +# +# Loop 8x blocks and compute ghash +# +__Loop_8x_block_enc: + PROCESS_8X_AES_STATES - addi 12, 12, -16 - addi 11, 11, 16 - xxlor 19+32, 0, 0 - .long 0x13DEF8C0 - vxor 15, 30, 19 + # Compute ghash here + vxor 15, 15, 0 + PPC_GFMUL128_8x + + COMPUTE_STATES + + addi 5, 5, -128 + addi 11, 11, 128 + + lxv 32+23, 16(6) # round key 1 + lxv 32+24, 32(6) # round key 2 + lxv 32+25, 48(6) # round key 3 + lxv 32+26, 64(6) # round key 4 + lxv 32+27, 80(6) # round key 5 + lxv 32+28, 96(6) # round key 6 + lxv 32+29, 112(6) # round key 7 + lxv 32+1, 128(6) # round key 8 + + # Compute first 8 AES state and leave 1/3/5 more rounds + # for the loop. 
+ LOOP_8AES_STATE # process 8 AES keys + mtctr 22 # AES key loop + addi 10, 6, 144 + +__LastLoop_aes_state: + lxv 32+1, 0(10) # round key + AES_CIPHER_8x 1 + addi 10, 10, 16 + bdnz __LastLoop_aes_state - bdnz Next_rem_block + lxv 32+1, 0(10) # last round key (v1) + addi 12, 12, -1 cmpdi 12, 0 - beq aes_gcm_out - -Final_block: -.Loop_aes_middle_1x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_final_1x - - - xxlor 24+32, 11, 11 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_final_1x + bne __Loop_8x_block_enc + # + # Remainng blocks + # +__Finish_ghash: + PROCESS_8X_AES_STATES - xxlor 24+32, 13, 13 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_final_1x - -Do_final_1x: - .long 0x11EFBD09 - - lxvb16x 15, 0, 14 - xxlxor 47, 47, 15 - - - li 15, 16 - sub 15, 15, 12 + # Compute ghash here + vxor 15, 15, 0 + PPC_GFMUL128_8x - vspltisb 16,-1 - vspltisb 17,0 - li 10, 192 - stvx 16, 10, 1 - addi 10, 10, 16 - stvx 17, 10, 1 + # Update IV and Xi + xxlor 30+32, 9, 9 # last ctr + vadduwm 30, 30, 31 # increase ctr + stxvb16x 32+0, 0, 8 # update Xi - addi 10, 1, 192 - lxvb16x 16, 15, 10 - xxland 47, 47, 16 + addi 5, 5, -128 + addi 11, 11, 128 - vor 28,15,15 - ppc_update_hash_1x + # + # Done 8x blocks + # + cmpdi 5, 0 + beq aes_gcm_out - bl Write_partial_block +__Process_more_enc: + li 24, 1 # encrypt + bl aes_gcm_crypt_1x + cmpdi 5, 0 + beq aes_gcm_out + bl __Process_partial b aes_gcm_out +.size ppc_aes_gcm_encrypt,.-ppc_aes_gcm_encrypt - - - - - -Write_partial_block: - li 10, 192 - stxvb16x 15+32, 10, 1 - - - addi 10, 9, -1 - addi 16, 1, 191 - - mtctr 12 - li 15, 0 - -Write_last_byte: - lbzu 14, 1(16) - stbu 14, 1(10) - bdnz Write_last_byte - blr - -aes_gcm_out: - - stxvb16x 32, 0, 8 - add 3, 11, 12 - - li 9, 256 - lvx 20, 9, 1 - addi 9, 9, 16 - lvx 21, 9, 1 - addi 9, 9, 16 - lvx 22, 9, 1 - addi 9, 9, 16 - lvx 23, 9, 1 - addi 9, 9, 16 - lvx 24, 9, 1 - addi 9, 9, 16 - lvx 25, 9, 1 - addi 9, 9, 
16 - lvx 26, 9, 1 - addi 9, 9, 16 - lvx 27, 9, 1 - addi 9, 9, 16 - lvx 28, 9, 1 - addi 9, 9, 16 - lvx 29, 9, 1 - addi 9, 9, 16 - lvx 30, 9, 1 - addi 9, 9, 16 - lvx 31, 9, 1 - - ld 0, 528(1) - ld 14,112(1) - ld 15,120(1) - ld 16,128(1) - ld 17,136(1) - ld 18,144(1) - ld 19,152(1) - ld 20,160(1) - ld 21,168(1) - - mtlr 0 - addi 1, 1, 512 - blr - - - - -.global ppc_aes_gcm_decrypt -.align 5 +################################################################################ +# ppc_aes_gcm_decrypt (const void *inp, void *out, size_t len, +# const char *rk, unsigned char iv[16], void *Xip); +# 8x Decrypt +# +################################################################################ +.global ppc_aes_gcm_decrypt +.align 5 ppc_aes_gcm_decrypt: -_ppc_aes_gcm_decrypt: - - stdu 1,-512(1) - mflr 0 - - std 14,112(1) - std 15,120(1) - std 16,128(1) - std 17,136(1) - std 18,144(1) - std 19,152(1) - std 20,160(1) - std 21,168(1) - li 9, 256 - stvx 20, 9, 1 - addi 9, 9, 16 - stvx 21, 9, 1 - addi 9, 9, 16 - stvx 22, 9, 1 - addi 9, 9, 16 - stvx 23, 9, 1 - addi 9, 9, 16 - stvx 24, 9, 1 - addi 9, 9, 16 - stvx 25, 9, 1 - addi 9, 9, 16 - stvx 26, 9, 1 - addi 9, 9, 16 - stvx 27, 9, 1 - addi 9, 9, 16 - stvx 28, 9, 1 - addi 9, 9, 16 - stvx 29, 9, 1 - addi 9, 9, 16 - stvx 30, 9, 1 - addi 9, 9, 16 - stvx 31, 9, 1 - std 0, 528(1) - - - lxvb16x 32, 0, 8 - - - li 10, 32 - lxvd2x 2+32, 10, 8 - li 10, 48 - lxvd2x 3+32, 10, 8 - li 10, 64 - lxvd2x 4+32, 10, 8 - li 10, 80 - lxvd2x 5+32, 10, 8 - - li 10, 96 - lxvd2x 6+32, 10, 8 - li 10, 112 - lxvd2x 7+32, 10, 8 - li 10, 128 - lxvd2x 8+32, 10, 8 - - li 10, 144 - lxvd2x 9+32, 10, 8 - li 10, 160 - lxvd2x 10+32, 10, 8 - li 10, 176 - lxvd2x 11+32, 10, 8 - - li 10, 192 - lxvd2x 12+32, 10, 8 - li 10, 208 - lxvd2x 13+32, 10, 8 - li 10, 224 - lxvd2x 14+32, 10, 8 - - - lxvb16x 30+32, 0, 7 - - mr 12, 5 - li 11, 0 - +.localentry ppc_aes_gcm_decrypt, 0 - vxor 31, 31, 31 - vspltisb 22,1 - vsldoi 31, 31, 22,1 - - - lxv 0, 0(6) - lxv 1, 0x10(6) - lxv 2, 0x20(6) - 
lxv 3, 0x30(6) - lxv 4, 0x40(6) - lxv 5, 0x50(6) - lxv 6, 0x60(6) - lxv 7, 0x70(6) - lxv 8, 0x80(6) - lxv 9, 0x90(6) - lxv 10, 0xa0(6) - - - lwz 9,240(6) - - - - xxlor 32+29, 0, 0 - vxor 15, 30, 29 - - cmpdi 9, 10 - beq .Loop_aes_gcm_8x_dec - - - lxv 11, 0xb0(6) - lxv 12, 0xc0(6) - - cmpdi 9, 12 - beq .Loop_aes_gcm_8x_dec + SAVE_REGS + LOAD_HASH_TABLE + # initialize ICB: GHASH( IV ), IV - r7 + lxvb16x 30+32, 0, 7 # load IV - v30 - lxv 13, 0xd0(6) - lxv 14, 0xe0(6) - cmpdi 9, 14 - beq .Loop_aes_gcm_8x_dec - - b aes_gcm_out - -.align 5 -.Loop_aes_gcm_8x_dec: mr 14, 3 mr 9, 4 + # counter 1 + vxor 31, 31, 31 + vspltisb 22, 1 + vsldoi 31, 31, 22,1 # counter 1 + + addis 11, 2, permx@toc@ha + addi 11, 11, permx@toc@l + lxv 10, 0(11) # vs10: vpermxor vector + li 11, 0 + lxv 0, 0(6) # round key 0 + + # + # Process different blocks + # + cmpdi 5, 128 + blt __Process_more_dec + + # load 9 round keys + lxv 32+23, 16(6) # round key 1 + lxv 32+24, 32(6) # round key 2 + lxv 32+25, 48(6) # round key 3 + lxv 32+26, 64(6) # round key 4 + lxv 32+27, 80(6) # round key 5 + lxv 32+28, 96(6) # round key 6 + lxv 32+29, 112(6) # round key 7 + lxv 32+1, 128(6) # round key 8 + + # load rounds - 10 (128), 12 (192), 14 (256) + lwz 23, 240(6) # n rounds + +__Process_decrypt: +# +# Process 8x AES/GCM blocks +# +__Process_8x_dec: + # 8x blocks li 10, 128 - divdu 10, 5, 10 - cmpdi 10, 0 - beq .Loop_last_block_dec - - .long 0x13DEF8C0 - vxor 16, 30, 29 - .long 0x13DEF8C0 - vxor 17, 30, 29 - .long 0x13DEF8C0 - vxor 18, 30, 29 - .long 0x13DEF8C0 - vxor 19, 30, 29 - .long 0x13DEF8C0 - vxor 20, 30, 29 - .long 0x13DEF8C0 - vxor 21, 30, 29 - .long 0x13DEF8C0 - vxor 22, 30, 29 - - mtctr 10 + divdu 12, 5, 10 # n 128 bytes-blocks + + addi 12, 12, -1 # loop - 1 + + vmr 15, 30 # first state: IV + vadduwm 16, 15, 31 # state + counter + vadduwm 17, 16, 31 + vadduwm 18, 17, 31 + vadduwm 19, 18, 31 + vadduwm 20, 19, 31 + vadduwm 21, 20, 31 + vadduwm 22, 21, 31 + xxlor 9, 32+22, 32+22 # save last state + + # vxor 
state, state, w # addroundkey + xxlxor 32+15, 32+15, 0 # IV + round key - add round key 0 + xxlxor 32+16, 32+16, 0 + xxlxor 32+17, 32+17, 0 + xxlxor 32+18, 32+18, 0 + xxlxor 32+19, 32+19, 0 + xxlxor 32+20, 32+20, 0 + xxlxor 32+21, 32+21, 0 + xxlxor 32+22, 32+22, 0 li 15, 16 li 16, 32 @@ -1062,279 +779,215 @@ _ppc_aes_gcm_decrypt: li 20, 96 li 21, 112 - lwz 10, 240(6) + # + # Pre-compute first 8 AES state and leave 1/3/5 more rounds + # for the loop. + # + addi 22, 23, -9 # process 8 keys + mtctr 22 # AES key loop + addi 10, 6, 144 -.Loop_8x_block_dec: + LOOP_8AES_STATE # process 8 AES keys - lxvb16x 15, 0, 14 - lxvb16x 16, 15, 14 - lxvb16x 17, 16, 14 - lxvb16x 18, 17, 14 - lxvb16x 19, 18, 14 - lxvb16x 20, 19, 14 - lxvb16x 21, 20, 14 - lxvb16x 22, 21, 14 +__PreLoop_aes_state_dec: + lxv 32+1, 0(10) # round key + AES_CIPHER_8x 1 + addi 10, 10, 16 + bdnz __PreLoop_aes_state_dec + lxv 32+1, 0(10) # last round key (v1) + + cmpdi 12, 0 # Only one loop (8 block) + beq __Finish_ghash_dec + +# +# Loop 8x blocks and compute ghash +# +__Loop_8x_block_dec: + vcipherlast 15, 15, 1 + vcipherlast 16, 16, 1 + vcipherlast 17, 17, 1 + vcipherlast 18, 18, 1 + vcipherlast 19, 19, 1 + vcipherlast 20, 20, 1 + vcipherlast 21, 21, 1 + vcipherlast 22, 22, 1 + + lxvb16x 32+23, 0, 14 # load block + lxvb16x 32+24, 15, 14 # load block + lxvb16x 32+25, 16, 14 # load block + lxvb16x 32+26, 17, 14 # load block + lxvb16x 32+27, 18, 14 # load block + lxvb16x 32+28, 19, 14 # load block + lxvb16x 32+29, 20, 14 # load block + lxvb16x 32+30, 21, 14 # load block addi 14, 14, 128 -.Loop_aes_middle8x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_last_aes_dec - - - xxlor 24+32, 11, 11 - - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 - - .long 0x11EFC508 - .long 0x1210C508 - .long 0x1231C508 - .long 0x1252C508 - .long 0x1273C508 - .long 0x1294C508 - .long 0x12B5C508 - .long 0x12D6C508 - - xxlor 23+32, 
12, 12 - - cmpdi 10, 12 - beq Do_last_aes_dec - - - xxlor 24+32, 13, 13 - - .long 0x11EFBD08 - .long 0x1210BD08 - .long 0x1231BD08 - .long 0x1252BD08 - .long 0x1273BD08 - .long 0x1294BD08 - .long 0x12B5BD08 - .long 0x12D6BD08 - - .long 0x11EFC508 - .long 0x1210C508 - .long 0x1231C508 - .long 0x1252C508 - .long 0x1273C508 - .long 0x1294C508 - .long 0x12B5C508 - .long 0x12D6C508 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_last_aes_dec - b aes_gcm_out - -Do_last_aes_dec: - - - - .long 0x11EFBD09 - .long 0x1210BD09 - - xxlxor 47, 47, 15 - stxvb16x 47, 0, 9 - xxlxor 48, 48, 16 - stxvb16x 48, 15, 9 - - .long 0x1231BD09 - .long 0x1252BD09 - - xxlxor 49, 49, 17 - stxvb16x 49, 16, 9 - xxlxor 50, 50, 18 - stxvb16x 50, 17, 9 - - .long 0x1273BD09 - .long 0x1294BD09 - - xxlxor 51, 51, 19 - stxvb16x 51, 18, 9 - xxlxor 52, 52, 20 - stxvb16x 52, 19, 9 - - .long 0x12B5BD09 - .long 0x12D6BD09 - - xxlxor 53, 53, 21 - stxvb16x 53, 20, 9 - xxlxor 54, 54, 22 - stxvb16x 54, 21, 9 + vxor 15, 15, 23 + vxor 16, 16, 24 + vxor 17, 17, 25 + vxor 18, 18, 26 + vxor 19, 19, 27 + vxor 20, 20, 28 + vxor 21, 21, 29 + vxor 22, 22, 30 + + stxvb16x 47, 0, 9 # store output + stxvb16x 48, 15, 9 # store output + stxvb16x 49, 16, 9 # store output + stxvb16x 50, 17, 9 # store output + stxvb16x 51, 18, 9 # store output + stxvb16x 52, 19, 9 # store output + stxvb16x 53, 20, 9 # store output + stxvb16x 54, 21, 9 # store output addi 9, 9, 128 - xxlor 15+32, 15, 15 - xxlor 16+32, 16, 16 - xxlor 17+32, 17, 17 - xxlor 18+32, 18, 18 - xxlor 19+32, 19, 19 - xxlor 20+32, 20, 20 - xxlor 21+32, 21, 21 - xxlor 22+32, 22, 22 - - - ppc_aes_gcm_ghash2_4x - - xxlor 27+32, 0, 0 - .long 0x13DEF8C0 - vor 29,30,30 - vxor 15, 30, 27 - .long 0x13DEF8C0 - vxor 16, 30, 27 - .long 0x13DEF8C0 - vxor 17, 30, 27 - .long 0x13DEF8C0 - vxor 18, 30, 27 - .long 0x13DEF8C0 - vxor 19, 30, 27 - .long 0x13DEF8C0 - vxor 20, 30, 27 - .long 0x13DEF8C0 - vxor 21, 30, 27 - .long 0x13DEF8C0 - vxor 22, 30, 27 - addi 12, 12, -128 - addi 11, 11, 
128 - - bdnz .Loop_8x_block_dec - - vor 30,29,29 - -.Loop_last_block_dec: - cmpdi 12, 0 - beq aes_gcm_out - - - li 10, 16 - divdu 10, 12, 10 - - mtctr 10 - - lwz 10,240(6) - - cmpdi 12, 16 - blt Final_block_dec - -Next_rem_block_dec: - lxvb16x 15, 0, 14 - -.Loop_aes_middle_1x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_next_1x_dec - - - xxlor 24+32, 11, 11 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_next_1x_dec - - - xxlor 24+32, 13, 13 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 14, 14 - - cmpdi 10, 14 - beq Do_next_1x_dec - -Do_next_1x_dec: - .long 0x11EFBD09 - - xxlxor 47, 47, 15 - stxvb16x 47, 0, 9 - addi 14, 14, 16 - addi 9, 9, 16 - - xxlor 28+32, 15, 15 - ppc_update_hash_1x + vmr 15, 23 + vmr 16, 24 + vmr 17, 25 + vmr 18, 26 + vmr 19, 27 + vmr 20, 28 + vmr 21, 29 + vmr 22, 30 - addi 12, 12, -16 - addi 11, 11, 16 - xxlor 19+32, 0, 0 - .long 0x13DEF8C0 - vxor 15, 30, 19 + # ghash here + vxor 15, 15, 0 + PPC_GFMUL128_8x + + xxlor 32+15, 9, 9 # last state + vadduwm 15, 15, 31 # state + counter + vadduwm 16, 15, 31 + vadduwm 17, 16, 31 + vadduwm 18, 17, 31 + vadduwm 19, 18, 31 + vadduwm 20, 19, 31 + vadduwm 21, 20, 31 + vadduwm 22, 21, 31 + xxlor 9, 32+22, 32+22 # save last state + + xxlor 32+27, 0, 0 # restore roundkey 0 + vxor 15, 15, 27 # IV + round key - add round key 0 + vxor 16, 16, 27 + vxor 17, 17, 27 + vxor 18, 18, 27 + vxor 19, 19, 27 + vxor 20, 20, 27 + vxor 21, 21, 27 + vxor 22, 22, 27 - bdnz Next_rem_block_dec + addi 5, 5, -128 + addi 11, 11, 128 + + lxv 32+23, 16(6) # round key 1 + lxv 32+24, 32(6) # round key 2 + lxv 32+25, 48(6) # round key 3 + lxv 32+26, 64(6) # round key 4 + lxv 32+27, 80(6) # round key 5 + lxv 32+28, 96(6) # round key 6 + lxv 32+29, 112(6) # round key 7 + lxv 32+1, 128(6) # round key 8 + + LOOP_8AES_STATE # process 8 AES keys + mtctr 22 # AES key loop + addi 10, 6, 144 +__LastLoop_aes_state_dec: + lxv 32+1, 0(10) # round key + AES_CIPHER_8x 1 + addi 10, 10, 16 + 
bdnz __LastLoop_aes_state_dec + lxv 32+1, 0(10) # last round key (v1) + addi 12, 12, -1 cmpdi 12, 0 - beq aes_gcm_out - -Final_block_dec: -.Loop_aes_middle_1x - - xxlor 23+32, 10, 10 - - cmpdi 10, 10 - beq Do_final_1x_dec - - - xxlor 24+32, 11, 11 - - .long 0x11EFBD08 - .long 0x11EFC508 - - xxlor 23+32, 12, 12 - - cmpdi 10, 12 - beq Do_final_1x_dec - + bne __Loop_8x_block_dec + +__Finish_ghash_dec: + vcipherlast 15, 15, 1 + vcipherlast 16, 16, 1 + vcipherlast 17, 17, 1 + vcipherlast 18, 18, 1 + vcipherlast 19, 19, 1 + vcipherlast 20, 20, 1 + vcipherlast 21, 21, 1 + vcipherlast 22, 22, 1 + + lxvb16x 32+23, 0, 14 # load block + lxvb16x 32+24, 15, 14 # load block + lxvb16x 32+25, 16, 14 # load block + lxvb16x 32+26, 17, 14 # load block + lxvb16x 32+27, 18, 14 # load block + lxvb16x 32+28, 19, 14 # load block + lxvb16x 32+29, 20, 14 # load block + lxvb16x 32+30, 21, 14 # load block + addi 14, 14, 128 - xxlor 24+32, 13, 13 + vxor 15, 15, 23 + vxor 16, 16, 24 + vxor 17, 17, 25 + vxor 18, 18, 26 + vxor 19, 19, 27 + vxor 20, 20, 28 + vxor 21, 21, 29 + vxor 22, 22, 30 + + stxvb16x 47, 0, 9 # store output + stxvb16x 48, 15, 9 # store output + stxvb16x 49, 16, 9 # store output + stxvb16x 50, 17, 9 # store output + stxvb16x 51, 18, 9 # store output + stxvb16x 52, 19, 9 # store output + stxvb16x 53, 20, 9 # store output + stxvb16x 54, 21, 9 # store output + addi 9, 9, 128 - .long 0x11EFBD08 - .long 0x11EFC508 + vxor 15, 23, 0 + vmr 16, 24 + vmr 17, 25 + vmr 18, 26 + vmr 19, 27 + vmr 20, 28 + vmr 21, 29 + vmr 22, 30 - xxlor 23+32, 14, 14 + #vxor 15, 15, 0 + PPC_GFMUL128_8x - cmpdi 10, 14 - beq Do_final_1x_dec + xxlor 30+32, 9, 9 # last ctr + vadduwm 30, 30, 31 # increase ctr + stxvb16x 32+0, 0, 8 # update Xi -Do_final_1x_dec: - .long 0x11EFBD09 + addi 5, 5, -128 + addi 11, 11, 128 - lxvb16x 15, 0, 14 - xxlxor 47, 47, 15 + # + # Done 8x blocks + # + cmpdi 5, 0 + beq aes_gcm_out - li 15, 16 - sub 15, 15, 12 +__Process_more_dec: + li 24, 0 # decrypt + bl aes_gcm_crypt_1x + cmpdi 5, 
0 + beq aes_gcm_out - vspltisb 16,-1 - vspltisb 17,0 - li 10, 192 - stvx 16, 10, 1 - addi 10, 10, 16 - stvx 17, 10, 1 - - addi 10, 1, 192 - lxvb16x 16, 15, 10 - xxland 47, 47, 16 + bl __Process_partial + b aes_gcm_out +.size ppc_aes_gcm_decrypt,.-ppc_aes_gcm_decrypt - xxlor 28+32, 15, 15 - ppc_update_hash_1x +aes_gcm_out: +.localentry aes_gcm_out,0 + mr 3, 11 # return count - bl Write_partial_block + RESTORE_REGS + blr +.size aes_gcm_out,.-aes_gcm_out - b aes_gcm_out +.rodata +.align 4 +# for vector permute and xor +permx: +.long 0x4c5d6e7f, 0x08192a3b, 0xc4d5e6f7, 0x8091a2b3 |
