Diffstat (limited to 'lib/libmd/amd64')
-rw-r--r--   lib/libmd/amd64/sha1block.S     1851
-rw-r--r--   lib/libmd/amd64/sha1dispatch.c    77
2 files changed, 1928 insertions, 0 deletions
diff --git a/lib/libmd/amd64/sha1block.S b/lib/libmd/amd64/sha1block.S new file mode 100644 index 000000000000..f1291ef2647a --- /dev/null +++ b/lib/libmd/amd64/sha1block.S @@ -0,0 +1,1851 @@ +/*- + * Copyright (c) 2013 The Go Authors. All rights reserved. + * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org> + * + * Adapted from Go's crypto/sha1/sha1block_amd64.s. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <machine/asm.h> + +/* + * SHA-1 block routine. See sha1c.c for C equivalent. + * + * There are 80 rounds of 4 types: + * - rounds 0-15 are type 1 and load data (round1 macro). + * - rounds 16-19 are type 1 and do not load data (round1x macro). + * - rounds 20-39 are type 2 and do not load data (round2 macro). + * - rounds 40-59 are type 3 and do not load data (round3 macro). + * - rounds 60-79 are type 4 and do not load data (round4 macro). + * + * Each round loads or shuffles the data, then computes a per-round + * function of b, c, d, and then mixes the result into and rotates the + * five registers a, b, c, d, e holding the intermediate results. + * + * The register rotation is implemented by rotating the arguments to + * the round macros instead of by explicit move instructions. 
+ */ +.macro load index + mov (\index)*4(%rsi), %r10d + bswap %r10d + mov %r10d, (\index)*4(%rsp) +.endm + +.macro shuffle index + mov ((\index )&0xf)*4(%rsp), %r10d + xor ((\index- 3)&0xf)*4(%rsp), %r10d + xor ((\index- 8)&0xf)*4(%rsp), %r10d + xor ((\index-14)&0xf)*4(%rsp), %r10d + rol $1, %r10d + mov %r10d, ((\index)&0xf)*4(%rsp) +.endm + +.macro func1 a, b, c, d, e + mov \d, %r9d + xor \c, %r9d + and \b, %r9d + xor \d, %r9d +.endm + +.macro func2 a, b, c, d, e + mov \b, %r9d + xor \c, %r9d + xor \d, %r9d +.endm + +.macro func3 a, b, c, d, e + mov \b, %r8d + or \c, %r8d + and \d, %r8d + mov \b, %r9d + and \c, %r9d + or %r8d, %r9d +.endm + +.macro func4 a, b, c, d, e + func2 \a, \b, \c, \d, \e +.endm + +.macro mix a, b, c, d, e, const + rol $30, \b + add %r9d, \e + mov \a, %r8d + rol $5, %r8d + lea \const(\e, %r10d, 1), \e + add %r8d, \e +.endm + +.macro round1 a, b, c, d, e, index + load \index + func1 \a, \b, \c, \d, \e + mix \a, \b, \c, \d, \e, 0x5a827999 +.endm + +.macro round1x a, b, c, d, e, index + shuffle \index + func1 \a, \b, \c, \d, \e + mix \a, \b, \c, \d, \e, 0x5a827999 +.endm + +.macro round2 a, b, c, d, e, index + shuffle \index + func2 \a, \b, \c, \d, \e + mix \a, \b, \c, \d, \e, 0x6ed9eba1 +.endm + +.macro round3 a, b, c, d, e, index + shuffle \index + func3 \a, \b, \c, \d, \e + mix \a, \b, \c, \d, \e, 0x8f1bbcdc +.endm + +.macro round4 a, b, c, d, e, index + shuffle \index + func4 \a, \b, \c, \d, \e + mix \a, \b, \c, \d, \e, 0xca62c1d6 +.endm + + // sha1block(SHA1_CTX, buf, len) +ENTRY(_libmd_sha1block_scalar) + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + push %rdi // rdi: SHA1_CTX + sub $64+8, %rsp // 64 bytes for round keys + // plus alignment + + mov %rdi, %rbp + // rsi: buf + and $~63, %rdx // rdx: length in blocks + lea (%rsi, %rdx, 1), %rdi // rdi: end pointer + mov (%rbp), %eax // c->h0 + mov 4(%rbp), %ebx // c->h1 + mov 8(%rbp), %ecx // c->h2 + mov 12(%rbp), %edx // c->h3 + mov 16(%rbp), %ebp // c->h4 + + cmp %rsi, %rdi // any data to process? 
+ je .Lend + +.Lloop: mov %eax, %r11d + mov %ebx, %r12d + mov %ecx, %r13d + mov %edx, %r14d + mov %ebp, %r15d + + round1 %eax, %ebx, %ecx, %edx, %ebp, 0 + round1 %ebp, %eax, %ebx, %ecx, %edx, 1 + round1 %edx, %ebp, %eax, %ebx, %ecx, 2 + round1 %ecx, %edx, %ebp, %eax, %ebx, 3 + round1 %ebx, %ecx, %edx, %ebp, %eax, 4 + + round1 %eax, %ebx, %ecx, %edx, %ebp, 5 + round1 %ebp, %eax, %ebx, %ecx, %edx, 6 + round1 %edx, %ebp, %eax, %ebx, %ecx, 7 + round1 %ecx, %edx, %ebp, %eax, %ebx, 8 + round1 %ebx, %ecx, %edx, %ebp, %eax, 9 + + round1 %eax, %ebx, %ecx, %edx, %ebp, 10 + round1 %ebp, %eax, %ebx, %ecx, %edx, 11 + round1 %edx, %ebp, %eax, %ebx, %ecx, 12 + round1 %ecx, %edx, %ebp, %eax, %ebx, 13 + round1 %ebx, %ecx, %edx, %ebp, %eax, 14 + + round1 %eax, %ebx, %ecx, %edx, %ebp, 15 + round1x %ebp, %eax, %ebx, %ecx, %edx, 16 + round1x %edx, %ebp, %eax, %ebx, %ecx, 17 + round1x %ecx, %edx, %ebp, %eax, %ebx, 18 + round1x %ebx, %ecx, %edx, %ebp, %eax, 19 + + round2 %eax, %ebx, %ecx, %edx, %ebp, 20 + round2 %ebp, %eax, %ebx, %ecx, %edx, 21 + round2 %edx, %ebp, %eax, %ebx, %ecx, 22 + round2 %ecx, %edx, %ebp, %eax, %ebx, 23 + round2 %ebx, %ecx, %edx, %ebp, %eax, 24 + + round2 %eax, %ebx, %ecx, %edx, %ebp, 25 + round2 %ebp, %eax, %ebx, %ecx, %edx, 26 + round2 %edx, %ebp, %eax, %ebx, %ecx, 27 + round2 %ecx, %edx, %ebp, %eax, %ebx, 28 + round2 %ebx, %ecx, %edx, %ebp, %eax, 29 + + round2 %eax, %ebx, %ecx, %edx, %ebp, 30 + round2 %ebp, %eax, %ebx, %ecx, %edx, 31 + round2 %edx, %ebp, %eax, %ebx, %ecx, 32 + round2 %ecx, %edx, %ebp, %eax, %ebx, 33 + round2 %ebx, %ecx, %edx, %ebp, %eax, 34 + + round2 %eax, %ebx, %ecx, %edx, %ebp, 35 + round2 %ebp, %eax, %ebx, %ecx, %edx, 36 + round2 %edx, %ebp, %eax, %ebx, %ecx, 37 + round2 %ecx, %edx, %ebp, %eax, %ebx, 38 + round2 %ebx, %ecx, %edx, %ebp, %eax, 39 + + round3 %eax, %ebx, %ecx, %edx, %ebp, 40 + round3 %ebp, %eax, %ebx, %ecx, %edx, 41 + round3 %edx, %ebp, %eax, %ebx, %ecx, 42 + round3 %ecx, %edx, %ebp, %eax, %ebx, 43 + round3 %ebx, %ecx, %edx, %ebp, %eax, 44 + + round3 %eax, %ebx, %ecx, %edx, %ebp, 45 + round3 %ebp, %eax, %ebx, %ecx, %edx, 46 + round3 %edx, %ebp, %eax, %ebx, %ecx, 47 + round3 %ecx, %edx, %ebp, %eax, %ebx, 48 + round3 %ebx, %ecx, %edx, %ebp, %eax, 49 + + round3 %eax, %ebx, %ecx, %edx, %ebp, 50 + round3 %ebp, %eax, %ebx, %ecx, %edx, 51 + round3 %edx, %ebp, %eax, %ebx, %ecx, 52 + round3 %ecx, %edx, %ebp, %eax, %ebx, 53 + round3 %ebx, %ecx, %edx, %ebp, %eax, 54 + + round3 %eax, %ebx, %ecx, %edx, %ebp, 55 + round3 %ebp, %eax, %ebx, %ecx, %edx, 56 + round3 %edx, %ebp, %eax, %ebx, %ecx, 57 + round3 %ecx, %edx, %ebp, %eax, %ebx, 58 + round3 %ebx, %ecx, %edx, %ebp, %eax, 59 + + round4 %eax, %ebx, %ecx, %edx, %ebp, 60 + round4 %ebp, %eax, %ebx, %ecx, %edx, 61 + round4 %edx, %ebp, %eax, %ebx, %ecx, 62 + round4 %ecx, %edx, %ebp, %eax, %ebx, 63 + round4 %ebx, %ecx, %edx, %ebp, %eax, 64 + + round4 %eax, %ebx, %ecx, %edx, %ebp, 65 + round4 %ebp, %eax, %ebx, %ecx, %edx, 66 + round4 %edx, %ebp, %eax, %ebx, %ecx, 67 + round4 %ecx, %edx, %ebp, %eax, %ebx, 68 + round4 %ebx, %ecx, %edx, %ebp, %eax, 69 + + round4 %eax, %ebx, %ecx, %edx, %ebp, 70 + round4 %ebp, %eax, %ebx, %ecx, %edx, 71 + round4 %edx, %ebp, %eax, %ebx, %ecx, 72 + round4 %ecx, %edx, %ebp, %eax, %ebx, 73 + round4 %ebx, %ecx, %edx, %ebp, %eax, 74 + + round4 %eax, %ebx, %ecx, %edx, %ebp, 75 + round4 %ebp, %eax, %ebx, %ecx, %edx, 76 + round4 %edx, %ebp, %eax, %ebx, %ecx, 77 + round4 %ecx, %edx, %ebp, %eax, %ebx, 78 + round4 %ebx, %ecx, %edx, %ebp, %eax, 79 + + add %r11d, %eax + add %r12d, %ebx + add %r13d, %ecx + 
add %r14d, %edx + add %r15d, %ebp + + add $64, %rsi + cmp %rdi, %rsi + jb .Lloop + +.Lend: add $64+8, %rsp + pop %rdi // SHA1_CTX + mov %eax, (%rdi) + mov %ebx, 4(%rdi) + mov %ecx, 8(%rdi) + mov %edx, 12(%rdi) + mov %ebp, 16(%rdi) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp + ret +END(_libmd_sha1block_scalar) + +/* + * This is the implementation using AVX2, BMI1 and BMI2. It is based on: + * "SHA-1 implementation with Intel(R) AVX2 instruction set extensions" + * From http://software.intel.com/en-us/articles + * (look for improving-the-performance-of-the-secure-hash-algorithm-1) + * This implementation is 2x unrolled, and interleaves vector instructions, + * used to precompute W, with scalar computation of current round + * for optimal scheduling. + */ + + /* trivial helper macros */ +.macro update_hash a, tb, c, d, e + add (%r9), \a + mov \a, (%r9) + add 4(%r9), \tb + mov \tb, 4(%r9) + add 8(%r9), \c + mov \c, 8(%r9) + add 12(%r9), \d + mov \d, 12(%r9) + add 16(%r9), \e + mov \e, 16(%r9) +.endm + + /* help macros for recalc, which does precomputations */ +.macro precalc0 offset + vmovdqu \offset(%r10), %xmm0 +.endm + +.macro precalc1 offset + vinserti128 $1, \offset(%r13), %ymm0, %ymm0 +.endm + +.macro precalc2 yreg + vpshufb %ymm10, %ymm0, \yreg +.endm + +.macro precalc4 yreg, k_offset + vpaddd \k_offset(%r8), \yreg, %ymm0 +.endm + +.macro precalc7 offset + vmovdqu %ymm0, (\offset)*2(%r14) +.endm + +/* + * Message scheduling pre-compute for rounds 0-15 + * r13 is a pointer to the even 64-byte block + * r10 is a pointer to the odd 64-byte block + * r14 is a pointer to the temp buffer + * xmm0 is used as a temp register + * yreg is clobbered as part of the computation + * offset chooses a 16 byte chunk within a block + * r8 is a pointer to the constants block + * k_offset chooses K constants relevant to this round + * xmm10 holds the swap mask + */ +.macro precalc00_15 offset, yreg + precalc0 \offset + precalc1 \offset + precalc2 \yreg + precalc4 \yreg, 0 + precalc7 \offset +.endm + + /* helper macros for precalc16_31 */ +.macro precalc16 reg_sub16, reg_sub12, reg_sub4, reg + vpalignr $8, \reg_sub16, \reg_sub12, \reg // w[i - 14] + vpsrldq $4, \reg_sub4, %ymm0 // w[i - 3] +.endm + +.macro precalc17 reg_sub16, reg_sub8, reg + vpxor \reg_sub8, \reg, \reg + vpxor \reg_sub16, %ymm0, %ymm0 +.endm + +.macro precalc18 reg + vpxor %ymm0, \reg, \reg + vpslldq $12, \reg, %ymm9 +.endm + +.macro precalc19 reg + vpslld $1, \reg, %ymm0 + vpsrld $31, \reg, \reg + .endm + +.macro precalc20 reg + vpor \reg, %ymm0, %ymm0 + vpslld $2, %ymm9, \reg +.endm + +.macro precalc21 reg + vpsrld $30, %ymm9, %ymm9 + vpxor \reg, %ymm0, %ymm0 +.endm + +.macro precalc23 reg, k_offset, offset + vpxor %ymm9, %ymm0, \reg + vpaddd \k_offset(%r8), \reg, %ymm0 + vmovdqu %ymm0, (\offset)(%r14) +.endm + +/* + * Message scheduling pre-compute for rounds 16-31 + * calculating last 32 w[i] values in 8 XMM registers + * pre-calculate K+w[i] values and store to mem + * for later load by ALU add instruction. + * "brute force" vectorization for rounds 16-31 only + * due to w[i]->w[i-3] dependency. 
+ + clobbers 5 input ymm registers REG_SUB* + * uses xmm0 and xmm9 as temp registers + * As always, r8 is a pointer to constants block + * and r14 is a pointer to temp buffer + */ +.macro precalc16_31 reg, reg_sub4, reg_sub8, reg_sub12, reg_sub16, k_offset, offset + precalc16 \reg_sub16, \reg_sub12, \reg_sub4, \reg + precalc17 \reg_sub16, \reg_sub8, \reg + precalc18 \reg + precalc19 \reg + precalc20 \reg + precalc21 \reg + precalc23 \reg, \k_offset, \offset +.endm + + /* helper macros for precalc_32_79 */ +.macro precalc32 reg_sub8, reg_sub4 + vpalignr $8, \reg_sub8, \reg_sub4, %ymm0 +.endm + +.macro precalc33 reg_sub28, reg + vpxor \reg_sub28, \reg, \reg +.endm + +.macro precalc34 reg_sub16 + vpxor \reg_sub16, %ymm0, %ymm0 +.endm + +.macro precalc35 reg + vpxor %ymm0, \reg, \reg +.endm + +.macro precalc36 reg + vpslld $2, \reg, %ymm0 +.endm + +.macro precalc37 reg + vpsrld $30, \reg, \reg + vpor \reg, %ymm0, \reg +.endm + +.macro precalc39 reg, k_offset, offset + vpaddd \k_offset(%r8), \reg, %ymm0 + vmovdqu %ymm0, \offset(%r14) +.endm + +.macro precalc32_79 reg, reg_sub4, reg_sub8, reg_sub16, reg_sub28, k_offset, offset + precalc32 \reg_sub8, \reg_sub4 + precalc33 \reg_sub28, \reg + precalc34 \reg_sub16 + precalc35 \reg + precalc36 \reg + precalc37 \reg + precalc39 \reg, \k_offset, \offset +.endm + +.macro precalc + precalc00_15 0x00, %ymm15 + precalc00_15 0x10, %ymm14 + precalc00_15 0x20, %ymm13 + precalc00_15 0x30, %ymm12 + precalc16_31 %ymm8, %ymm12, %ymm13, %ymm14, %ymm15, 0x00, 0x080 + precalc16_31 %ymm7, %ymm8, %ymm12, %ymm13, %ymm14, 0x20, 0x0a0 + precalc16_31 %ymm5, %ymm7, %ymm8, %ymm12, %ymm13, 0x20, 0x0c0 + precalc16_31 %ymm3, %ymm5, %ymm7, %ymm8, %ymm12, 0x20, 0x0e0 + precalc32_79 %ymm15, %ymm3, %ymm5, %ymm8, %ymm14, 0x20, 0x100 + precalc32_79 %ymm14, %ymm15, %ymm3, %ymm7, %ymm13, 0x20, 0x120 + precalc32_79 %ymm13, %ymm14, %ymm15, %ymm5, %ymm12, 0x40, 0x140 + precalc32_79 %ymm12, %ymm13, %ymm14, %ymm3, %ymm8, 0x40, 0x160 + precalc32_79 %ymm8, %ymm12, %ymm13, %ymm15, %ymm7, 0x40, 0x180 + precalc32_79 %ymm7, %ymm8, %ymm12, %ymm14, %ymm5, 0x40, 0x1a0 + precalc32_79 %ymm5, %ymm7, %ymm8, %ymm13, %ymm3, 0x40, 0x1c0 + precalc32_79 %ymm3, %ymm5, %ymm7, %ymm12, %ymm15, 0x60, 0x1e0 + precalc32_79 %ymm15, %ymm3, %ymm5, %ymm8, %ymm14, 0x60, 0x200 + precalc32_79 %ymm14, %ymm15, %ymm3, %ymm7, %ymm13, 0x60, 0x220 + precalc32_79 %ymm13, %ymm14, %ymm15, %ymm5, %ymm12, 0x60, 0x240 + precalc32_79 %ymm12, %ymm13, %ymm14, %ymm3, %ymm8, 0x60, 0x260 +.endm + +/* + * Macros calculating individual rounds have general form + * calc_round_pre + precalc_round + calc_round_post + * calc_round_{pre,post} macros follow + */ +.macro calc_f1_pre offset, reg_a, reg_b, reg_c, reg_e + add \offset(%r15), \reg_e + andn \reg_c, \reg_a, %ebp + add \reg_b, \reg_e // add F from the previous round + rorx $0x1b, \reg_a, %r12d + rorx $2, \reg_a, \reg_b // for the next round +.endm + +/* + * Calculate F for the next round + */ +.macro calc_f1_post reg_a, reg_b, reg_e + and \reg_b, \reg_a // b & c + xor %ebp, \reg_a // F1 = (b&c) ^ (~b&d) + add %r12d, \reg_e +.endm + +/* + * Registers are cyclically rotated: + * edx -> eax -> edi -> esi -> ebx -> ecx + */ +.macro calc0 + mov %esi, %ebx // precalculate first round + rorx $2, %esi, %esi + andn %eax, %ebx, %ebp + and %edi, %ebx + xor %ebp, %ebx + calc_f1_pre 0x0, %ecx, %ebx, %edi, %edx + precalc0 0x80 + calc_f1_post %ecx, %esi, %edx +.endm + +.macro calc1 + calc_f1_pre 0x4, %edx, %ecx, %esi, %eax + precalc1 0x80 + calc_f1_post %edx, %ebx, %eax +.endm + +.macro calc2 + 
calc_f1_pre 0x8, %eax, %edx, %ebx, %edi + precalc2 %ymm15 + calc_f1_post %eax, %ecx, %edi +.endm + +.macro calc3 + calc_f1_pre 0xc, %edi, %eax, %ecx, %esi + calc_f1_post %edi, %edx, %esi +.endm + +.macro calc4 + calc_f1_pre 0x20, %esi, %edi, %edx, %ebx + precalc4 %ymm15, 0x0 + calc_f1_post %esi, %eax, %ebx +.endm + +.macro calc5 + calc_f1_pre 0x24, %ebx, %esi, %eax, %ecx + calc_f1_post %ebx, %edi, %ecx +.endm + +.macro calc6 + calc_f1_pre 0x28, %ecx, %ebx, %edi, %edx + calc_f1_post %ecx, %esi, %edx +.endm + +.macro calc7 + calc_f1_pre 0x2c, %edx, %ecx, %esi, %eax + precalc7 0x0 + calc_f1_post %edx, %ebx, %eax +.endm + +.macro calc8 + calc_f1_pre 0x40, %eax, %edx, %ebx, %edi + precalc0 0x90 + calc_f1_post %eax, %ecx, %edi +.endm + +.macro calc9 + calc_f1_pre 0x44, %edi, %eax, %ecx, %esi + precalc1 0x90 + calc_f1_post %edi, %edx, %esi +.endm + +.macro calc10 + calc_f1_pre 0x48, %esi, %edi, %edx, %ebx + precalc2 %ymm14 + calc_f1_post %esi, %eax, %ebx +.endm + +.macro calc11 + calc_f1_pre 0x4c, %ebx, %esi, %eax, %ecx + calc_f1_post %ebx, %edi, %ecx +.endm + +.macro calc12 + calc_f1_pre 0x60, %ecx, %ebx, %edi, %edx + precalc4 %ymm14, 0 + calc_f1_post %ecx, %esi, %edx +.endm + +.macro calc13 + calc_f1_pre 0x64, %edx, %ecx, %esi, %eax + calc_f1_post %edx, %ebx, %eax +.endm + +.macro calc14 + calc_f1_pre 0x68, %eax, %edx, %ebx, %edi + calc_f1_post %eax, %ecx, %edi +.endm + +.macro calc15 + calc_f1_pre 0x6c, %edi, %eax, %ecx, %esi + precalc7 0x10 + calc_f1_post %edi, %edx, %esi +.endm + +.macro calc16 + calc_f1_pre 0x80, %esi, %edi, %edx, %ebx + precalc0 0xa0 + calc_f1_post %esi, %eax, %ebx +.endm + +.macro calc17 + calc_f1_pre 0x84, %ebx, %esi, %eax, %ecx + precalc1 0xa0 + calc_f1_post %ebx, %edi, %ecx +.endm + +.macro calc18 + calc_f1_pre 0x88, %ecx, %ebx, %edi, %edx + precalc2 %ymm13 + calc_f1_post %ecx, %esi, %edx +.endm + +.macro calc_f2_pre offset, reg_a, reg_b, reg_e + add \offset(%r15), \reg_e + add \reg_b, \reg_e // add F from the previous round + rorx $0x1b, \reg_a, %r12d + rorx $2, \reg_a, \reg_b // for next round +.endm + +.macro calc_f2_post reg_a, reg_b, reg_c, reg_e + xor \reg_b, \reg_a + add %r12d, \reg_e + xor \reg_c, \reg_a +.endm + +.macro calc19 + calc_f2_pre 0x8c, %edx, %ecx, %eax + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc20 + calc_f2_pre 0xa0, %eax, %edx, %edi + precalc4 %ymm13, 0x0 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc21 + calc_f2_pre 0xa4, %edi, %eax, %esi + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc22 + calc_f2_pre 0xa8, %esi, %edi, %ebx + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc23 + calc_f2_pre 0xac, %ebx, %esi, %ecx + precalc7 0x20 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc24 + calc_f2_pre 0xc0, %ecx, %ebx, %edx + precalc0 0xb0 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc25 + calc_f2_pre 0xc4, %edx, %ecx, %eax + precalc1 0xb0 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc26 + calc_f2_pre 0xc8, %eax, %edx, %edi + precalc2 %ymm12 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc27 + calc_f2_pre 0xcc, %edi, %eax, %esi + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc28 + calc_f2_pre 0xe0, %esi, %edi, %ebx + precalc4 %ymm12, 0x0 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc29 + calc_f2_pre 0xe4, %ebx, %esi, %ecx + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc30 + calc_f2_pre 0xe8, %ecx, %ebx, %edx + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc31 + calc_f2_pre 0xec, %edx, %ecx, %eax + precalc7 0x30 + 
calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc32 + calc_f2_pre 0x100, %eax, %edx, %edi + precalc16 %ymm15, %ymm14, %ymm12, %ymm8 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc33 + calc_f2_pre 0x104, %edi, %eax, %esi + precalc17 %ymm15, %ymm13, %ymm8 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc34 + calc_f2_pre 0x108, %esi, %edi, %ebx + precalc18 %ymm8 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc35 + calc_f2_pre 0x10c, %ebx, %esi, %ecx + precalc19 %ymm8 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc36 + calc_f2_pre 0x120, %ecx, %ebx, %edx + precalc20 %ymm8 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc37 + calc_f2_pre 0x124, %edx, %ecx, %eax + precalc21 %ymm8 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc38 + calc_f2_pre 0x128, %eax, %edx, %edi + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc_f3_pre offset, reg_e + add \offset(%r15), \reg_e +.endm + +.macro calc_f3_post reg_a, reg_b, reg_c, reg_e, reg_tb + add \reg_tb, \reg_e // add F from the previous round + mov \reg_b, %ebp + or \reg_a, %ebp + rorx $0x1b, \reg_a, %r12d + rorx $2, \reg_a, \reg_tb + and \reg_c, %ebp // calculate F for the next round + and \reg_b, \reg_a + or %ebp, \reg_a + add %r12d, \reg_e +.endm + +.macro calc39 + calc_f3_pre 0x12c, %esi + precalc23 %ymm8, 0x0, 0x80 + calc_f3_post %edi, %edx, %ecx, %esi, %eax +.endm + +.macro calc40 + calc_f3_pre 0x140, %ebx + precalc16 %ymm14, %ymm13, %ymm8, %ymm7 + calc_f3_post %esi, %eax, %edx, %ebx, %edi +.endm + +.macro calc41 + calc_f3_pre 0x144, %ecx + precalc17 %ymm14, %ymm12, %ymm7 + calc_f3_post %ebx, %edi, %eax, %ecx, %esi +.endm + +.macro calc42 + calc_f3_pre 0x148, %edx + precalc18 %ymm7 + calc_f3_post %ecx, %esi, %edi, %edx, %ebx +.endm + +.macro calc43 + calc_f3_pre 0x14c, %eax + precalc19 %ymm7 + calc_f3_post %edx, %ebx, %esi, %eax, %ecx +.endm + +.macro calc44 + calc_f3_pre 0x160, %edi + precalc20 %ymm7 + calc_f3_post %eax, %ecx, %ebx, %edi, %edx +.endm + +.macro calc45 + calc_f3_pre 0x164, %esi + precalc21 %ymm7 + calc_f3_post %edi, %edx, %ecx, %esi, %eax +.endm + +.macro calc46 + calc_f3_pre 0x168, %ebx + calc_f3_post %esi, %eax, %edx, %ebx, %edi +.endm + +.macro calc47 + calc_f3_pre 0x16c, %ecx + vpxor %ymm9, %ymm0, %ymm7 + vpaddd 0x20(%r8), %ymm7, %ymm0 + vmovdqu %ymm0, 0xa0(%r14) + calc_f3_post %ebx, %edi, %eax, %ecx, %esi +.endm + +.macro calc48 + calc_f3_pre 0x180, %edx + precalc16 %ymm13, %ymm12, %ymm7, %ymm5 + calc_f3_post %ecx, %esi, %edi, %edx, %ebx +.endm + +.macro calc49 + calc_f3_pre 0x184, %eax + precalc17 %ymm13, %ymm8, %ymm5 + calc_f3_post %edx, %ebx, %esi, %eax, %ecx +.endm + +.macro calc50 + calc_f3_pre 0x188, %edi + precalc18 %ymm5 + calc_f3_post %eax, %ecx, %ebx, %edi, %edx +.endm + +.macro calc51 + calc_f3_pre 0x18c, %esi + precalc19 %ymm5 + calc_f3_post %edi, %edx, %ecx, %esi, %eax +.endm + +.macro calc52 + calc_f3_pre 0x1a0, %ebx + precalc20 %ymm5 + calc_f3_post %esi, %eax, %edx, %ebx, %edi +.endm + +.macro calc53 + calc_f3_pre 0x1a4, %ecx + precalc21 %ymm5 + calc_f3_post %ebx, %edi, %eax, %ecx, %esi +.endm + +.macro calc54 + calc_f3_pre 0x1a8, %edx + calc_f3_post %ecx, %esi, %edi, %edx, %ebx +.endm + +.macro calc55 + calc_f3_pre 0x1ac, %eax + precalc23 %ymm5, 0x20, 0xc0 + calc_f3_post %edx, %ebx, %esi, %eax, %ecx +.endm + +.macro calc56 + calc_f3_pre 0x1c0, %edi + precalc16 %ymm12, %ymm8, %ymm5, %ymm3 + calc_f3_post %eax, %ecx, %ebx, %edi, %edx +.endm + +.macro calc57 + calc_f3_pre 0x1c4, %esi + precalc17 %ymm12, %ymm7, %ymm3 + calc_f3_post 
%edi, %edx, %ecx, %esi, %eax +.endm + +.macro calc58 + calc_f3_pre 0x1c8, %ebx + precalc18 %ymm3 + calc_f3_post %esi, %eax, %edx, %ebx, %edi +.endm + +.macro calc59 + calc_f2_pre 0x1cc, %ebx, %esi, %ecx + precalc19 %ymm3 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc60 + calc_f2_pre 0x1e0, %ecx, %ebx, %edx + precalc20 %ymm3 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc61 + calc_f2_pre 0x1e4, %edx, %ecx, %eax + precalc21 %ymm3 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc62 + calc_f2_pre 0x1e8, %eax, %edx, %edi + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc63 + calc_f2_pre 0x1ec, %edi, %eax, %esi + precalc23 %ymm3, 0x20, 0xe0 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc64 + calc_f2_pre 0x200, %esi, %edi, %ebx + precalc32 %ymm5, %ymm3 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc65 + calc_f2_pre 0x204, %ebx, %esi, %ecx + precalc33 %ymm14, %ymm15 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc66 + calc_f2_pre 0x208, %ecx, %ebx, %edx + precalc34 %ymm8 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc67 + calc_f2_pre 0x20c, %edx, %ecx, %eax + precalc35 %ymm15 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc68 + calc_f2_pre 0x220, %eax, %edx, %edi + precalc36 %ymm15 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc69 + calc_f2_pre 0x224, %edi, %eax, %esi + precalc37 %ymm15 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc70 + calc_f2_pre 0x228, %esi, %edi, %ebx + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc71 + calc_f2_pre 0x22c, %ebx, %esi, %ecx + precalc39 %ymm15, 0x20, 0x100 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc72 + calc_f2_pre 0x240, %ecx, %ebx, %edx + precalc32 %ymm3, %ymm15 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc73 + calc_f2_pre 0x244, %edx, %ecx, %eax + precalc33 %ymm13, %ymm14 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc74 + calc_f2_pre 0x248, %eax, %edx, %edi + precalc34 %ymm7 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc75 + calc_f2_pre 0x24c, %edi, %eax, %esi + precalc35 %ymm14 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc76 + calc_f2_pre 0x260, %esi, %edi, %ebx + precalc36 %ymm14 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc77 + calc_f2_pre 0x264, %ebx, %esi, %ecx + precalc37 %ymm14 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc78 + calc_f2_pre 0x268, %ecx, %ebx, %edx + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc79 + add 0x26c(%r15), %eax + add %ecx, %eax + rorx $0x1b, %edx, %r12d + precalc39 %ymm14, 0x20, 0x120 + add %r12d, %eax +.endm + +/* + * Similar to calc0 + */ +.macro calc80 + mov %ecx, %edx // precalculate first round + rorx $2, %ecx, %ecx + andn %esi, %edx, %ebp + and %ebx, %edx + xor %ebp, %edx + calc_f1_pre 0x10, %eax, %edx, %ebx, %edi + precalc32 %ymm15, %ymm14 + calc_f1_post %eax, %ecx, %edi +.endm + +.macro calc81 + calc_f1_pre 0x14, %edi, %eax, %ecx, %esi + precalc33 %ymm12, %ymm13 + calc_f1_post %edi, %edx, %esi +.endm + +.macro calc82 + calc_f1_pre 0x18, %esi, %edi, %edx, %ebx + precalc34 %ymm5 + calc_f1_post %esi, %eax, %ebx +.endm + +.macro calc83 + calc_f1_pre 0x1c, %ebx, %esi, %eax, %ecx + precalc35 %ymm13 + calc_f1_post %ebx, %edi, %ecx +.endm + +.macro calc84 + calc_f1_pre 0x30, %ecx, %ebx, %edi, %edx + precalc36 %ymm13 + calc_f1_post %ecx, %esi, %edx +.endm + +.macro calc85 + calc_f1_pre 0x34, %edx, %ecx, %esi, %eax + precalc37 %ymm13 + calc_f1_post %edx, %ebx, %eax +.endm + +.macro calc86 
+ calc_f1_pre 0x38, %eax, %edx, %ebx, %edi + calc_f1_post %eax, %ecx, %edi +.endm + +.macro calc87 + calc_f1_pre 0x3c, %edi, %eax, %ecx, %esi + precalc39 %ymm13, 0x40, 0x140 + calc_f1_post %edi, %edx, %esi +.endm + +.macro calc88 + calc_f1_pre 0x50, %esi, %edi, %edx, %ebx + precalc32 %ymm14, %ymm13 + calc_f1_post %esi, %eax, %ebx +.endm + +.macro calc89 + calc_f1_pre 0x54, %ebx, %esi, %eax, %ecx + precalc33 %ymm8, %ymm12 + calc_f1_post %ebx, %edi, %ecx +.endm + +.macro calc90 + calc_f1_pre 0x58, %ecx, %ebx, %edi, %edx + precalc34 %ymm3 + calc_f1_post %ecx, %esi, %edx +.endm + +.macro calc91 + calc_f1_pre 0x5c, %edx, %ecx, %esi, %eax + precalc35 %ymm12 + calc_f1_post %edx, %ebx, %eax +.endm + +.macro calc92 + calc_f1_pre 0x70, %eax, %edx, %ebx, %edi + precalc36 %ymm12 + calc_f1_post %eax, %ecx, %edi +.endm + +.macro calc93 + calc_f1_pre 0x74, %edi, %eax, %ecx, %esi + precalc37 %ymm12 + calc_f1_post %edi, %edx, %esi +.endm + +.macro calc94 + calc_f1_pre 0x78, %esi, %edi, %edx, %ebx + calc_f1_post %esi, %eax, %ebx +.endm + +.macro calc95 + calc_f1_pre 0x7c, %ebx, %esi, %eax, %ecx + precalc39 %ymm12, 0x40, 0x160 + calc_f1_post %ebx, %edi, %ecx +.endm + +.macro calc96 + calc_f1_pre 0x90, %ecx, %ebx, %edi, %edx + precalc32 %ymm13, %ymm12 + calc_f1_post %ecx, %esi, %edx +.endm + +.macro calc97 + calc_f1_pre 0x94, %edx, %ecx, %esi, %eax + precalc33 %ymm7, %ymm8 + calc_f1_post %edx, %ebx, %eax +.endm + +.macro calc98 + calc_f1_pre 0x98, %eax, %edx, %ebx, %edi + precalc34 %ymm15 + calc_f1_post %eax, %ecx, %edi +.endm + +.macro calc99 + calc_f2_pre 0x9c, %edi, %eax, %esi + precalc35 %ymm8 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc100 + calc_f2_pre 0xb0, %esi, %edi, %ebx + precalc36 %ymm8 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc101 + calc_f2_pre 0xb4, %ebx, %esi, %ecx + precalc37 %ymm8 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc102 + calc_f2_pre 0xb8, %ecx, %ebx, %edx + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc103 + calc_f2_pre 0xbc, %edx, %ecx, %eax + precalc39 %ymm8, 0x40, 0x180 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc104 + calc_f2_pre 0xd0, %eax, %edx, %edi + precalc32 %ymm12, %ymm8 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc105 + calc_f2_pre 0xd4, %edi, %eax, %esi + precalc33 %ymm5, %ymm7 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc106 + calc_f2_pre 0xd8, %esi, %edi, %ebx + precalc34 %ymm14 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc107 + calc_f2_pre 0xdc, %ebx, %esi, %ecx + precalc35 %ymm7 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc108 + calc_f2_pre 0xf0, %ecx, %ebx, %edx + precalc36 %ymm7 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc109 + calc_f2_pre 0xf4, %edx, %ecx, %eax + precalc37 %ymm7 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc110 + calc_f2_pre 0xf8, %eax, %edx, %edi + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc111 + calc_f2_pre 0xfc, %edi, %eax, %esi + precalc39 %ymm7, 0x40, 0x1a0 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc112 + calc_f2_pre 0x110, %esi, %edi, %ebx + precalc32 %ymm8, %ymm7 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc113 + calc_f2_pre 0x114, %ebx, %esi, %ecx + precalc33 %ymm3, %ymm5 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc114 + calc_f2_pre 0x118, %ecx, %ebx, %edx + precalc34 %ymm13 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc115 + calc_f2_pre 0x11c, %edx, %ecx, %eax + precalc35 %ymm5 + calc_f2_post %edx, %ebx, %esi, 
%eax +.endm + +.macro calc116 + calc_f2_pre 0x130, %eax, %edx, %edi + precalc36 %ymm5 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc117 + calc_f2_pre 0x134, %edi, %eax, %esi + precalc37 %ymm5 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc118 + calc_f2_pre 0x138, %esi, %edi, %ebx + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc119 + calc_f3_pre 0x13c, %ecx + precalc39 %ymm5, 0x40, 0x1c0 + calc_f3_post %ebx, %edi, %eax, %ecx, %esi +.endm + +.macro calc120 + calc_f3_pre 0x150, %edx + precalc32 %ymm7, %ymm5 + calc_f3_post %ecx, %esi, %edi, %edx, %ebx +.endm + +.macro calc121 + calc_f3_pre 0x154, %eax + precalc33 %ymm15, %ymm3 + calc_f3_post %edx, %ebx, %esi, %eax, %ecx +.endm + +.macro calc122 + calc_f3_pre 0x158, %edi + precalc34 %ymm12 + calc_f3_post %eax, %ecx, %ebx, %edi, %edx +.endm + +.macro calc123 + calc_f3_pre 0x15c, %esi + precalc35 %ymm3 + calc_f3_post %edi, %edx, %ecx, %esi, %eax +.endm + +.macro calc124 + calc_f3_pre 0x170, %ebx + precalc36 %ymm3 + calc_f3_post %esi, %eax, %edx, %ebx, %edi +.endm + +.macro calc125 + calc_f3_pre 0x174, %ecx + precalc37 %ymm3 + calc_f3_post %ebx, %edi, %eax, %ecx, %esi +.endm + +.macro calc126 + calc_f3_pre 0x178, %edx + calc_f3_post %ecx, %esi, %edi, %edx, %ebx +.endm + +.macro calc127 + calc_f3_pre 0x17c, %eax + precalc39 %ymm3, 0x60, 0x1e0 + calc_f3_post %edx, %ebx, %esi, %eax, %ecx +.endm + +.macro calc128 + calc_f3_pre 0x190, %edi + precalc32 %ymm5, %ymm3 + calc_f3_post %eax, %ecx, %ebx, %edi, %edx +.endm + +.macro calc129 + calc_f3_pre 0x194, %esi + precalc33 %ymm14, %ymm15 + calc_f3_post %edi, %edx, %ecx, %esi, %eax +.endm + +.macro calc130 + calc_f3_pre 0x198, %ebx + precalc34 %ymm8 + calc_f3_post %esi, %eax, %edx, %ebx, %edi +.endm + +.macro calc131 + calc_f3_pre 0x19c, %ecx + precalc35 %ymm15 + calc_f3_post %ebx, %edi, %eax, %ecx, %esi +.endm + +.macro calc132 + calc_f3_pre 0x1b0, %edx + precalc36 %ymm15 + calc_f3_post %ecx, %esi, %edi, %edx, %ebx +.endm + +.macro calc133 + calc_f3_pre 0x1b4, %eax + precalc37 %ymm15 + calc_f3_post %edx, %ebx, %esi, %eax, %ecx +.endm + +.macro calc134 + calc_f3_pre 0x1b8, %edi + calc_f3_post %eax, %ecx, %ebx, %edi, %edx +.endm + +.macro calc135 + calc_f3_pre 0x1bc, %esi + precalc39 %ymm15, 0x60, 0x200 + calc_f3_post %edi, %edx, %ecx, %esi, %eax +.endm + +.macro calc136 + calc_f3_pre 0x1d0, %ebx + precalc32 %ymm3, %ymm15 + calc_f3_post %esi, %eax, %edx, %ebx, %edi +.endm + +.macro calc137 + calc_f3_pre 0x1d4, %ecx + precalc33 %ymm13, %ymm14 + calc_f3_post %ebx, %edi, %eax, %ecx, %esi +.endm + +.macro calc138 + calc_f3_pre 0x1d8, %edx + precalc34 %ymm7 + calc_f3_post %ecx, %esi, %edi, %edx, %ebx +.endm + +.macro calc139 + calc_f2_pre 0x1dc, %edx, %ecx, %eax + precalc35 %ymm14 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc140 + calc_f2_pre 0x1f0, %eax, %edx, %edi + precalc36 %ymm14 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc141 + calc_f2_pre 0x1f4, %edi, %eax, %esi + precalc37 %ymm14 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc142 + calc_f2_pre 0x1f8, %esi, %edi, %ebx + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc143 + calc_f2_pre 0x1fc, %ebx, %esi, %ecx + precalc39 %ymm14, 0x60, 0x220 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc144 + calc_f2_pre 0x210, %ecx, %ebx, %edx + precalc32 %ymm15, %ymm14 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc145 + calc_f2_pre 0x214, %edx, %ecx, %eax + precalc33 %ymm12, %ymm13 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc146 + calc_f2_pre 
0x218, %eax, %edx, %edi + precalc34 %ymm5 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc147 + calc_f2_pre 0x21c, %edi, %eax, %esi + precalc35 %ymm13 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc148 + calc_f2_pre 0x230, %esi, %edi, %ebx + precalc36 %ymm13 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc149 + calc_f2_pre 0x234, %ebx, %esi, %ecx + precalc37 %ymm13 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc150 + calc_f2_pre 0x238, %ecx, %ebx, %edx + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc151 + calc_f2_pre 0x23c, %edx, %ecx, %eax + precalc39 %ymm13, 0x60, 0x240 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc152 + calc_f2_pre 0x250, %eax, %edx, %edi + precalc32 %ymm14, %ymm13 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc153 + calc_f2_pre 0x254, %edi, %eax, %esi + precalc33 %ymm8, %ymm12 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc154 + calc_f2_pre 0x258, %esi, %edi, %ebx + precalc34 %ymm3 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc155 + calc_f2_pre 0x25c, %ebx, %esi, %ecx + precalc35 %ymm12 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc156 + calc_f2_pre 0x270, %ecx, %ebx, %edx + precalc36 %ymm12 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc157 + calc_f2_pre 0x274, %edx, %ecx, %eax + precalc37 %ymm12 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc158 + calc_f2_pre 0x278, %eax, %edx, %edi + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc159 + add 0x27c(%r15), %esi + add %eax, %esi + rorx $0x1b, %edi, %r12d + precalc39 %ymm12, 0x60, 0x260 + add %r12d, %esi +.endm + + // sha1block(SHA1_CTX, buf, len) +ENTRY(_libmd_sha1block_avx2) + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + sub $1408+8, %rsp + + and $~63, %rdx + lea k_xmm_ar(%rip), %r8 + mov %rdi, %r9 + mov %rsi, %r10 + lea 64(%rsi), %r13 + lea 64(%rsi, %rdx), %r11 + cmp %r11, %r13 + cmovae %r8, %r13 + vmovdqu bswap_shufb_ctl(%rip), %ymm10 + + mov (%r9), %ecx + mov 4(%r9), %esi + mov 8(%r9), %edi + mov 12(%r9), %eax + mov 16(%r9), %edx + mov %rsp, %r14 + lea 2*4*80+32(%rsp), %r15 + precalc // precalc WK for first 2 blocks + xchg %r14, %r15 + + // this is unrolled +.Loop: cmp %r8, %r10 // we use the value of R8 (set below) + // as a signal of the last block + jne .Lbegin + add $1408+8, %rsp + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + vzeroupper + ret + +.Lbegin: + calc0 + calc1 + calc2 + calc3 + calc4 + calc5 + calc6 + calc7 + calc8 + calc9 + calc10 + calc11 + calc12 + calc13 + calc14 + calc15 + calc16 + calc17 + calc18 + calc19 + calc20 + calc21 + calc22 + calc23 + calc24 + calc25 + calc26 + calc27 + calc28 + calc29 + calc30 + calc31 + calc32 + calc33 + calc34 + calc35 + calc36 + calc37 + calc38 + calc39 + calc40 + calc41 + calc42 + calc43 + calc44 + calc45 + calc46 + calc47 + calc48 + calc49 + calc50 + calc51 + calc52 + calc53 + calc54 + calc55 + calc56 + calc57 + calc58 + calc59 + + add $128, %r10 // move to the next even-64-byte block + cmp %r11, %r10 // is the current block the last one? + cmovae %r8, %r10 // signal the last iteration smartly + + calc60 + calc61 + calc62 + calc63 + calc64 + calc65 + calc66 + calc67 + calc68 + calc69 + calc70 + calc71 + calc72 + calc73 + calc74 + calc75 + calc76 + calc77 + calc78 + calc79 + + update_hash %eax, %edx, %ebx, %esi, %edi + cmp %r8, %r10 // is the current block the last one? 
+ je .Loop + mov %edx, %ecx + + calc80 + calc81 + calc82 + calc83 + calc84 + calc85 + calc86 + calc87 + calc88 + calc89 + calc90 + calc91 + calc92 + calc93 + calc94 + calc95 + calc96 + calc97 + calc98 + calc99 + calc100 + calc101 + calc102 + calc103 + calc104 + calc105 + calc106 + calc107 + calc108 + calc109 + calc110 + calc111 + calc112 + calc113 + calc114 + calc115 + calc116 + calc117 + calc118 + calc119 + calc120 + calc121 + calc122 + calc123 + calc124 + calc125 + calc126 + calc127 + calc128 + calc129 + calc130 + calc131 + calc132 + calc133 + calc134 + calc135 + calc136 + calc137 + calc138 + calc139 + + add $128, %r13 // move to the next even-64-byte block + cmp %r11, %r13 // is the current block the last one? + cmovae %r8, %r10 + + calc140 + calc141 + calc142 + calc143 + calc144 + calc145 + calc146 + calc147 + calc148 + calc149 + calc150 + calc151 + calc152 + calc153 + calc154 + calc155 + calc156 + calc157 + calc158 + calc159 + + update_hash %esi, %edi, %edx, %ecx, %ebx + mov %esi, %r12d // reset state for AVX2 reg permutation + mov %edi, %esi + mov %edx, %edi + mov %ebx, %edx + mov %ecx, %eax + mov %r12d, %ecx + xchg %r14, %r15 + jmp .Loop +END(_libmd_sha1block_avx2) + + .section .rodata + .balign 32 +k_xmm_ar: + .fill 8, 4, 0x5a827999 + .fill 8, 4, 0x6ed9eba1 + .fill 8, 4, 0x8f1bbcdc + .fill 8, 4, 0xca62c1d6 + .size k_xmm_ar, .-k_xmm_ar + +bswap_shufb_ctl: + .4byte 0x00010203 + .4byte 0x04050607 + .4byte 0x08090a0b + .4byte 0x0c0d0e0f + .4byte 0x00010203 + .4byte 0x04050607 + .4byte 0x08090a0b + .4byte 0x0c0d0e0f + .size bswap_shufb_ctl, .-bswap_shufb_ctl + + /* + * SHA1 implementation using the Intel SHA extensions (SHANI). + * + * Imlemented according to the Intel white paper + * + * S. Gulley, V. Gopal, K. Yap, W. Feghali, J. Guilford, + * G. Wolrich: "Intel SHA Extensions: new instruction supporting + * the Secure Hash Algorithm on IntelĀ® architecture processors", + * July 2013. + */ + // sha1block(SHA1_CTX, buf, len) +ENTRY(_libmd_sha1block_shani) + and $~63, %rdx // round length to block-size multiple + lea (%rsi, %rdx, 1), %rcx // end pointer + test %rdx, %rdx // nothing to do? 
+ je 1f // if so, terminate immediately + + movdqu (%rdi), %xmm6 // h0, h1, h2, h3 + pxor %xmm7, %xmm7 + pshufd $0x1b, %xmm6, %xmm6 // h3, h2, h1, h0 + pinsrd $3, 16(%rdi), %xmm7 // h4 in the highest word of xmm7 + movdqu shuf_mask(%rip), %xmm4 + + // main loop +0: movdqa %xmm6, %xmm8 // stash ABCD + movdqa %xmm7, %xmm9 // stash E + + // rounds 0--3 + movdqu 0*16(%rsi), %xmm0 // load first message block + pshufb %xmm4, %xmm0 // and byte-swap + paddd %xmm0, %xmm7 // E += w[0] + movdqa %xmm6, %xmm5 // E' = A + sha1rnds4 $0, %xmm7, %xmm6 // perform rounds 0--3 + + // rounds 4--7 + movdqu 1*16(%rsi), %xmm1 + pshufb %xmm4, %xmm1 + sha1nexte %xmm1, %xmm5 + movdqa %xmm6, %xmm7 + sha1rnds4 $0, %xmm5, %xmm6 + sha1msg1 %xmm1, %xmm0 + + // rounds 8--11 + movdqu 2*16(%rsi), %xmm2 + pshufb %xmm4, %xmm2 + sha1nexte %xmm2, %xmm7 + movdqa %xmm6, %xmm5 + sha1rnds4 $0, %xmm7, %xmm6 + sha1msg1 %xmm2, %xmm1 + pxor %xmm2, %xmm0 + +.macro midround msg3, msg0, msg1, msg2, e1, e0, k + sha1nexte \msg3, \e1 + movdqa %xmm6, \e0 + sha1msg2 \msg3, \msg0 + sha1rnds4 $\k, \e1, %xmm6 + sha1msg1 \msg3, \msg2 + pxor \msg3, \msg1 +.endm + + movdqu 3*16(%rsi), %xmm3 // load third message block + pshufb %xmm4, %xmm3 + + add $4*16, %rsi + + midround %xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 0 // 12--15 + midround %xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 0 // 16--19 + midround %xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 1 // 20--23 + midround %xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 1 // 24--27 + midround %xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 1 // 28--31 + midround %xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 1 // 32--35 + midround %xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 1 // 36--39 + midround %xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 2 // 40--43 + midround %xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 2 // 44--47 + midround %xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 2 // 48--51 + midround %xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 2 // 52--55 + midround %xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 2 // 56--59 + midround %xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 3 // 60--63 + midround %xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 3 // 64--67 + + // rounds 68--71 + sha1nexte %xmm1, %xmm5 + movdqa %xmm6, %xmm7 + sha1msg2 %xmm1, %xmm2 + sha1rnds4 $3, %xmm5, %xmm6 + pxor %xmm1, %xmm3 + + // rounds 72--75 + sha1nexte %xmm2, %xmm7 + movdqa %xmm6, %xmm5 + sha1msg2 %xmm2, %xmm3 + sha1rnds4 $3, %xmm7, %xmm6 + + // rounds 76--79 + sha1nexte %xmm3, %xmm5 + movdqa %xmm6, %xmm7 + sha1rnds4 $3, %xmm5, %xmm6 + + sha1nexte %xmm9, %xmm7 // add saved E + paddd %xmm8, %xmm6 // add saved ABCD + + cmp %rsi, %rcx // end reached? + jne 0b + + pshufd $0x1b, %xmm6, %xmm6 // restore order of h0--h3 + movdqu %xmm6, (%rdi) // write h0--h3 + pextrd $3, %xmm7, 16(%rdi) // write h4 +1: ret +END(_libmd_sha1block_shani) + + .section .rodata + .balign 16 +shuf_mask: + .8byte 0x08090a0b0c0d0e0f + .8byte 0x0001020304050607 + .size shuf_mask, .-shuf_mask + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libmd/amd64/sha1dispatch.c b/lib/libmd/amd64/sha1dispatch.c new file mode 100644 index 000000000000..86509195d56e --- /dev/null +++ b/lib/libmd/amd64/sha1dispatch.c @@ -0,0 +1,77 @@ +/*- + * Copyright (c) 2016 The Go Authors. All rights reserved. + * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org> + * + * Adapted from Go's crypto/sha1/sha1block_amd64.go. 
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *   * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <machine/specialreg.h>
+#include <sha.h>
+#include <x86/ifunc.h>
+
+extern void _libmd_sha1block_scalar(SHA1_CTX *, const void *, size_t);
+extern void _libmd_sha1block_avx2(SHA1_CTX *, const void *, size_t);
+extern void _libmd_sha1block_shani(SHA1_CTX *, const void *, size_t);
+static void sha1block_avx2_wrapper(SHA1_CTX *, const void *, size_t);
+
+#define AVX2_STDEXT_NEEDED \
+        (CPUID_STDEXT_BMI1 | CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2)
+
+DEFINE_UIFUNC(, void, sha1_block, (SHA1_CTX *, const void *, size_t))
+{
+        if (cpu_stdext_feature & CPUID_STDEXT_SHA)
+                return (_libmd_sha1block_shani);
+        if ((cpu_stdext_feature & AVX2_STDEXT_NEEDED) == AVX2_STDEXT_NEEDED)
+                return (sha1block_avx2_wrapper);
+        else
+                return (_libmd_sha1block_scalar);
+}
+
+static void
+sha1block_avx2_wrapper(SHA1_CTX *c, const void *data, size_t len)
+{
+        if (len >= 256) {
+                /*
+                 * sha1block_avx2 calculates SHA-1 for two blocks per
+                 * iteration.  It also interleaves the precalculation for
+                 * the next block, so it may read up to 192 bytes past the
+                 * end of its input.  We could add checks inside
+                 * sha1block_avx2, but that would just turn it into a copy
+                 * of sha1block_scalar, so call the scalar code directly
+                 * for the tail instead.
+                 */
+                size_t safe_len = len - 128;
+
+                if (safe_len % 128 != 0)
+                        safe_len -= 64;
+
+                _libmd_sha1block_avx2(c, data, safe_len);
+                _libmd_sha1block_scalar(c, data + safe_len, len - safe_len);
+        } else
+                _libmd_sha1block_scalar(c, data, len);
+}
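
The header comment in sha1block.S above describes the 80-round structure and refers to sha1c.c for the C equivalent. As a reading aid only -- this is a minimal sketch, not the libmd sha1c.c code -- the same structure in plain C, with comments mapping back to the round1/round2/round3/round4, func* and mix macros. The assembly keeps the schedule as a 16-word circular buffer (the (index)&0xf addressing) and rotates the macro arguments instead of moving registers; the sketch uses the straightforward 80-word form.

#include <stdint.h>

static uint32_t
rol32(uint32_t x, int n)
{
        return ((x << n) | (x >> (32 - n)));
}

/* Compress one 64-byte block into the state h[0..4]. */
void
sha1_compress_sketch(uint32_t h[5], const unsigned char blk[64])
{
        uint32_t w[80], a, b, c, d, e, f, k, tmp;
        int i;

        for (i = 0; i < 16; i++)        /* rounds 0-15: load + bswap (load macro) */
                w[i] = (uint32_t)blk[4 * i] << 24 | blk[4 * i + 1] << 16 |
                    blk[4 * i + 2] << 8 | blk[4 * i + 3];
        for (; i < 80; i++)             /* rounds 16-79: message schedule (shuffle macro) */
                w[i] = rol32(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1);

        a = h[0]; b = h[1]; c = h[2]; d = h[3]; e = h[4];
        for (i = 0; i < 80; i++) {
                if (i < 20) {           /* func1: (b & c) | (~b & d) */
                        f = (b & c) | (~b & d);
                        k = 0x5a827999;
                } else if (i < 40) {    /* func2 */
                        f = b ^ c ^ d;
                        k = 0x6ed9eba1;
                } else if (i < 60) {    /* func3: majority, written ((b|c)&d)|(b&c) in the asm */
                        f = (b & c) | (b & d) | (c & d);
                        k = 0x8f1bbcdc;
                } else {                /* func4 is the same function as func2 */
                        f = b ^ c ^ d;
                        k = 0xca62c1d6;
                }
                /* mix: the asm avoids these five moves by rotating macro arguments */
                tmp = rol32(a, 5) + f + e + k + w[i];
                e = d; d = c; c = rol32(b, 30); b = a; a = tmp;
        }
        h[0] += a; h[1] += b; h[2] += c; h[3] += d; h[4] += e;
}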
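The safe_len arithmetic in sha1block_avx2_wrapper can be checked in isolation: the callers have already rounded len to a multiple of 64, the AVX2 part must be a multiple of 128 (two blocks per iteration), and at least two blocks must be left for the scalar tail because the AVX2 loop may read up to 192 bytes past the region it is asked to hash. A throwaway program (hypothetical, not part of this change) that prints the split:

#include <stddef.h>
#include <stdio.h>

int
main(void)
{
        size_t lens[] = { 256, 320, 384, 448 };
        size_t i, len, safe_len;

        for (i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) {
                len = lens[i];
                safe_len = len - 128;           /* leave at least two blocks */
                if (safe_len % 128 != 0)
                        safe_len -= 64;         /* keep the AVX2 part 2-block aligned */
                printf("len %zu: avx2 %zu bytes, scalar tail %zu bytes\n",
                    len, safe_len, len - safe_len);
        }
        return (0);
}

The tail always comes out as 128 or 192 bytes, so the AVX2 code never reads past the caller's buffer and the scalar code finishes the rest in bounds.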
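Nothing outside libmd calls _libmd_sha1block_* by name: the DEFINE_UIFUNC resolver binds sha1_block once at load time from cpu_stdext_feature, and applications keep using the ordinary libmd interface, getting SHANI, AVX2 or scalar code transparently. A usage sketch, assuming FreeBSD's <sha.h> API (SHA1_Init/SHA1_Update/SHA1_End) and linking with -lmd:

#include <sha.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
        SHA1_CTX ctx;
        char digest[41];                /* 40 hex digits + NUL */
        const char *msg = "abc";

        SHA1_Init(&ctx);
        /* dispatched through the ifunc-selected block function */
        SHA1_Update(&ctx, (const void *)msg, strlen(msg));
        /* expected: a9993e364706816aba3e25717850c26c9cd0d89d */
        printf("%s\n", SHA1_End(&ctx, digest));
        return (0);
}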