Diffstat (limited to 'lib/libmd/amd64')
-rw-r--r--  lib/libmd/amd64/sha1block.S     1851
-rw-r--r--  lib/libmd/amd64/sha1dispatch.c    77
2 files changed, 1928 insertions, 0 deletions
diff --git a/lib/libmd/amd64/sha1block.S b/lib/libmd/amd64/sha1block.S
new file mode 100644
index 000000000000..f1291ef2647a
--- /dev/null
+++ b/lib/libmd/amd64/sha1block.S
@@ -0,0 +1,1851 @@
+/*-
+ * Copyright (c) 2013 The Go Authors. All rights reserved.
+ * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
+ *
+ * Adapted from Go's crypto/sha1/sha1block_amd64.s.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+
+/*
+ * SHA-1 block routine. See sha1c.c for C equivalent.
+ *
+ * There are 80 rounds of 4 types:
+ * - rounds 0-15 are type 1 and load data (round1 macro).
+ * - rounds 16-19 are type 1 and do not load data (round1x macro).
+ * - rounds 20-39 are type 2 and do not load data (round2 macro).
+ * - rounds 40-59 are type 3 and do not load data (round3 macro).
+ * - rounds 60-79 are type 4 and do not load data (round4 macro).
+ *
+ * Each round loads or shuffles the data, then computes a per-round
+ * function of b, c, d, and then mixes the result into and rotates the
+ * five registers a, b, c, d, e holding the intermediate results.
+ *
+ * The register rotation is implemented by rotating the arguments to
+ * the round macros instead of by explicit move instructions.
+ */
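+
+/*
+ * As a rough C sketch of the structure above (illustrative only, not
+ * the actual sha1c.c code), each round computes, with f and K chosen
+ * by the round type,
+ *
+ *	e += rotl32(a, 5) + f(b, c, d) + K + w[i & 0xf];
+ *	b  = rotl32(b, 30);
+ *
+ * after which the roles of a, b, c, d, e shift by one position for the
+ * next round.  The macros below pass the registers in rotated order
+ * instead of moving their contents.
+ */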
+.macro load index
+ mov (\index)*4(%rsi), %r10d
+ bswap %r10d
+ mov %r10d, (\index)*4(%rsp)
+.endm
+
+.macro shuffle index
+ mov ((\index )&0xf)*4(%rsp), %r10d
+ xor ((\index- 3)&0xf)*4(%rsp), %r10d
+ xor ((\index- 8)&0xf)*4(%rsp), %r10d
+ xor ((\index-14)&0xf)*4(%rsp), %r10d
+ rol $1, %r10d
+ mov %r10d, ((\index)&0xf)*4(%rsp)
+.endm
+
+.macro func1 a, b, c, d, e
+ mov \d, %r9d
+ xor \c, %r9d
+ and \b, %r9d
+ xor \d, %r9d
+.endm
+
+.macro func2 a, b, c, d, e
+ mov \b, %r9d
+ xor \c, %r9d
+ xor \d, %r9d
+.endm
+
+.macro func3 a, b, c, d, e
+ mov \b, %r8d
+ or \c, %r8d
+ and \d, %r8d
+ mov \b, %r9d
+ and \c, %r9d
+ or %r8d, %r9d
+.endm
+
+.macro func4 a, b, c, d, e
+ func2 \a, \b, \c, \d, \e
+.endm
+
+.macro mix a, b, c, d, e, const
+ rol $30, \b
+ add %r9d, \e
+ mov \a, %r8d
+ rol $5, %r8d
+ lea \const(\e, %r10d, 1), \e
+ add %r8d, \e
+.endm
+
+.macro round1 a, b, c, d, e, index
+ load \index
+ func1 \a, \b, \c, \d, \e
+ mix \a, \b, \c, \d, \e, 0x5a827999
+.endm
+
+.macro round1x a, b, c, d, e, index
+ shuffle \index
+ func1 \a, \b, \c, \d, \e
+ mix \a, \b, \c, \d, \e, 0x5a827999
+.endm
+
+.macro round2 a, b, c, d, e, index
+ shuffle \index
+ func2 \a, \b, \c, \d, \e
+ mix \a, \b, \c, \d, \e, 0x6ed9eba1
+.endm
+
+.macro round3 a, b, c, d, e, index
+ shuffle \index
+ func3 \a, \b, \c, \d, \e
+ mix \a, \b, \c, \d, \e, 0x8f1bbcdc
+.endm
+
+.macro round4 a, b, c, d, e, index
+ shuffle \index
+ func4 \a, \b, \c, \d, \e
+ mix \a, \b, \c, \d, \e, 0xca62c1d6
+.endm
+
+ // sha1block(SHA1_CTX, buf, len)
+ENTRY(_libmd_sha1block_scalar)
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ push %rdi // rdi: SHA1_CTX
+ sub $64+8, %rsp // 64 bytes for the message schedule
+ // plus alignment
+
+ mov %rdi, %rbp
+ // rsi: buf
+ and $~63, %rdx // rdx: length in blocks
+ lea (%rsi, %rdx, 1), %rdi // rdi: end pointer
+ mov (%rbp), %eax // c->h0
+ mov 4(%rbp), %ebx // c->h1
+ mov 8(%rbp), %ecx // c->h2
+ mov 12(%rbp), %edx // c->h3
+ mov 16(%rbp), %ebp // c->h4
+
+ cmp %rsi, %rdi // any data to process?
+ je .Lend
+
+.Lloop: mov %eax, %r11d
+ mov %ebx, %r12d
+ mov %ecx, %r13d
+ mov %edx, %r14d
+ mov %ebp, %r15d
+
+ round1 %eax, %ebx, %ecx, %edx, %ebp, 0
+ round1 %ebp, %eax, %ebx, %ecx, %edx, 1
+ round1 %edx, %ebp, %eax, %ebx, %ecx, 2
+ round1 %ecx, %edx, %ebp, %eax, %ebx, 3
+ round1 %ebx, %ecx, %edx, %ebp, %eax, 4
+
+ round1 %eax, %ebx, %ecx, %edx, %ebp, 5
+ round1 %ebp, %eax, %ebx, %ecx, %edx, 6
+ round1 %edx, %ebp, %eax, %ebx, %ecx, 7
+ round1 %ecx, %edx, %ebp, %eax, %ebx, 8
+ round1 %ebx, %ecx, %edx, %ebp, %eax, 9
+
+ round1 %eax, %ebx, %ecx, %edx, %ebp, 10
+ round1 %ebp, %eax, %ebx, %ecx, %edx, 11
+ round1 %edx, %ebp, %eax, %ebx, %ecx, 12
+ round1 %ecx, %edx, %ebp, %eax, %ebx, 13
+ round1 %ebx, %ecx, %edx, %ebp, %eax, 14
+
+ round1 %eax, %ebx, %ecx, %edx, %ebp, 15
+ round1x %ebp, %eax, %ebx, %ecx, %edx, 16
+ round1x %edx, %ebp, %eax, %ebx, %ecx, 17
+ round1x %ecx, %edx, %ebp, %eax, %ebx, 18
+ round1x %ebx, %ecx, %edx, %ebp, %eax, 19
+
+ round2 %eax, %ebx, %ecx, %edx, %ebp, 20
+ round2 %ebp, %eax, %ebx, %ecx, %edx, 21
+ round2 %edx, %ebp, %eax, %ebx, %ecx, 22
+ round2 %ecx, %edx, %ebp, %eax, %ebx, 23
+ round2 %ebx, %ecx, %edx, %ebp, %eax, 24
+
+ round2 %eax, %ebx, %ecx, %edx, %ebp, 25
+ round2 %ebp, %eax, %ebx, %ecx, %edx, 26
+ round2 %edx, %ebp, %eax, %ebx, %ecx, 27
+ round2 %ecx, %edx, %ebp, %eax, %ebx, 28
+ round2 %ebx, %ecx, %edx, %ebp, %eax, 29
+
+ round2 %eax, %ebx, %ecx, %edx, %ebp, 30
+ round2 %ebp, %eax, %ebx, %ecx, %edx, 31
+ round2 %edx, %ebp, %eax, %ebx, %ecx, 32
+ round2 %ecx, %edx, %ebp, %eax, %ebx, 33
+ round2 %ebx, %ecx, %edx, %ebp, %eax, 34
+
+ round2 %eax, %ebx, %ecx, %edx, %ebp, 35
+ round2 %ebp, %eax, %ebx, %ecx, %edx, 36
+ round2 %edx, %ebp, %eax, %ebx, %ecx, 37
+ round2 %ecx, %edx, %ebp, %eax, %ebx, 38
+ round2 %ebx, %ecx, %edx, %ebp, %eax, 39
+
+ round3 %eax, %ebx, %ecx, %edx, %ebp, 40
+ round3 %ebp, %eax, %ebx, %ecx, %edx, 41
+ round3 %edx, %ebp, %eax, %ebx, %ecx, 42
+ round3 %ecx, %edx, %ebp, %eax, %ebx, 43
+ round3 %ebx, %ecx, %edx, %ebp, %eax, 44
+
+ round3 %eax, %ebx, %ecx, %edx, %ebp, 45
+ round3 %ebp, %eax, %ebx, %ecx, %edx, 46
+ round3 %edx, %ebp, %eax, %ebx, %ecx, 47
+ round3 %ecx, %edx, %ebp, %eax, %ebx, 48
+ round3 %ebx, %ecx, %edx, %ebp, %eax, 49
+
+ round3 %eax, %ebx, %ecx, %edx, %ebp, 50
+ round3 %ebp, %eax, %ebx, %ecx, %edx, 51
+ round3 %edx, %ebp, %eax, %ebx, %ecx, 52
+ round3 %ecx, %edx, %ebp, %eax, %ebx, 53
+ round3 %ebx, %ecx, %edx, %ebp, %eax, 54
+
+ round3 %eax, %ebx, %ecx, %edx, %ebp, 55
+ round3 %ebp, %eax, %ebx, %ecx, %edx, 56
+ round3 %edx, %ebp, %eax, %ebx, %ecx, 57
+ round3 %ecx, %edx, %ebp, %eax, %ebx, 58
+ round3 %ebx, %ecx, %edx, %ebp, %eax, 59
+
+ round4 %eax, %ebx, %ecx, %edx, %ebp, 60
+ round4 %ebp, %eax, %ebx, %ecx, %edx, 61
+ round4 %edx, %ebp, %eax, %ebx, %ecx, 62
+ round4 %ecx, %edx, %ebp, %eax, %ebx, 63
+ round4 %ebx, %ecx, %edx, %ebp, %eax, 64
+
+ round4 %eax, %ebx, %ecx, %edx, %ebp, 65
+ round4 %ebp, %eax, %ebx, %ecx, %edx, 66
+ round4 %edx, %ebp, %eax, %ebx, %ecx, 67
+ round4 %ecx, %edx, %ebp, %eax, %ebx, 68
+ round4 %ebx, %ecx, %edx, %ebp, %eax, 69
+
+ round4 %eax, %ebx, %ecx, %edx, %ebp, 70
+ round4 %ebp, %eax, %ebx, %ecx, %edx, 71
+ round4 %edx, %ebp, %eax, %ebx, %ecx, 72
+ round4 %ecx, %edx, %ebp, %eax, %ebx, 73
+ round4 %ebx, %ecx, %edx, %ebp, %eax, 74
+
+ round4 %eax, %ebx, %ecx, %edx, %ebp, 75
+ round4 %ebp, %eax, %ebx, %ecx, %edx, 76
+ round4 %edx, %ebp, %eax, %ebx, %ecx, 77
+ round4 %ecx, %edx, %ebp, %eax, %ebx, 78
+ round4 %ebx, %ecx, %edx, %ebp, %eax, 79
+
+ add %r11d, %eax
+ add %r12d, %ebx
+ add %r13d, %ecx
+ add %r14d, %edx
+ add %r15d, %ebp
+
+ add $64, %rsi
+ cmp %rdi, %rsi
+ jb .Lloop
+
+.Lend: add $64+8, %rsp
+ pop %rdi // SHA1_CTX
+ mov %eax, (%rdi)
+ mov %ebx, 4(%rdi)
+ mov %ecx, 8(%rdi)
+ mov %edx, 12(%rdi)
+ mov %ebp, 16(%rdi)
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ ret
+END(_libmd_sha1block_scalar)
+
+/*
+ * This is the implementation using AVX2, BMI1 and BMI2. It is based on:
+ * "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
+ * From http://software.intel.com/en-us/articles
+ * (look for improving-the-performance-of-the-secure-hash-algorithm-1)
+ * This implementation is 2x unrolled and interleaves the vector
+ * instructions used to precompute W with the scalar computation of the
+ * current round, for optimal scheduling.
+ */
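+
+/*
+ * Conceptually (a simplified sketch, not the literal code below), the
+ * vector unit fills a temporary buffer with w[i] + K[i] for all 80
+ * rounds of two consecutive blocks, while the scalar unit runs the
+ * rounds for the pair of blocks prepared on the previous iteration,
+ * picking each precomputed value up with a plain add from memory.
+ * Each 32-byte slot of the buffer holds four values for one block in
+ * its low half and four for the other block in its high half, which is
+ * why the scalar round offsets below advance in steps of 0x20.
+ */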
+
+ /* trivial helper macros */
+.macro update_hash a, tb, c, d, e
+ add (%r9), \a
+ mov \a, (%r9)
+ add 4(%r9), \tb
+ mov \tb, 4(%r9)
+ add 8(%r9), \c
+ mov \c, 8(%r9)
+ add 12(%r9), \d
+ mov \d, 12(%r9)
+ add 16(%r9), \e
+ mov \e, 16(%r9)
+.endm
+
+ /* helper macros for precalc, which does the precomputations */
+.macro precalc0 offset
+ vmovdqu \offset(%r10), %xmm0
+.endm
+
+.macro precalc1 offset
+ vinserti128 $1, \offset(%r13), %ymm0, %ymm0
+.endm
+
+.macro precalc2 yreg
+ vpshufb %ymm10, %ymm0, \yreg
+.endm
+
+.macro precalc4 yreg, k_offset
+ vpaddd \k_offset(%r8), \yreg, %ymm0
+.endm
+
+.macro precalc7 offset
+ vmovdqu %ymm0, (\offset)*2(%r14)
+.endm
+
+/*
+ * Message scheduling pre-compute for rounds 0-15
+ * r13 is a pointer to the even 64-byte block
+ * r10 is a pointer to the odd 64-byte block
+ * r14 is a pointer to the temp buffer
+ * xmm0 is used as a temp register
+ * yreg is clobbered as part of the computation
+ * offset chooses a 16 byte chunk within a block
+ * r8 is a pointer to the constants block
+ * k_offset chooses K constants relevant to this round
+ * xmm10 holds the swap mask
+ */
+.macro precalc00_15 offset, yreg
+ precalc0 \offset
+ precalc1 \offset
+ precalc2 \yreg
+ precalc4 \yreg, 0
+ precalc7 \offset
+.endm
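+
+/*
+ * A rough C picture of one precalc00_15 step for the 16-byte chunk at
+ * byte offset "off" (sketch only; blk0 is the block %r10 points at,
+ * blk1 the one %r13 points at, both viewed as uint32_t *, and wk is
+ * the temp buffer):
+ *
+ *	for (j = 0; j < 4; j++) {
+ *		wk[off/2 + j]     = bswap32(blk0[off/4 + j]) + 0x5a827999;
+ *		wk[off/2 + 4 + j] = bswap32(blk1[off/4 + j]) + 0x5a827999;
+ *	}
+ */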
+
+ /* helper macros for precalc16_31 */
+.macro precalc16 reg_sub16, reg_sub12, reg_sub4, reg
+ vpalignr $8, \reg_sub16, \reg_sub12, \reg // w[i - 14]
+ vpsrldq $4, \reg_sub4, %ymm0 // w[i - 3]
+.endm
+
+.macro precalc17 reg_sub16, reg_sub8, reg
+ vpxor \reg_sub8, \reg, \reg
+ vpxor \reg_sub16, %ymm0, %ymm0
+.endm
+
+.macro precalc18 reg
+ vpxor %ymm0, \reg, \reg
+ vpslldq $12, \reg, %ymm9
+.endm
+
+.macro precalc19 reg
+ vpslld $1, \reg, %ymm0
+ vpsrld $31, \reg, \reg
+ .endm
+
+.macro precalc20 reg
+ vpor \reg, %ymm0, %ymm0
+ vpslld $2, %ymm9, \reg
+.endm
+
+.macro precalc21 reg
+ vpsrld $30, %ymm9, %ymm9
+ vpxor \reg, %ymm0, %ymm0
+.endm
+
+.macro precalc23 reg, k_offset, offset
+ vpxor %ymm9, %ymm0, \reg
+ vpaddd \k_offset(%r8), \reg, %ymm0
+ vmovdqu %ymm0, (\offset)(%r14)
+.endm
+
+/*
+ * Message scheduling pre-compute for rounds 16-31
+ * calculating last 32 w[i] values in 8 XMM registers
+ * pre-calculate K+w[i] values and store to mem
+ * for later load by ALU add instruction.
+ * "brute force" vectorization for rounds 16-31 only
+ * due to w[i]->w[i-3] dependency.
+ * clobbers 5 input ymm registers REG_SUB*
+ * uses xmm0 and xmm9 as temp registers
+ * As always, r8 is a pointer to constants block
+ * and r14 is a pointer to temp buffer
+ */
+.macro precalc16_31 reg, reg_sub4, reg_sub8, reg_sub12, reg_sub16, k_offset, offset
+ precalc16 \reg_sub16, \reg_sub12, \reg_sub4, \reg
+ precalc17 \reg_sub16, \reg_sub8, \reg
+ precalc18 \reg
+ precalc19 \reg
+ precalc20 \reg
+ precalc21 \reg
+ precalc23 \reg, \k_offset, \offset
+.endm
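+
+/*
+ * The recurrence vectorized here, four w values per block at a time,
+ * is the usual one (sketch, with wk again the temp buffer):
+ *
+ *	w[i]  = rotl32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
+ *	wk[i] = w[i] + K;
+ *
+ * Only the last element of each group of four is missing its w[i-3]
+ * input (it is produced within the same group); the ymm9 fix-up in
+ * precalc18 through precalc23 accounts for that.
+ */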
+
+ /* helper macros for precalc32_79 */
+.macro precalc32 reg_sub8, reg_sub4
+ vpalignr $8, \reg_sub8, \reg_sub4, %ymm0
+.endm
+
+.macro precalc33 reg_sub28, reg
+ vpxor \reg_sub28, \reg, \reg
+.endm
+
+.macro precalc34 reg_sub16
+ vpxor \reg_sub16, %ymm0, %ymm0
+.endm
+
+.macro precalc35 reg
+ vpxor %ymm0, \reg, \reg
+.endm
+
+.macro precalc36 reg
+ vpslld $2, \reg, %ymm0
+.endm
+
+.macro precalc37 reg
+ vpsrld $30, \reg, \reg
+ vpor \reg, %ymm0, \reg
+.endm
+
+.macro precalc39 reg, k_offset, offset
+ vpaddd \k_offset(%r8), \reg, %ymm0
+ vmovdqu %ymm0, \offset(%r14)
+.endm
+
+.macro precalc32_79 reg, reg_sub4, reg_sub8, reg_sub16, reg_sub28, k_offset, offset
+ precalc32 \reg_sub8, \reg_sub4
+ precalc33 \reg_sub28, \reg
+ precalc34 \reg_sub16
+ precalc35 \reg
+ precalc36 \reg
+ precalc37 \reg
+ precalc39 \reg, \k_offset, \offset
+.endm
+
+.macro precalc
+ precalc00_15 0x00, %ymm15
+ precalc00_15 0x10, %ymm14
+ precalc00_15 0x20, %ymm13
+ precalc00_15 0x30, %ymm12
+ precalc16_31 %ymm8, %ymm12, %ymm13, %ymm14, %ymm15, 0x00, 0x080
+ precalc16_31 %ymm7, %ymm8, %ymm12, %ymm13, %ymm14, 0x20, 0x0a0
+ precalc16_31 %ymm5, %ymm7, %ymm8, %ymm12, %ymm13, 0x20, 0x0c0
+ precalc16_31 %ymm3, %ymm5, %ymm7, %ymm8, %ymm12, 0x20, 0x0e0
+ precalc32_79 %ymm15, %ymm3, %ymm5, %ymm8, %ymm14, 0x20, 0x100
+ precalc32_79 %ymm14, %ymm15, %ymm3, %ymm7, %ymm13, 0x20, 0x120
+ precalc32_79 %ymm13, %ymm14, %ymm15, %ymm5, %ymm12, 0x40, 0x140
+ precalc32_79 %ymm12, %ymm13, %ymm14, %ymm3, %ymm8, 0x40, 0x160
+ precalc32_79 %ymm8, %ymm12, %ymm13, %ymm15, %ymm7, 0x40, 0x180
+ precalc32_79 %ymm7, %ymm8, %ymm12, %ymm14, %ymm5, 0x40, 0x1a0
+ precalc32_79 %ymm5, %ymm7, %ymm8, %ymm13, %ymm3, 0x40, 0x1c0
+ precalc32_79 %ymm3, %ymm5, %ymm7, %ymm12, %ymm15, 0x60, 0x1e0
+ precalc32_79 %ymm15, %ymm3, %ymm5, %ymm8, %ymm14, 0x60, 0x200
+ precalc32_79 %ymm14, %ymm15, %ymm3, %ymm7, %ymm13, 0x60, 0x220
+ precalc32_79 %ymm13, %ymm14, %ymm15, %ymm5, %ymm12, 0x60, 0x240
+ precalc32_79 %ymm12, %ymm13, %ymm14, %ymm3, %ymm8, 0x60, 0x260
+.endm
+
+/*
+ * Macros calculating individual rounds have general form
+ * calc_round_pre + precalc_round + calc_round_post
+ * calc_round_{pre,post} macros follow
+ */
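+
+/*
+ * For example, calc2 below is calc_f1_pre, followed by one vector step
+ * (precalc2) of the schedule for the next pair of blocks, followed by
+ * calc_f1_post.
+ */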
+.macro calc_f1_pre offset, reg_a, reg_b, reg_c, reg_e
+ add \offset(%r15), \reg_e
+ andn \reg_c, \reg_a, %ebp
+ add \reg_b, \reg_e // add F from the previous round
+ rorx $0x1b, \reg_a, %r12d
+ rorx $2, \reg_a, \reg_b // for the next round
+.endm
+
+/*
+ * Calculate F for the next round
+ */
+.macro calc_f1_post reg_a, reg_b, reg_e
+ and \reg_b, \reg_a // b & c
+ xor %ebp, \reg_a // F1 = (b&c) ^ (~b&d)
+ add %r12d, \reg_e
+.endm
+
+/*
+ * Registers are cyclically rotated:
+ * edx -> eax -> edi -> esi -> ebx -> ecx
+ */
+.macro calc0
+ mov %esi, %ebx // precalculate first round
+ rorx $2, %esi, %esi
+ andn %eax, %ebx, %ebp
+ and %edi, %ebx
+ xor %ebp, %ebx
+ calc_f1_pre 0x0, %ecx, %ebx, %edi, %edx
+ precalc0 0x80
+ calc_f1_post %ecx, %esi, %edx
+.endm
+
+.macro calc1
+ calc_f1_pre 0x4, %edx, %ecx, %esi, %eax
+ precalc1 0x80
+ calc_f1_post %edx, %ebx, %eax
+.endm
+
+.macro calc2
+ calc_f1_pre 0x8, %eax, %edx, %ebx, %edi
+ precalc2 %ymm15
+ calc_f1_post %eax, %ecx, %edi
+.endm
+
+.macro calc3
+ calc_f1_pre 0xc, %edi, %eax, %ecx, %esi
+ calc_f1_post %edi, %edx, %esi
+.endm
+
+.macro calc4
+ calc_f1_pre 0x20, %esi, %edi, %edx, %ebx
+ precalc4 %ymm15, 0x0
+ calc_f1_post %esi, %eax, %ebx
+.endm
+
+.macro calc5
+ calc_f1_pre 0x24, %ebx, %esi, %eax, %ecx
+ calc_f1_post %ebx, %edi, %ecx
+.endm
+
+.macro calc6
+ calc_f1_pre 0x28, %ecx, %ebx, %edi, %edx
+ calc_f1_post %ecx, %esi, %edx
+.endm
+
+.macro calc7
+ calc_f1_pre 0x2c, %edx, %ecx, %esi, %eax
+ precalc7 0x0
+ calc_f1_post %edx, %ebx, %eax
+.endm
+
+.macro calc8
+ calc_f1_pre 0x40, %eax, %edx, %ebx, %edi
+ precalc0 0x90
+ calc_f1_post %eax, %ecx, %edi
+.endm
+
+.macro calc9
+ calc_f1_pre 0x44, %edi, %eax, %ecx, %esi
+ precalc1 0x90
+ calc_f1_post %edi, %edx, %esi
+.endm
+
+.macro calc10
+ calc_f1_pre 0x48, %esi, %edi, %edx, %ebx
+ precalc2 %ymm14
+ calc_f1_post %esi, %eax, %ebx
+.endm
+
+.macro calc11
+ calc_f1_pre 0x4c, %ebx, %esi, %eax, %ecx
+ calc_f1_post %ebx, %edi, %ecx
+.endm
+
+.macro calc12
+ calc_f1_pre 0x60, %ecx, %ebx, %edi, %edx
+ precalc4 %ymm14, 0
+ calc_f1_post %ecx, %esi, %edx
+.endm
+
+.macro calc13
+ calc_f1_pre 0x64, %edx, %ecx, %esi, %eax
+ calc_f1_post %edx, %ebx, %eax
+.endm
+
+.macro calc14
+ calc_f1_pre 0x68, %eax, %edx, %ebx, %edi
+ calc_f1_post %eax, %ecx, %edi
+.endm
+
+.macro calc15
+ calc_f1_pre 0x6c, %edi, %eax, %ecx, %esi
+ precalc7 0x10
+ calc_f1_post %edi, %edx, %esi
+.endm
+
+.macro calc16
+ calc_f1_pre 0x80, %esi, %edi, %edx, %ebx
+ precalc0 0xa0
+ calc_f1_post %esi, %eax, %ebx
+.endm
+
+.macro calc17
+ calc_f1_pre 0x84, %ebx, %esi, %eax, %ecx
+ precalc1 0xa0
+ calc_f1_post %ebx, %edi, %ecx
+.endm
+
+.macro calc18
+ calc_f1_pre 0x88, %ecx, %ebx, %edi, %edx
+ precalc2 %ymm13
+ calc_f1_post %ecx, %esi, %edx
+.endm
+
+.macro calc_f2_pre offset, reg_a, reg_b, reg_e
+ add \offset(%r15), \reg_e
+ add \reg_b, \reg_e // add F from the previous round
+ rorx $0x1b, \reg_a, %r12d
+ rorx $2, \reg_a, \reg_b // for next round
+.endm
+
+.macro calc_f2_post reg_a, reg_b, reg_c, reg_e
+ xor \reg_b, \reg_a
+ add %r12d, \reg_e
+ xor \reg_c, \reg_a
+.endm
+
+.macro calc19
+ calc_f2_pre 0x8c, %edx, %ecx, %eax
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc20
+ calc_f2_pre 0xa0, %eax, %edx, %edi
+ precalc4 %ymm13, 0x0
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc21
+ calc_f2_pre 0xa4, %edi, %eax, %esi
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc22
+ calc_f2_pre 0xa8, %esi, %edi, %ebx
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc23
+ calc_f2_pre 0xac, %ebx, %esi, %ecx
+ precalc7 0x20
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc24
+ calc_f2_pre 0xc0, %ecx, %ebx, %edx
+ precalc0 0xb0
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc25
+ calc_f2_pre 0xc4, %edx, %ecx, %eax
+ precalc1 0xb0
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc26
+ calc_f2_pre 0xc8, %eax, %edx, %edi
+ precalc2 %ymm12
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc27
+ calc_f2_pre 0xcc, %edi, %eax, %esi
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc28
+ calc_f2_pre 0xe0, %esi, %edi, %ebx
+ precalc4 %ymm12, 0x0
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc29
+ calc_f2_pre 0xe4, %ebx, %esi, %ecx
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc30
+ calc_f2_pre 0xe8, %ecx, %ebx, %edx
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc31
+ calc_f2_pre 0xec, %edx, %ecx, %eax
+ precalc7 0x30
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc32
+ calc_f2_pre 0x100, %eax, %edx, %edi
+ precalc16 %ymm15, %ymm14, %ymm12, %ymm8
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc33
+ calc_f2_pre 0x104, %edi, %eax, %esi
+ precalc17 %ymm15, %ymm13, %ymm8
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc34
+ calc_f2_pre 0x108, %esi, %edi, %ebx
+ precalc18 %ymm8
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc35
+ calc_f2_pre 0x10c, %ebx, %esi, %ecx
+ precalc19 %ymm8
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc36
+ calc_f2_pre 0x120, %ecx, %ebx, %edx
+ precalc20 %ymm8
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc37
+ calc_f2_pre 0x124, %edx, %ecx, %eax
+ precalc21 %ymm8
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc38
+ calc_f2_pre 0x128, %eax, %edx, %edi
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc_f3_pre offset, reg_e
+ add \offset(%r15), \reg_e
+.endm
+
+.macro calc_f3_post reg_a, reg_b, reg_c, reg_e, reg_tb
+ add \reg_tb, \reg_e // add F from the previous round
+ mov \reg_b, %ebp
+ or \reg_a, %ebp
+ rorx $0x1b, \reg_a, %r12d
+ rorx $2, \reg_a, \reg_tb
+ and \reg_c, %ebp // calculate F for the next round
+ and \reg_b, \reg_a
+ or %ebp, \reg_a
+ add %r12d, \reg_e
+.endm
+
+.macro calc39
+ calc_f3_pre 0x12c, %esi
+ precalc23 %ymm8, 0x0, 0x80
+ calc_f3_post %edi, %edx, %ecx, %esi, %eax
+.endm
+
+.macro calc40
+ calc_f3_pre 0x140, %ebx
+ precalc16 %ymm14, %ymm13, %ymm8, %ymm7
+ calc_f3_post %esi, %eax, %edx, %ebx, %edi
+.endm
+
+.macro calc41
+ calc_f3_pre 0x144, %ecx
+ precalc17 %ymm14, %ymm12, %ymm7
+ calc_f3_post %ebx, %edi, %eax, %ecx, %esi
+.endm
+
+.macro calc42
+ calc_f3_pre 0x148, %edx
+ precalc18 %ymm7
+ calc_f3_post %ecx, %esi, %edi, %edx, %ebx
+.endm
+
+.macro calc43
+ calc_f3_pre 0x14c, %eax
+ precalc19 %ymm7
+ calc_f3_post %edx, %ebx, %esi, %eax, %ecx
+.endm
+
+.macro calc44
+ calc_f3_pre 0x160, %edi
+ precalc20 %ymm7
+ calc_f3_post %eax, %ecx, %ebx, %edi, %edx
+.endm
+
+.macro calc45
+ calc_f3_pre 0x164, %esi
+ precalc21 %ymm7
+ calc_f3_post %edi, %edx, %ecx, %esi, %eax
+.endm
+
+.macro calc46
+ calc_f3_pre 0x168, %ebx
+ calc_f3_post %esi, %eax, %edx, %ebx, %edi
+.endm
+
+.macro calc47
+ calc_f3_pre 0x16c, %ecx
+ vpxor %ymm9, %ymm0, %ymm7
+ vpaddd 0x20(%r8), %ymm7, %ymm0
+ vmovdqu %ymm0, 0xa0(%r14)
+ calc_f3_post %ebx, %edi, %eax, %ecx, %esi
+.endm
+
+.macro calc48
+ calc_f3_pre 0x180, %edx
+ precalc16 %ymm13, %ymm12, %ymm7, %ymm5
+ calc_f3_post %ecx, %esi, %edi, %edx, %ebx
+.endm
+
+.macro calc49
+ calc_f3_pre 0x184, %eax
+ precalc17 %ymm13, %ymm8, %ymm5
+ calc_f3_post %edx, %ebx, %esi, %eax, %ecx
+.endm
+
+.macro calc50
+ calc_f3_pre 0x188, %edi
+ precalc18 %ymm5
+ calc_f3_post %eax, %ecx, %ebx, %edi, %edx
+.endm
+
+.macro calc51
+ calc_f3_pre 0x18c, %esi
+ precalc19 %ymm5
+ calc_f3_post %edi, %edx, %ecx, %esi, %eax
+.endm
+
+.macro calc52
+ calc_f3_pre 0x1a0, %ebx
+ precalc20 %ymm5
+ calc_f3_post %esi, %eax, %edx, %ebx, %edi
+.endm
+
+.macro calc53
+ calc_f3_pre 0x1a4, %ecx
+ precalc21 %ymm5
+ calc_f3_post %ebx, %edi, %eax, %ecx, %esi
+.endm
+
+.macro calc54
+ calc_f3_pre 0x1a8, %edx
+ calc_f3_post %ecx, %esi, %edi, %edx, %ebx
+.endm
+
+.macro calc55
+ calc_f3_pre 0x1ac, %eax
+ precalc23 %ymm5, 0x20, 0xc0
+ calc_f3_post %edx, %ebx, %esi, %eax, %ecx
+.endm
+
+.macro calc56
+ calc_f3_pre 0x1c0, %edi
+ precalc16 %ymm12, %ymm8, %ymm5, %ymm3
+ calc_f3_post %eax, %ecx, %ebx, %edi, %edx
+.endm
+
+.macro calc57
+ calc_f3_pre 0x1c4, %esi
+ precalc17 %ymm12, %ymm7, %ymm3
+ calc_f3_post %edi, %edx, %ecx, %esi, %eax
+.endm
+
+.macro calc58
+ calc_f3_pre 0x1c8, %ebx
+ precalc18 %ymm3
+ calc_f3_post %esi, %eax, %edx, %ebx, %edi
+.endm
+
+.macro calc59
+ calc_f2_pre 0x1cc, %ebx, %esi, %ecx
+ precalc19 %ymm3
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc60
+ calc_f2_pre 0x1e0, %ecx, %ebx, %edx
+ precalc20 %ymm3
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc61
+ calc_f2_pre 0x1e4, %edx, %ecx, %eax
+ precalc21 %ymm3
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc62
+ calc_f2_pre 0x1e8, %eax, %edx, %edi
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc63
+ calc_f2_pre 0x1ec, %edi, %eax, %esi
+ precalc23 %ymm3, 0x20, 0xe0
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc64
+ calc_f2_pre 0x200, %esi, %edi, %ebx
+ precalc32 %ymm5, %ymm3
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc65
+ calc_f2_pre 0x204, %ebx, %esi, %ecx
+ precalc33 %ymm14, %ymm15
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc66
+ calc_f2_pre 0x208, %ecx, %ebx, %edx
+ precalc34 %ymm8
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc67
+ calc_f2_pre 0x20c, %edx, %ecx, %eax
+ precalc35 %ymm15
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc68
+ calc_f2_pre 0x220, %eax, %edx, %edi
+ precalc36 %ymm15
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc69
+ calc_f2_pre 0x224, %edi, %eax, %esi
+ precalc37 %ymm15
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc70
+ calc_f2_pre 0x228, %esi, %edi, %ebx
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc71
+ calc_f2_pre 0x22c, %ebx, %esi, %ecx
+ precalc39 %ymm15, 0x20, 0x100
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc72
+ calc_f2_pre 0x240, %ecx, %ebx, %edx
+ precalc32 %ymm3, %ymm15
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc73
+ calc_f2_pre 0x244, %edx, %ecx, %eax
+ precalc33 %ymm13, %ymm14
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc74
+ calc_f2_pre 0x248, %eax, %edx, %edi
+ precalc34 %ymm7
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc75
+ calc_f2_pre 0x24c, %edi, %eax, %esi
+ precalc35 %ymm14
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc76
+ calc_f2_pre 0x260, %esi, %edi, %ebx
+ precalc36 %ymm14
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc77
+ calc_f2_pre 0x264, %ebx, %esi, %ecx
+ precalc37 %ymm14
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc78
+ calc_f2_pre 0x268, %ecx, %ebx, %edx
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc79
+ add 0x26c(%r15), %eax
+ add %ecx, %eax
+ rorx $0x1b, %edx, %r12d
+ precalc39 %ymm14, 0x20, 0x120
+ add %r12d, %eax
+.endm
+
+/*
+ * Similar to calc0
+ */
+.macro calc80
+ mov %ecx, %edx // precalculate first round
+ rorx $2, %ecx, %ecx
+ andn %esi, %edx, %ebp
+ and %ebx, %edx
+ xor %ebp, %edx
+ calc_f1_pre 0x10, %eax, %edx, %ebx, %edi
+ precalc32 %ymm15, %ymm14
+ calc_f1_post %eax, %ecx, %edi
+.endm
+
+.macro calc81
+ calc_f1_pre 0x14, %edi, %eax, %ecx, %esi
+ precalc33 %ymm12, %ymm13
+ calc_f1_post %edi, %edx, %esi
+.endm
+
+.macro calc82
+ calc_f1_pre 0x18, %esi, %edi, %edx, %ebx
+ precalc34 %ymm5
+ calc_f1_post %esi, %eax, %ebx
+.endm
+
+.macro calc83
+ calc_f1_pre 0x1c, %ebx, %esi, %eax, %ecx
+ precalc35 %ymm13
+ calc_f1_post %ebx, %edi, %ecx
+.endm
+
+.macro calc84
+ calc_f1_pre 0x30, %ecx, %ebx, %edi, %edx
+ precalc36 %ymm13
+ calc_f1_post %ecx, %esi, %edx
+.endm
+
+.macro calc85
+ calc_f1_pre 0x34, %edx, %ecx, %esi, %eax
+ precalc37 %ymm13
+ calc_f1_post %edx, %ebx, %eax
+.endm
+
+.macro calc86
+ calc_f1_pre 0x38, %eax, %edx, %ebx, %edi
+ calc_f1_post %eax, %ecx, %edi
+.endm
+
+.macro calc87
+ calc_f1_pre 0x3c, %edi, %eax, %ecx, %esi
+ precalc39 %ymm13, 0x40, 0x140
+ calc_f1_post %edi, %edx, %esi
+.endm
+
+.macro calc88
+ calc_f1_pre 0x50, %esi, %edi, %edx, %ebx
+ precalc32 %ymm14, %ymm13
+ calc_f1_post %esi, %eax, %ebx
+.endm
+
+.macro calc89
+ calc_f1_pre 0x54, %ebx, %esi, %eax, %ecx
+ precalc33 %ymm8, %ymm12
+ calc_f1_post %ebx, %edi, %ecx
+.endm
+
+.macro calc90
+ calc_f1_pre 0x58, %ecx, %ebx, %edi, %edx
+ precalc34 %ymm3
+ calc_f1_post %ecx, %esi, %edx
+.endm
+
+.macro calc91
+ calc_f1_pre 0x5c, %edx, %ecx, %esi, %eax
+ precalc35 %ymm12
+ calc_f1_post %edx, %ebx, %eax
+.endm
+
+.macro calc92
+ calc_f1_pre 0x70, %eax, %edx, %ebx, %edi
+ precalc36 %ymm12
+ calc_f1_post %eax, %ecx, %edi
+.endm
+
+.macro calc93
+ calc_f1_pre 0x74, %edi, %eax, %ecx, %esi
+ precalc37 %ymm12
+ calc_f1_post %edi, %edx, %esi
+.endm
+
+.macro calc94
+ calc_f1_pre 0x78, %esi, %edi, %edx, %ebx
+ calc_f1_post %esi, %eax, %ebx
+.endm
+
+.macro calc95
+ calc_f1_pre 0x7c, %ebx, %esi, %eax, %ecx
+ precalc39 %ymm12, 0x40, 0x160
+ calc_f1_post %ebx, %edi, %ecx
+.endm
+
+.macro calc96
+ calc_f1_pre 0x90, %ecx, %ebx, %edi, %edx
+ precalc32 %ymm13, %ymm12
+ calc_f1_post %ecx, %esi, %edx
+.endm
+
+.macro calc97
+ calc_f1_pre 0x94, %edx, %ecx, %esi, %eax
+ precalc33 %ymm7, %ymm8
+ calc_f1_post %edx, %ebx, %eax
+.endm
+
+.macro calc98
+ calc_f1_pre 0x98, %eax, %edx, %ebx, %edi
+ precalc34 %ymm15
+ calc_f1_post %eax, %ecx, %edi
+.endm
+
+.macro calc99
+ calc_f2_pre 0x9c, %edi, %eax, %esi
+ precalc35 %ymm8
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc100
+ calc_f2_pre 0xb0, %esi, %edi, %ebx
+ precalc36 %ymm8
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc101
+ calc_f2_pre 0xb4, %ebx, %esi, %ecx
+ precalc37 %ymm8
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc102
+ calc_f2_pre 0xb8, %ecx, %ebx, %edx
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc103
+ calc_f2_pre 0xbc, %edx, %ecx, %eax
+ precalc39 %ymm8, 0x40, 0x180
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc104
+ calc_f2_pre 0xd0, %eax, %edx, %edi
+ precalc32 %ymm12, %ymm8
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc105
+ calc_f2_pre 0xd4, %edi, %eax, %esi
+ precalc33 %ymm5, %ymm7
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc106
+ calc_f2_pre 0xd8, %esi, %edi, %ebx
+ precalc34 %ymm14
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc107
+ calc_f2_pre 0xdc, %ebx, %esi, %ecx
+ precalc35 %ymm7
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc108
+ calc_f2_pre 0xf0, %ecx, %ebx, %edx
+ precalc36 %ymm7
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc109
+ calc_f2_pre 0xf4, %edx, %ecx, %eax
+ precalc37 %ymm7
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc110
+ calc_f2_pre 0xf8, %eax, %edx, %edi
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc111
+ calc_f2_pre 0xfc, %edi, %eax, %esi
+ precalc39 %ymm7, 0x40, 0x1a0
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc112
+ calc_f2_pre 0x110, %esi, %edi, %ebx
+ precalc32 %ymm8, %ymm7
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc113
+ calc_f2_pre 0x114, %ebx, %esi, %ecx
+ precalc33 %ymm3, %ymm5
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc114
+ calc_f2_pre 0x118, %ecx, %ebx, %edx
+ precalc34 %ymm13
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc115
+ calc_f2_pre 0x11c, %edx, %ecx, %eax
+ precalc35 %ymm5
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc116
+ calc_f2_pre 0x130, %eax, %edx, %edi
+ precalc36 %ymm5
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc117
+ calc_f2_pre 0x134, %edi, %eax, %esi
+ precalc37 %ymm5
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc118
+ calc_f2_pre 0x138, %esi, %edi, %ebx
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc119
+ calc_f3_pre 0x13c, %ecx
+ precalc39 %ymm5, 0x40, 0x1c0
+ calc_f3_post %ebx, %edi, %eax, %ecx, %esi
+.endm
+
+.macro calc120
+ calc_f3_pre 0x150, %edx
+ precalc32 %ymm7, %ymm5
+ calc_f3_post %ecx, %esi, %edi, %edx, %ebx
+.endm
+
+.macro calc121
+ calc_f3_pre 0x154, %eax
+ precalc33 %ymm15, %ymm3
+ calc_f3_post %edx, %ebx, %esi, %eax, %ecx
+.endm
+
+.macro calc122
+ calc_f3_pre 0x158, %edi
+ precalc34 %ymm12
+ calc_f3_post %eax, %ecx, %ebx, %edi, %edx
+.endm
+
+.macro calc123
+ calc_f3_pre 0x15c, %esi
+ precalc35 %ymm3
+ calc_f3_post %edi, %edx, %ecx, %esi, %eax
+.endm
+
+.macro calc124
+ calc_f3_pre 0x170, %ebx
+ precalc36 %ymm3
+ calc_f3_post %esi, %eax, %edx, %ebx, %edi
+.endm
+
+.macro calc125
+ calc_f3_pre 0x174, %ecx
+ precalc37 %ymm3
+ calc_f3_post %ebx, %edi, %eax, %ecx, %esi
+.endm
+
+.macro calc126
+ calc_f3_pre 0x178, %edx
+ calc_f3_post %ecx, %esi, %edi, %edx, %ebx
+.endm
+
+.macro calc127
+ calc_f3_pre 0x17c, %eax
+ precalc39 %ymm3, 0x60, 0x1e0
+ calc_f3_post %edx, %ebx, %esi, %eax, %ecx
+.endm
+
+.macro calc128
+ calc_f3_pre 0x190, %edi
+ precalc32 %ymm5, %ymm3
+ calc_f3_post %eax, %ecx, %ebx, %edi, %edx
+.endm
+
+.macro calc129
+ calc_f3_pre 0x194, %esi
+ precalc33 %ymm14, %ymm15
+ calc_f3_post %edi, %edx, %ecx, %esi, %eax
+.endm
+
+.macro calc130
+ calc_f3_pre 0x198, %ebx
+ precalc34 %ymm8
+ calc_f3_post %esi, %eax, %edx, %ebx, %edi
+.endm
+
+.macro calc131
+ calc_f3_pre 0x19c, %ecx
+ precalc35 %ymm15
+ calc_f3_post %ebx, %edi, %eax, %ecx, %esi
+.endm
+
+.macro calc132
+ calc_f3_pre 0x1b0, %edx
+ precalc36 %ymm15
+ calc_f3_post %ecx, %esi, %edi, %edx, %ebx
+.endm
+
+.macro calc133
+ calc_f3_pre 0x1b4, %eax
+ precalc37 %ymm15
+ calc_f3_post %edx, %ebx, %esi, %eax, %ecx
+.endm
+
+.macro calc134
+ calc_f3_pre 0x1b8, %edi
+ calc_f3_post %eax, %ecx, %ebx, %edi, %edx
+.endm
+
+.macro calc135
+ calc_f3_pre 0x1bc, %esi
+ precalc39 %ymm15, 0x60, 0x200
+ calc_f3_post %edi, %edx, %ecx, %esi, %eax
+.endm
+
+.macro calc136
+ calc_f3_pre 0x1d0, %ebx
+ precalc32 %ymm3, %ymm15
+ calc_f3_post %esi, %eax, %edx, %ebx, %edi
+.endm
+
+.macro calc137
+ calc_f3_pre 0x1d4, %ecx
+ precalc33 %ymm13, %ymm14
+ calc_f3_post %ebx, %edi, %eax, %ecx, %esi
+.endm
+
+.macro calc138
+ calc_f3_pre 0x1d8, %edx
+ precalc34 %ymm7
+ calc_f3_post %ecx, %esi, %edi, %edx, %ebx
+.endm
+
+.macro calc139
+ calc_f2_pre 0x1dc, %edx, %ecx, %eax
+ precalc35 %ymm14
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc140
+ calc_f2_pre 0x1f0, %eax, %edx, %edi
+ precalc36 %ymm14
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc141
+ calc_f2_pre 0x1f4, %edi, %eax, %esi
+ precalc37 %ymm14
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc142
+ calc_f2_pre 0x1f8, %esi, %edi, %ebx
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc143
+ calc_f2_pre 0x1fc, %ebx, %esi, %ecx
+ precalc39 %ymm14, 0x60, 0x220
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc144
+ calc_f2_pre 0x210, %ecx, %ebx, %edx
+ precalc32 %ymm15, %ymm14
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc145
+ calc_f2_pre 0x214, %edx, %ecx, %eax
+ precalc33 %ymm12, %ymm13
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc146
+ calc_f2_pre 0x218, %eax, %edx, %edi
+ precalc34 %ymm5
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc147
+ calc_f2_pre 0x21c, %edi, %eax, %esi
+ precalc35 %ymm13
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc148
+ calc_f2_pre 0x230, %esi, %edi, %ebx
+ precalc36 %ymm13
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc149
+ calc_f2_pre 0x234, %ebx, %esi, %ecx
+ precalc37 %ymm13
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc150
+ calc_f2_pre 0x238, %ecx, %ebx, %edx
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc151
+ calc_f2_pre 0x23c, %edx, %ecx, %eax
+ precalc39 %ymm13, 0x60, 0x240
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc152
+ calc_f2_pre 0x250, %eax, %edx, %edi
+ precalc32 %ymm14, %ymm13
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc153
+ calc_f2_pre 0x254, %edi, %eax, %esi
+ precalc33 %ymm8, %ymm12
+ calc_f2_post %edi, %edx, %ecx, %esi
+.endm
+
+.macro calc154
+ calc_f2_pre 0x258, %esi, %edi, %ebx
+ precalc34 %ymm3
+ calc_f2_post %esi, %eax, %edx, %ebx
+.endm
+
+.macro calc155
+ calc_f2_pre 0x25c, %ebx, %esi, %ecx
+ precalc35 %ymm12
+ calc_f2_post %ebx, %edi, %eax, %ecx
+.endm
+
+.macro calc156
+ calc_f2_pre 0x270, %ecx, %ebx, %edx
+ precalc36 %ymm12
+ calc_f2_post %ecx, %esi, %edi, %edx
+.endm
+
+.macro calc157
+ calc_f2_pre 0x274, %edx, %ecx, %eax
+ precalc37 %ymm12
+ calc_f2_post %edx, %ebx, %esi, %eax
+.endm
+
+.macro calc158
+ calc_f2_pre 0x278, %eax, %edx, %edi
+ calc_f2_post %eax, %ecx, %ebx, %edi
+.endm
+
+.macro calc159
+ add 0x27c(%r15), %esi
+ add %eax, %esi
+ rorx $0x1b, %edi, %r12d
+ precalc39 %ymm12, 0x60, 0x260
+ add %r12d, %esi
+.endm
+
+ // sha1block(SHA1_CTX, buf, len)
+ENTRY(_libmd_sha1block_avx2)
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ sub $1408+8, %rsp
+
+ and $~63, %rdx
+ lea k_xmm_ar(%rip), %r8
+ mov %rdi, %r9
+ mov %rsi, %r10
+ lea 64(%rsi), %r13
+ lea 64(%rsi, %rdx), %r11
+ cmp %r11, %r13
+ cmovae %r8, %r13
+ vmovdqu bswap_shufb_ctl(%rip), %ymm10
+
+ mov (%r9), %ecx
+ mov 4(%r9), %esi
+ mov 8(%r9), %edi
+ mov 12(%r9), %eax
+ mov 16(%r9), %edx
+ mov %rsp, %r14
+ lea 2*4*80+32(%rsp), %r15
+ precalc // precalc WK for first 2 blocks
+ xchg %r14, %r15
+
+ // this is unrolled
+.Loop: cmp %r8, %r10 // we use the value of R8 (set below)
+ // as a signal of the last block
+ jne .Lbegin
+ add $1408+8, %rsp
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ vzeroupper
+ ret
+
+.Lbegin:
+ calc0
+ calc1
+ calc2
+ calc3
+ calc4
+ calc5
+ calc6
+ calc7
+ calc8
+ calc9
+ calc10
+ calc11
+ calc12
+ calc13
+ calc14
+ calc15
+ calc16
+ calc17
+ calc18
+ calc19
+ calc20
+ calc21
+ calc22
+ calc23
+ calc24
+ calc25
+ calc26
+ calc27
+ calc28
+ calc29
+ calc30
+ calc31
+ calc32
+ calc33
+ calc34
+ calc35
+ calc36
+ calc37
+ calc38
+ calc39
+ calc40
+ calc41
+ calc42
+ calc43
+ calc44
+ calc45
+ calc46
+ calc47
+ calc48
+ calc49
+ calc50
+ calc51
+ calc52
+ calc53
+ calc54
+ calc55
+ calc56
+ calc57
+ calc58
+ calc59
+
+ add $128, %r10 // move to the next even-64-byte block
+ cmp %r11, %r10 // is the current block the last one?
+ cmovae %r8, %r10 // signal the last iteration smartly
+
+ calc60
+ calc61
+ calc62
+ calc63
+ calc64
+ calc65
+ calc66
+ calc67
+ calc68
+ calc69
+ calc70
+ calc71
+ calc72
+ calc73
+ calc74
+ calc75
+ calc76
+ calc77
+ calc78
+ calc79
+
+ update_hash %eax, %edx, %ebx, %esi, %edi
+ cmp %r8, %r10 // is the current block the last one?
+ je .Loop
+ mov %edx, %ecx
+
+ calc80
+ calc81
+ calc82
+ calc83
+ calc84
+ calc85
+ calc86
+ calc87
+ calc88
+ calc89
+ calc90
+ calc91
+ calc92
+ calc93
+ calc94
+ calc95
+ calc96
+ calc97
+ calc98
+ calc99
+ calc100
+ calc101
+ calc102
+ calc103
+ calc104
+ calc105
+ calc106
+ calc107
+ calc108
+ calc109
+ calc110
+ calc111
+ calc112
+ calc113
+ calc114
+ calc115
+ calc116
+ calc117
+ calc118
+ calc119
+ calc120
+ calc121
+ calc122
+ calc123
+ calc124
+ calc125
+ calc126
+ calc127
+ calc128
+ calc129
+ calc130
+ calc131
+ calc132
+ calc133
+ calc134
+ calc135
+ calc136
+ calc137
+ calc138
+ calc139
+
+ add $128, %r13 // move to the next even-64-byte block
+ cmp %r11, %r13 // is the current block the last one?
+ cmovae %r8, %r10
+
+ calc140
+ calc141
+ calc142
+ calc143
+ calc144
+ calc145
+ calc146
+ calc147
+ calc148
+ calc149
+ calc150
+ calc151
+ calc152
+ calc153
+ calc154
+ calc155
+ calc156
+ calc157
+ calc158
+ calc159
+
+ update_hash %esi, %edi, %edx, %ecx, %ebx
+ mov %esi, %r12d // reset state for AVX2 reg permutation
+ mov %edi, %esi
+ mov %edx, %edi
+ mov %ebx, %edx
+ mov %ecx, %eax
+ mov %r12d, %ecx
+ xchg %r14, %r15
+ jmp .Loop
+END(_libmd_sha1block_avx2)
+
+ .section .rodata
+ .balign 32
+k_xmm_ar:
+ .fill 8, 4, 0x5a827999
+ .fill 8, 4, 0x6ed9eba1
+ .fill 8, 4, 0x8f1bbcdc
+ .fill 8, 4, 0xca62c1d6
+ .size k_xmm_ar, .-k_xmm_ar
+
+bswap_shufb_ctl:
+ .4byte 0x00010203
+ .4byte 0x04050607
+ .4byte 0x08090a0b
+ .4byte 0x0c0d0e0f
+ .4byte 0x00010203
+ .4byte 0x04050607
+ .4byte 0x08090a0b
+ .4byte 0x0c0d0e0f
+ .size bswap_shufb_ctl, .-bswap_shufb_ctl
+
+ /*
+ * SHA1 implementation using the Intel SHA extensions (SHANI).
+ *
+ * Implemented according to the Intel white paper
+ *
+ * S. Gulley, V. Gopal, K. Yap, W. Feghali, J. Guilford,
+ * G. Wolrich: "Intel SHA Extensions: new instructions supporting
+ * the Secure Hash Algorithm on Intel® architecture processors",
+ * July 2013.
+ */
+ // sha1block(SHA1_CTX, buf, len)
+ENTRY(_libmd_sha1block_shani)
+ and $~63, %rdx // round length to block-size multiple
+ lea (%rsi, %rdx, 1), %rcx // end pointer
+ test %rdx, %rdx // nothing to do?
+ je 1f // if so, terminate immediately
+
+ movdqu (%rdi), %xmm6 // h0, h1, h2, h3
+ pxor %xmm7, %xmm7
+ pshufd $0x1b, %xmm6, %xmm6 // h3, h2, h1, h0
+ pinsrd $3, 16(%rdi), %xmm7 // h4 in the highest word of xmm7
+ movdqu shuf_mask(%rip), %xmm4
+
+ // main loop
+0: movdqa %xmm6, %xmm8 // stash ABCD
+ movdqa %xmm7, %xmm9 // stash E
+
+ // rounds 0--3
+ movdqu 0*16(%rsi), %xmm0 // load first message block
+ pshufb %xmm4, %xmm0 // and byte-swap
+ paddd %xmm0, %xmm7 // E += w[0]
+ movdqa %xmm6, %xmm5 // E' = A
+ sha1rnds4 $0, %xmm7, %xmm6 // perform rounds 0--3
+
+ // rounds 4--7
+ movdqu 1*16(%rsi), %xmm1
+ pshufb %xmm4, %xmm1
+ sha1nexte %xmm1, %xmm5
+ movdqa %xmm6, %xmm7
+ sha1rnds4 $0, %xmm5, %xmm6
+ sha1msg1 %xmm1, %xmm0
+
+ // rounds 8--11
+ movdqu 2*16(%rsi), %xmm2
+ pshufb %xmm4, %xmm2
+ sha1nexte %xmm2, %xmm7
+ movdqa %xmm6, %xmm5
+ sha1rnds4 $0, %xmm7, %xmm6
+ sha1msg1 %xmm2, %xmm1
+ pxor %xmm2, %xmm0
+
+.macro midround msg3, msg0, msg1, msg2, e1, e0, k
+ sha1nexte \msg3, \e1
+ movdqa %xmm6, \e0
+ sha1msg2 \msg3, \msg0
+ sha1rnds4 $\k, \e1, %xmm6
+ sha1msg1 \msg3, \msg2
+ pxor \msg3, \msg1
+.endm
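+
+/*
+ * In terms of the C intrinsics from <immintrin.h>, one midround group
+ * corresponds roughly to the following (illustrative sketch only, with
+ * abcd standing for %xmm6):
+ *
+ *	e1   = _mm_sha1nexte_epu32(e1, msg3);	// fold rol(E, 30) into W
+ *	e0   = abcd;				// current ABCD seeds next E
+ *	msg0 = _mm_sha1msg2_epu32(msg0, msg3);	// finish schedule update
+ *	abcd = _mm_sha1rnds4_epu32(abcd, e1, k);// 4 rounds, function k
+ *	msg2 = _mm_sha1msg1_epu32(msg2, msg3);	// start schedule update
+ *	msg1 = _mm_xor_si128(msg1, msg3);	// middle of schedule update
+ */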
+
+ movdqu 3*16(%rsi), %xmm3 // load third message block
+ pshufb %xmm4, %xmm3
+
+ add $4*16, %rsi
+
+ midround %xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 0 // 12--15
+ midround %xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 0 // 16--19
+ midround %xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 1 // 20--23
+ midround %xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 1 // 24--27
+ midround %xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 1 // 28--31
+ midround %xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 1 // 32--35
+ midround %xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 1 // 36--39
+ midround %xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 2 // 40--43
+ midround %xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 2 // 44--47
+ midround %xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 2 // 48--51
+ midround %xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 2 // 52--55
+ midround %xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 2 // 56--59
+ midround %xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 3 // 60--63
+ midround %xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 3 // 64--67
+
+ // rounds 68--71
+ sha1nexte %xmm1, %xmm5
+ movdqa %xmm6, %xmm7
+ sha1msg2 %xmm1, %xmm2
+ sha1rnds4 $3, %xmm5, %xmm6
+ pxor %xmm1, %xmm3
+
+ // rounds 72--75
+ sha1nexte %xmm2, %xmm7
+ movdqa %xmm6, %xmm5
+ sha1msg2 %xmm2, %xmm3
+ sha1rnds4 $3, %xmm7, %xmm6
+
+ // rounds 76--79
+ sha1nexte %xmm3, %xmm5
+ movdqa %xmm6, %xmm7
+ sha1rnds4 $3, %xmm5, %xmm6
+
+ sha1nexte %xmm9, %xmm7 // add saved E
+ paddd %xmm8, %xmm6 // add saved ABCD
+
+ cmp %rsi, %rcx // end reached?
+ jne 0b
+
+ pshufd $0x1b, %xmm6, %xmm6 // restore order of h0--h3
+ movdqu %xmm6, (%rdi) // write h0--h3
+ pextrd $3, %xmm7, 16(%rdi) // write h4
+1: ret
+END(_libmd_sha1block_shani)
+
+ .section .rodata
+ .balign 16
+shuf_mask:
+ .8byte 0x08090a0b0c0d0e0f
+ .8byte 0x0001020304050607
+ .size shuf_mask, .-shuf_mask
+
+ .section .note.GNU-stack,"",%progbits
diff --git a/lib/libmd/amd64/sha1dispatch.c b/lib/libmd/amd64/sha1dispatch.c
new file mode 100644
index 000000000000..86509195d56e
--- /dev/null
+++ b/lib/libmd/amd64/sha1dispatch.c
@@ -0,0 +1,77 @@
+/*-
+ * Copyright (c) 2016 The Go Authors. All rights reserved.
+ * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
+ *
+ * Adapted from Go's crypto/sha1/sha1block_amd64.go.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <machine/specialreg.h>
+#include <sha.h>
+#include <x86/ifunc.h>
+
+extern void _libmd_sha1block_scalar(SHA1_CTX *, const void *, size_t);
+extern void _libmd_sha1block_avx2(SHA1_CTX *, const void *, size_t);
+extern void _libmd_sha1block_shani(SHA1_CTX *, const void *, size_t);
+static void sha1block_avx2_wrapper(SHA1_CTX *, const void *, size_t);
+
+#define AVX2_STDEXT_NEEDED \
+ (CPUID_STDEXT_BMI1 | CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2)
+
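+/*
+ * DEFINE_UIFUNC (from <x86/ifunc.h>) makes sha1_block an ifunc: the
+ * body below serves as the resolver, runs once when the dynamic linker
+ * resolves the symbol, and subsequent sha1_block() calls go straight to
+ * the implementation it returned, so the CPU feature test is not
+ * repeated per call.
+ */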
+DEFINE_UIFUNC(, void, sha1_block, (SHA1_CTX *, const void *, size_t))
+{
+ if (cpu_stdext_feature & CPUID_STDEXT_SHA)
+ return (_libmd_sha1block_shani);
+ if ((cpu_stdext_feature & AVX2_STDEXT_NEEDED) == AVX2_STDEXT_NEEDED)
+ return (sha1block_avx2_wrapper);
+ else
+ return (_libmd_sha1block_scalar);
+}
+
+static void
+sha1block_avx2_wrapper(SHA1_CTX *c, const void *data, size_t len)
+{
+ if (len >= 256) {
+ /*
+ * sha1block_avx2 processes two blocks per iteration and also
+ * interleaves the precalculation for the next block pair, so it
+ * may read up to 192 bytes past the end of the buffer passed to
+ * it.  Adding bounds checks inside sha1block_avx2 would just
+ * turn it into a copy of sha1block_scalar, so instead we hand
+ * the tail of the input to the scalar routine.
+ */
+ size_t safe_len = len - 128;
+
+ if (safe_len % 128 != 0)
+ safe_len -= 64;
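+ /*
+ * Worked example (len being a multiple of the 64-byte block
+ * size): for len = 448, safe_len starts at 320 and is reduced
+ * to 256, so the AVX2 code hashes four blocks and the scalar
+ * code the remaining three.
+ */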
+
+ _libmd_sha1block_avx2(c, data, safe_len);
+ _libmd_sha1block_scalar(c, data + safe_len, len - safe_len);
+ } else
+ _libmd_sha1block_scalar(c, data, len);
+}