about | summary | refs | log | tree | commit | diff
path: root/sys/dev
diff options
context:
space:
mode:
Diffstat (limited to 'sys/dev')
-rw-r--r--sys/dev/if_wg/crypto.c1705
-rw-r--r--sys/dev/if_wg/crypto.h114
-rw-r--r--sys/dev/if_wg/if_wg.c3454
-rw-r--r--sys/dev/if_wg/if_wg.h36
-rw-r--r--sys/dev/if_wg/include/crypto/blake2s.h56
-rw-r--r--sys/dev/if_wg/include/crypto/curve25519.h74
-rw-r--r--sys/dev/if_wg/include/crypto/zinc.h15
-rw-r--r--sys/dev/if_wg/include/sys/if_wg_session.h89
-rw-r--r--sys/dev/if_wg/include/sys/if_wg_session_vars.h319
-rw-r--r--sys/dev/if_wg/include/sys/simd-x86_64.h74
-rw-r--r--sys/dev/if_wg/include/sys/support.h342
-rw-r--r--sys/dev/if_wg/include/sys/wg_module.h121
-rw-r--r--sys/dev/if_wg/include/sys/wg_noise.h286
-rw-r--r--sys/dev/if_wg/include/zinc/blake2s.h50
-rw-r--r--sys/dev/if_wg/include/zinc/chacha20.h68
-rw-r--r--sys/dev/if_wg/include/zinc/chacha20poly1305.h48
-rw-r--r--sys/dev/if_wg/include/zinc/curve25519.h28
-rw-r--r--sys/dev/if_wg/include/zinc/poly1305.h29
-rw-r--r--sys/dev/if_wg/module/blake2s.c256
-rw-r--r--sys/dev/if_wg/module/blake2s.h58
-rw-r--r--sys/dev/if_wg/module/chacha20-x86_64.S2834
-rw-r--r--sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm-glue.c98
-rwxr-xr-xsys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm.pl1227
-rwxr-xr-xsys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm64.pl1163
-rw-r--r--sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips-glue.c27
-rw-r--r--sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips.S424
-rw-r--r--sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64-glue.c132
-rwxr-xr-xsys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64.pl4106
-rw-r--r--sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20.c238
-rw-r--r--sys/dev/if_wg/module/crypto/zinc/chacha20poly1305.c196
-rw-r--r--sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm-glue.c140
-rwxr-xr-xsys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm.pl1276
-rwxr-xr-xsys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm64.pl974
-rw-r--r--sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-donna32.c205
-rw-r--r--sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-donna64.c182
-rw-r--r--sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-mips-glue.c37
-rw-r--r--sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-mips.S407
-rwxr-xr-xsys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-mips64.pl467
-rw-r--r--sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-x86_64-glue.c171
-rwxr-xr-xsys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-x86_64.pl4266
-rw-r--r--sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305.c163
-rw-r--r--sys/dev/if_wg/module/crypto/zinc/selftest/blake2s.c2090
-rw-r--r--sys/dev/if_wg/module/crypto/zinc/selftest/chacha20.c2703
-rw-r--r--sys/dev/if_wg/module/crypto/zinc/selftest/chacha20poly1305.c8443
-rw-r--r--sys/dev/if_wg/module/crypto/zinc/selftest/curve25519.c1315
-rw-r--r--sys/dev/if_wg/module/crypto/zinc/selftest/poly1305.c1110
-rw-r--r--sys/dev/if_wg/module/crypto/zinc/selftest/run.h43
-rw-r--r--sys/dev/if_wg/module/curve25519.c867
-rw-r--r--sys/dev/if_wg/module/if_wg_session.c1984
-rw-r--r--sys/dev/if_wg/module/module.c954
-rw-r--r--sys/dev/if_wg/module/poly1305-x86_64.S3021
-rw-r--r--sys/dev/if_wg/support.h56
-rw-r--r--sys/dev/if_wg/wg_cookie.c (renamed from sys/dev/if_wg/module/wg_cookie.c)105
-rw-r--r--sys/dev/if_wg/wg_cookie.h (renamed from sys/dev/if_wg/include/sys/wg_cookie.h)81
-rw-r--r--sys/dev/if_wg/wg_noise.c (renamed from sys/dev/if_wg/module/wg_noise.c)409
-rw-r--r--sys/dev/if_wg/wg_noise.h191
56 files changed, 5851 insertions, 43476 deletions
diff --git a/sys/dev/if_wg/crypto.c b/sys/dev/if_wg/crypto.c
new file mode 100644
index 000000000000..f28585429272
--- /dev/null
+++ b/sys/dev/if_wg/crypto.c
@@ -0,0 +1,1705 @@
+/*
+ * Copyright (C) 2015-2021 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/types.h>
+#include <sys/endian.h>
+#include <sys/systm.h>
+
+#include "crypto.h"
+
+/*
+ * Fallback definitions for helper macros; each one is supplied here only if
+ * an earlier header has not already defined it.
+ */
+#ifndef ARRAY_SIZE
+/* Element count of a true array (meaningless when applied to a pointer). */
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+#ifndef noinline
+#define noinline __attribute__((noinline))
+#endif
+#ifndef __aligned
+#define __aligned(x) __attribute__((aligned(x)))
+#endif
+#ifndef DIV_ROUND_UP
+/* n divided by d, rounded up (for non-negative n and positive d). */
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+#endif
+
+/*
+ * Endian-conversion aliases built on le32toh()/le64toh()/htole32()/htole64().
+ * The *_cpup variants take a pointer and dereference it before converting.
+ */
+#define le32_to_cpup(a) le32toh(*(a))
+#define le64_to_cpup(a) le64toh(*(a))
+#define cpu_to_le32(a) htole32(a)
+#define cpu_to_le64(a) htole64(a)
+
+/*
+ * Load a little-endian 32-bit value from a byte pointer that need not be
+ * aligned: the bytes are first copied into a local, then converted.
+ */
+static inline uint32_t get_unaligned_le32(const uint8_t *a)
+{
+	uint32_t raw;
+
+	__builtin_memcpy(&raw, a, sizeof(raw));
+	return le32_to_cpup(&raw);
+}
+/*
+ * Load a little-endian 64-bit value from a byte pointer that need not be
+ * aligned: the bytes are first copied into a local, then converted.
+ */
+static inline uint64_t get_unaligned_le64(const uint8_t *a)
+{
+	uint64_t raw;
+
+	__builtin_memcpy(&raw, a, sizeof(raw));
+	return le64_to_cpup(&raw);
+}
+/*
+ * Store s at d as four little-endian bytes; d need not be aligned.
+ */
+static inline void put_unaligned_le32(uint32_t s, uint8_t *d)
+{
+	const uint32_t wire = cpu_to_le32(s);
+
+	__builtin_memcpy(d, &wire, sizeof(wire));
+}
+/*
+ * Convert `words` 32-bit values in buf from host to little-endian order,
+ * in place.
+ */
+static inline void cpu_to_le32_array(uint32_t *buf, unsigned int words)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < words; ++idx)
+		buf[idx] = cpu_to_le32(buf[idx]);
+}
+/*
+ * Convert `words` 32-bit values in buf from little-endian to host order,
+ * in place.
+ */
+static inline void le32_to_cpu_array(uint32_t *buf, unsigned int words)
+{
+	unsigned int idx;
+
+	for (idx = 0; idx < words; ++idx)
+		buf[idx] = le32_to_cpup(&buf[idx]);
+}
+
+/*
+ * Rotate word left by shift bits. Only the low five bits of shift are used,
+ * and both partial shifts are masked so a rotation by 0 (or 32) is well
+ * defined.
+ */
+static inline uint32_t rol32(uint32_t word, unsigned int shift)
+{
+	const unsigned int up = shift & 31;
+	const unsigned int down = (-shift) & 31;
+
+	return (word << up) | (word >> down);
+}
+/*
+ * Rotate word right by shift bits. Only the low five bits of shift are used,
+ * and both partial shifts are masked so a rotation by 0 (or 32) is well
+ * defined.
+ */
+static inline uint32_t ror32(uint32_t word, unsigned int shift)
+{
+	const unsigned int down = shift & 31;
+	const unsigned int up = (-shift) & 31;
+
+	return (word >> down) | (word << up);
+}
+
+/*
+ * dst[i] = src1[i] ^ src2[i] for the first len bytes. Each byte is read
+ * before the same index is written, so dst may be the same buffer as
+ * either source.
+ */
+static void xor_cpy(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+    size_t len)
+{
+	uint8_t *const stop = dst + len;
+
+	while (dst != stop)
+		*dst++ = *src1++ ^ *src2++;
+}
+
+/*
+ * One ChaCha quarter round on the four words x[a], x[b], x[c], x[d]:
+ * four add / xor / rotate-left steps with rotation counts 16, 12, 8 and 7.
+ * Written as a single comma expression so that several rounds can be chained
+ * inside another expression macro.
+ */
+#define QUARTER_ROUND(x, a, b, c, d) ( \
+ x[a] += x[b], \
+ x[d] = rol32((x[d] ^ x[a]), 16), \
+ x[c] += x[d], \
+ x[b] = rol32((x[b] ^ x[c]), 12), \
+ x[a] += x[b], \
+ x[d] = rol32((x[d] ^ x[a]), 8), \
+ x[c] += x[d], \
+ x[b] = rol32((x[b] ^ x[c]), 7) \
+)
+
+/*
+ * Index of row i, column j of the 4x4 word matrix stored row-major in a
+ * 16-word array. Both arguments are parenthesized so that expression
+ * arguments such as C(r + 1, 0) expand with the intended precedence.
+ */
+#define C(i, j) ((i) * 4 + (j))
+
+/*
+ * One ChaCha double round on the 16-word array x: four quarter rounds down
+ * the columns of the 4x4 matrix followed by four along its diagonals.
+ */
+#define DOUBLE_ROUND(x) ( \
+ /* Column Round */ \
+ QUARTER_ROUND(x, C(0, 0), C(1, 0), C(2, 0), C(3, 0)), \
+ QUARTER_ROUND(x, C(0, 1), C(1, 1), C(2, 1), C(3, 1)), \
+ QUARTER_ROUND(x, C(0, 2), C(1, 2), C(2, 2), C(3, 2)), \
+ QUARTER_ROUND(x, C(0, 3), C(1, 3), C(2, 3), C(3, 3)), \
+ /* Diagonal Round */ \
+ QUARTER_ROUND(x, C(0, 0), C(1, 1), C(2, 2), C(3, 3)), \
+ QUARTER_ROUND(x, C(0, 1), C(1, 2), C(2, 3), C(3, 0)), \
+ QUARTER_ROUND(x, C(0, 2), C(1, 3), C(2, 0), C(3, 1)), \
+ QUARTER_ROUND(x, C(0, 3), C(1, 0), C(2, 1), C(3, 2)) \
+)
+
+/*
+ * The full 20-round ChaCha permutation of x, expressed as ten unrolled
+ * double rounds.
+ */
+#define TWENTY_ROUNDS(x) ( \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x), \
+ DOUBLE_ROUND(x) \
+)
+
+/*
+ * Sizes used by the ChaCha20 / HChaCha20 code below. *_SIZE values are in
+ * bytes; *_WORDS values are the same quantity counted in 32-bit words.
+ */
+enum chacha20_lengths {
+ CHACHA20_NONCE_SIZE = 16,
+ CHACHA20_KEY_SIZE = 32,
+ CHACHA20_KEY_WORDS = CHACHA20_KEY_SIZE / sizeof(uint32_t),
+ CHACHA20_BLOCK_SIZE = 64,
+ CHACHA20_BLOCK_WORDS = CHACHA20_BLOCK_SIZE / sizeof(uint32_t),
+ HCHACHA20_NONCE_SIZE = CHACHA20_NONCE_SIZE,
+ HCHACHA20_KEY_SIZE = CHACHA20_KEY_SIZE
+};
+
+/*
+ * The four constant words that open every ChaCha state: the ASCII string
+ * "expand 32-byte k" split into 4-byte groups, each read as a little-endian
+ * 32-bit integer.
+ */
+enum chacha20_constants { /* expand 32-byte k */
+ CHACHA20_CONSTANT_EXPA = 0x61707865U,
+ CHACHA20_CONSTANT_ND_3 = 0x3320646eU,
+ CHACHA20_CONSTANT_2_BY = 0x79622d32U,
+ CHACHA20_CONSTANT_TE_K = 0x6b206574U
+};
+
+/*
+ * ChaCha20 cipher state. The same 16 words can be addressed either as the
+ * flat array `state` (used by the block function) or through the named
+ * fields of the anonymous struct (used when initializing).
+ */
+struct chacha20_ctx {
+ union {
+ uint32_t state[16];
+ struct {
+ uint32_t constant[4]; /* words 0-3: fixed constants */
+ uint32_t key[8]; /* words 4-11: key */
+ /*
+ * words 12-15: chacha20_init() zeroes [0] and [1] and puts
+ * the 64-bit nonce in [2] and [3]; chacha20_block()
+ * increments [0] after each block.
+ */
+ uint32_t counter[4];
+ };
+ };
+};
+
+/*
+ * Prepare ctx for a fresh keystream: constants, the 32-byte key read as
+ * eight little-endian words, a zero block counter in counter[0..1], and the
+ * 64-bit nonce split low word first into counter[2..3].
+ */
+static void chacha20_init(struct chacha20_ctx *ctx,
+    const uint8_t key[CHACHA20_KEY_SIZE],
+    const uint64_t nonce)
+{
+	size_t word;
+
+	ctx->constant[0] = CHACHA20_CONSTANT_EXPA;
+	ctx->constant[1] = CHACHA20_CONSTANT_ND_3;
+	ctx->constant[2] = CHACHA20_CONSTANT_2_BY;
+	ctx->constant[3] = CHACHA20_CONSTANT_TE_K;
+
+	for (word = 0; word < CHACHA20_KEY_SIZE / sizeof(uint32_t); ++word)
+		ctx->key[word] =
+		    get_unaligned_le32(key + word * sizeof(uint32_t));
+
+	ctx->counter[0] = 0;
+	ctx->counter[1] = 0;
+	ctx->counter[2] = (uint32_t)(nonce & 0xffffffffU);
+	ctx->counter[3] = (uint32_t)(nonce >> 32);
+}
+
+/*
+ * Produce one 64-byte keystream block from ctx into stream (sixteen words,
+ * stored little-endian), then advance the block counter in counter[0].
+ * The counter is 32 bits wide and does not carry into counter[1].
+ */
+static void chacha20_block(struct chacha20_ctx *ctx, uint32_t *stream)
+{
+	uint32_t work[CHACHA20_BLOCK_WORDS];
+	size_t word;
+
+	/* Work on a copy: the untouched input state is added back below. */
+	memcpy(work, ctx->state, sizeof(work));
+
+	TWENTY_ROUNDS(work);
+
+	for (word = 0; word < ARRAY_SIZE(work); ++word)
+		stream[word] = cpu_to_le32(work[word] + ctx->state[word]);
+
+	++ctx->counter[0];
+}
+
+/*
+ * XOR len bytes of in with the ChaCha20 keystream of ctx and write the
+ * result to out (out may be the same buffer as in). Encryption and
+ * decryption are the same operation.
+ *
+ * Each call starts on a fresh keystream block: when len is not a multiple of
+ * CHACHA20_BLOCK_SIZE the unused tail of the last block is discarded rather
+ * than carried over to the next call.
+ *
+ * len is a size_t so that the size_t lengths passed by the AEAD entry points
+ * are not truncated on 64-bit platforms. Note that chacha20_block() only
+ * advances the 32-bit counter[0], so one context cannot produce more than
+ * 2^32 distinct blocks.
+ */
+static void chacha20(struct chacha20_ctx *ctx, uint8_t *out, const uint8_t *in,
+    size_t len)
+{
+	uint32_t buf[CHACHA20_BLOCK_WORDS];
+
+	while (len >= CHACHA20_BLOCK_SIZE) {
+		chacha20_block(ctx, buf);
+		xor_cpy(out, in, (uint8_t *)buf, CHACHA20_BLOCK_SIZE);
+		len -= CHACHA20_BLOCK_SIZE;
+		out += CHACHA20_BLOCK_SIZE;
+		in += CHACHA20_BLOCK_SIZE;
+	}
+	if (len) {
+		chacha20_block(ctx, buf);
+		xor_cpy(out, in, (uint8_t *)buf, len);
+	}
+
+	/* Do not leave raw keystream behind on the stack. */
+	explicit_bzero(buf, sizeof(buf));
+}
+
+/*
+ * HChaCha20: derive an eight-word subkey from a 32-byte key and a 16-byte
+ * nonce. The state is constants | key | nonce; after the 20-round
+ * permutation the first four and last four words are the result. No
+ * feed-forward addition is performed and derived_key is left in host word
+ * order.
+ */
+static void hchacha20(uint32_t derived_key[CHACHA20_KEY_WORDS],
+    const uint8_t nonce[HCHACHA20_NONCE_SIZE],
+    const uint8_t key[HCHACHA20_KEY_SIZE])
+{
+	uint32_t x[CHACHA20_BLOCK_WORDS];
+	unsigned int w;
+
+	x[0] = CHACHA20_CONSTANT_EXPA;
+	x[1] = CHACHA20_CONSTANT_ND_3;
+	x[2] = CHACHA20_CONSTANT_2_BY;
+	x[3] = CHACHA20_CONSTANT_TE_K;
+	for (w = 0; w < 8; ++w)
+		x[4 + w] = get_unaligned_le32(key + 4 * w);
+	for (w = 0; w < 4; ++w)
+		x[12 + w] = get_unaligned_le32(nonce + 4 * w);
+
+	TWENTY_ROUNDS(x);
+
+	for (w = 0; w < 4; ++w) {
+		derived_key[w] = x[w];
+		derived_key[4 + w] = x[12 + w];
+	}
+}
+
+/* Poly1305 sizes in bytes: input block, one-time key, and output tag. */
+enum poly1305_lengths {
+ POLY1305_BLOCK_SIZE = 16,
+ POLY1305_KEY_SIZE = 32,
+ POLY1305_MAC_SIZE = 16
+};
+
+/*
+ * Poly1305 arithmetic state, held as 26-bit limbs in 32-bit words.
+ */
+struct poly1305_internal {
+ uint32_t h[5]; /* running accumulator */
+ uint32_t r[5]; /* multiplier taken from the first 16 key bytes, masked */
+ uint32_t s[4]; /* 5 * r[1..4], precomputed for the modular reduction */
+};
+
+/*
+ * Streaming Poly1305 context: the arithmetic state plus buffering for input
+ * that arrives in pieces smaller than one block.
+ */
+struct poly1305_ctx {
+ struct poly1305_internal state;
+ uint32_t nonce[4]; /* key bytes 16..31 as words; added to h at the end */
+ uint8_t data[POLY1305_BLOCK_SIZE]; /* partial block awaiting more input */
+ size_t num; /* number of valid bytes in data[] */
+};
+
+/*
+ * Load the multiplier r from the first 16 bytes of key into five 26-bit
+ * limbs (the per-limb masks apply the r &= 0xffffffc0ffffffc0ffffffc0fffffff
+ * clamp), precompute s = 5 * r for limbs 1..4, and clear the accumulator h.
+ */
+static void poly1305_init_core(struct poly1305_internal *st,
+    const uint8_t key[16])
+{
+	unsigned int limb;
+
+	/* Overlapping unaligned loads, shifted so each limb starts at bit 0. */
+	st->r[0] = (get_unaligned_le32(&key[0])) & 0x3ffffff;
+	st->r[1] = (get_unaligned_le32(&key[3]) >> 2) & 0x3ffff03;
+	st->r[2] = (get_unaligned_le32(&key[6]) >> 4) & 0x3ffc0ff;
+	st->r[3] = (get_unaligned_le32(&key[9]) >> 6) & 0x3f03fff;
+	st->r[4] = (get_unaligned_le32(&key[12]) >> 8) & 0x00fffff;
+
+	for (limb = 0; limb < 4; ++limb)
+		st->s[limb] = st->r[limb + 1] * 5;
+
+	for (limb = 0; limb < 5; ++limb)
+		st->h[limb] = 0;
+}
+
+/*
+ * Absorb input into the accumulator st->h, one 16-byte block at a time:
+ * h = (h + block) * r, partially reduced. Bytes beyond the last whole block
+ * in len are ignored.
+ *
+ * padbit is shifted to bit 24 of the top limb, which is bit 128 of the
+ * block value. Callers in this file pass 1 for ordinary full blocks and 0
+ * for a final block that already contains its own 0x01 terminator byte.
+ */
+static void poly1305_blocks_core(struct poly1305_internal *st,
+ const uint8_t *input, size_t len,
+ const uint32_t padbit)
+{
+ const uint32_t hibit = padbit << 24;
+ uint32_t r0, r1, r2, r3, r4;
+ uint32_t s1, s2, s3, s4;
+ uint32_t h0, h1, h2, h3, h4;
+ uint64_t d0, d1, d2, d3, d4;
+ uint32_t c;
+
+ /* Work on local copies; only h is written back at the end. */
+ r0 = st->r[0];
+ r1 = st->r[1];
+ r2 = st->r[2];
+ r3 = st->r[3];
+ r4 = st->r[4];
+
+ s1 = st->s[0];
+ s2 = st->s[1];
+ s3 = st->s[2];
+ s4 = st->s[3];
+
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+ h3 = st->h[3];
+ h4 = st->h[4];
+
+ while (len >= POLY1305_BLOCK_SIZE) {
+ /* h += m[i] */
+ /* Overlapping loads split the 128-bit block into 26-bit limbs. */
+ h0 += (get_unaligned_le32(&input[0])) & 0x3ffffff;
+ h1 += (get_unaligned_le32(&input[3]) >> 2) & 0x3ffffff;
+ h2 += (get_unaligned_le32(&input[6]) >> 4) & 0x3ffffff;
+ h3 += (get_unaligned_le32(&input[9]) >> 6) & 0x3ffffff;
+ h4 += (get_unaligned_le32(&input[12]) >> 8) | hibit;
+
+ /* h *= r */
+ /* Products that would overflow limb 4 use s = 5 * r instead of r. */
+ d0 = ((uint64_t)h0 * r0) + ((uint64_t)h1 * s4) +
+ ((uint64_t)h2 * s3) + ((uint64_t)h3 * s2) +
+ ((uint64_t)h4 * s1);
+ d1 = ((uint64_t)h0 * r1) + ((uint64_t)h1 * r0) +
+ ((uint64_t)h2 * s4) + ((uint64_t)h3 * s3) +
+ ((uint64_t)h4 * s2);
+ d2 = ((uint64_t)h0 * r2) + ((uint64_t)h1 * r1) +
+ ((uint64_t)h2 * r0) + ((uint64_t)h3 * s4) +
+ ((uint64_t)h4 * s3);
+ d3 = ((uint64_t)h0 * r3) + ((uint64_t)h1 * r2) +
+ ((uint64_t)h2 * r1) + ((uint64_t)h3 * r0) +
+ ((uint64_t)h4 * s4);
+ d4 = ((uint64_t)h0 * r4) + ((uint64_t)h1 * r3) +
+ ((uint64_t)h2 * r2) + ((uint64_t)h3 * r1) +
+ ((uint64_t)h4 * r0);
+
+ /* (partial) h %= p */
+ /* Propagate carries upward; the carry out of limb 4 re-enters limb 0 times 5. */
+ c = (uint32_t)(d0 >> 26);
+ h0 = (uint32_t)d0 & 0x3ffffff;
+ d1 += c;
+ c = (uint32_t)(d1 >> 26);
+ h1 = (uint32_t)d1 & 0x3ffffff;
+ d2 += c;
+ c = (uint32_t)(d2 >> 26);
+ h2 = (uint32_t)d2 & 0x3ffffff;
+ d3 += c;
+ c = (uint32_t)(d3 >> 26);
+ h3 = (uint32_t)d3 & 0x3ffffff;
+ d4 += c;
+ c = (uint32_t)(d4 >> 26);
+ h4 = (uint32_t)d4 & 0x3ffffff;
+ h0 += c * 5;
+ c = (h0 >> 26);
+ h0 = h0 & 0x3ffffff;
+ h1 += c;
+
+ input += POLY1305_BLOCK_SIZE;
+ len -= POLY1305_BLOCK_SIZE;
+ }
+
+ st->h[0] = h0;
+ st->h[1] = h1;
+ st->h[2] = h2;
+ st->h[3] = h3;
+ st->h[4] = h4;
+}
+
+/*
+ * Produce the 16-byte tag from the accumulator in st:
+ *   1. finish carrying h so every limb fits in 26 bits;
+ *   2. compute g = h + 5 - 2^130 and, using a mask instead of a branch,
+ *      keep g when that subtraction did not go negative and h otherwise;
+ *   3. repack the five limbs into four 32-bit words (h mod 2^128);
+ *   4. add the four nonce words with carry, modulo 2^128;
+ *   5. store the result little-endian into mac.
+ * st itself is only read.
+ */
+static void poly1305_emit_core(struct poly1305_internal *st, uint8_t mac[16],
+ const uint32_t nonce[4])
+{
+ uint32_t h0, h1, h2, h3, h4, c;
+ uint32_t g0, g1, g2, g3, g4;
+ uint64_t f;
+ uint32_t mask;
+
+ /* fully carry h */
+ h0 = st->h[0];
+ h1 = st->h[1];
+ h2 = st->h[2];
+ h3 = st->h[3];
+ h4 = st->h[4];
+
+ c = h1 >> 26;
+ h1 = h1 & 0x3ffffff;
+ h2 += c;
+ c = h2 >> 26;
+ h2 = h2 & 0x3ffffff;
+ h3 += c;
+ c = h3 >> 26;
+ h3 = h3 & 0x3ffffff;
+ h4 += c;
+ c = h4 >> 26;
+ h4 = h4 & 0x3ffffff;
+ h0 += c * 5;
+ c = h0 >> 26;
+ h0 = h0 & 0x3ffffff;
+ h1 += c;
+
+ /* compute h + -p */
+ g0 = h0 + 5;
+ c = g0 >> 26;
+ g0 &= 0x3ffffff;
+ g1 = h1 + c;
+ c = g1 >> 26;
+ g1 &= 0x3ffffff;
+ g2 = h2 + c;
+ c = g2 >> 26;
+ g2 &= 0x3ffffff;
+ g3 = h3 + c;
+ c = g3 >> 26;
+ g3 &= 0x3ffffff;
+ g4 = h4 + c - (1UL << 26);
+
+ /* select h if h < p, or h + -p if h >= p */
+ /* mask is all ones when the top bit of g4 is clear, zero when it is set. */
+ mask = (g4 >> ((sizeof(uint32_t) * 8) - 1)) - 1;
+ g0 &= mask;
+ g1 &= mask;
+ g2 &= mask;
+ g3 &= mask;
+ g4 &= mask;
+ mask = ~mask;
+
+ h0 = (h0 & mask) | g0;
+ h1 = (h1 & mask) | g1;
+ h2 = (h2 & mask) | g2;
+ h3 = (h3 & mask) | g3;
+ h4 = (h4 & mask) | g4;
+
+ /* h = h % (2^128) */
+ h0 = ((h0) | (h1 << 26)) & 0xffffffff;
+ h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff;
+ h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
+ h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff;
+
+ /* mac = (h + nonce) % (2^128) */
+ f = (uint64_t)h0 + nonce[0];
+ h0 = (uint32_t)f;
+ f = (uint64_t)h1 + nonce[1] + (f >> 32);
+ h1 = (uint32_t)f;
+ f = (uint64_t)h2 + nonce[2] + (f >> 32);
+ h2 = (uint32_t)f;
+ f = (uint64_t)h3 + nonce[3] + (f >> 32);
+ h3 = (uint32_t)f;
+
+ put_unaligned_le32(h0, &mac[0]);
+ put_unaligned_le32(h1, &mac[4]);
+ put_unaligned_le32(h2, &mac[8]);
+ put_unaligned_le32(h3, &mac[12]);
+}
+
+/*
+ * Start a Poly1305 computation with a 32-byte one-time key: bytes 16..31
+ * are saved as the four nonce words added at the end, bytes 0..15 become
+ * the multiplier, and the partial-block buffer starts empty.
+ */
+static void poly1305_init(struct poly1305_ctx *ctx,
+    const uint8_t key[POLY1305_KEY_SIZE])
+{
+	unsigned int word;
+
+	for (word = 0; word < 4; ++word)
+		ctx->nonce[word] = get_unaligned_le32(&key[16 + 4 * word]);
+
+	poly1305_init_core(&ctx->state, key);
+
+	ctx->num = 0;
+}
+
+/*
+ * Feed len bytes of input into the MAC. Input may arrive in arbitrary
+ * pieces: bytes that do not complete a 16-byte block are kept in ctx->data
+ * (ctx->num of them) until a later call or poly1305_final().
+ */
+static void poly1305_update(struct poly1305_ctx *ctx, const uint8_t *input,
+    size_t len)
+{
+	const size_t buffered = ctx->num;
+	size_t tail;
+
+	if (buffered != 0) {
+		const size_t room = POLY1305_BLOCK_SIZE - buffered;
+
+		/* Still not a full block: just stash the bytes. */
+		if (len < room) {
+			memcpy(ctx->data + buffered, input, len);
+			ctx->num = buffered + len;
+			return;
+		}
+
+		/* Top up the pending block and process it. */
+		memcpy(ctx->data + buffered, input, room);
+		poly1305_blocks_core(&ctx->state, ctx->data,
+		    POLY1305_BLOCK_SIZE, 1);
+		input += room;
+		len -= room;
+	}
+
+	/* Process all whole blocks straight from the caller's buffer. */
+	tail = len % POLY1305_BLOCK_SIZE;
+	len -= tail;
+	if (len != 0) {
+		poly1305_blocks_core(&ctx->state, input, len, 1);
+		input += len;
+	}
+
+	/* Keep whatever is left for next time. */
+	if (tail != 0)
+		memcpy(ctx->data, input, tail);
+	ctx->num = tail;
+}
+
+/*
+ * Finish the MAC and write the 16-byte tag to mac. A pending partial block
+ * is terminated with a 0x01 byte, zero-filled to 16 bytes and processed
+ * with padbit 0. The whole context is wiped before returning, so it must be
+ * re-initialized before any further use.
+ */
+static void poly1305_final(struct poly1305_ctx *ctx,
+    uint8_t mac[POLY1305_MAC_SIZE])
+{
+	size_t fill = ctx->num;
+
+	if (fill != 0) {
+		ctx->data[fill] = 1;
+		for (++fill; fill < POLY1305_BLOCK_SIZE; ++fill)
+			ctx->data[fill] = 0;
+		poly1305_blocks_core(&ctx->state, ctx->data,
+		    POLY1305_BLOCK_SIZE, 0);
+	}
+
+	poly1305_emit_core(&ctx->state, mac, ctx->nonce);
+
+	explicit_bzero(ctx, sizeof(*ctx));
+}
+
+
+/*
+ * Sixteen zero bytes: the AEAD routines feed a prefix of this to Poly1305 to
+ * pad the associated data and the ciphertext up to a 16-byte boundary.
+ */
+static const uint8_t pad0[16] = { 0 };
+
+/*
+ * ChaCha20-Poly1305 AEAD encryption.
+ *
+ * Encrypts src_len bytes from src into dst and appends the 16-byte Poly1305
+ * tag at dst + src_len, so dst must have room for src_len +
+ * POLY1305_MAC_SIZE bytes. The tag covers ad (ad_len bytes, authenticated
+ * but not encrypted), the ciphertext, and both lengths.
+ *
+ * nonce is the 64-bit per-message nonce; key is the 32-byte secret key.
+ */
+void
+chacha20poly1305_encrypt(uint8_t *dst, const uint8_t *src, const size_t src_len,
+ const uint8_t *ad, const size_t ad_len,
+ const uint64_t nonce,
+ const uint8_t key[CHACHA20POLY1305_KEY_SIZE])
+{
+ struct poly1305_ctx poly1305_state;
+ struct chacha20_ctx chacha20_state;
+ /* Scratch reused in sequence: first the Poly1305 key, later the lengths. */
+ union {
+ uint8_t block0[POLY1305_KEY_SIZE];
+ uint64_t lens[2];
+ } b = { { 0 } };
+
+ /*
+ * Encrypting 32 zero bytes yields the one-time Poly1305 key and consumes
+ * keystream block 0, so the payload below starts at block 1.
+ */
+ chacha20_init(&chacha20_state, key, nonce);
+ chacha20(&chacha20_state, b.block0, b.block0, sizeof(b.block0));
+ poly1305_init(&poly1305_state, b.block0);
+
+ /* MAC the associated data, zero-padded to a multiple of 16 bytes. */
+ poly1305_update(&poly1305_state, ad, ad_len);
+ poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf);
+
+ chacha20(&chacha20_state, dst, src, src_len);
+
+ /* MAC the ciphertext (not the plaintext), padded the same way. */
+ poly1305_update(&poly1305_state, dst, src_len);
+ poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf);
+
+ /* Finally MAC both lengths as little-endian 64-bit values. */
+ b.lens[0] = cpu_to_le64(ad_len);
+ b.lens[1] = cpu_to_le64(src_len);
+ poly1305_update(&poly1305_state, (uint8_t *)b.lens, sizeof(b.lens));
+
+ /* poly1305_final() also wipes poly1305_state. */
+ poly1305_final(&poly1305_state, dst + src_len);
+
+ explicit_bzero(&chacha20_state, sizeof(chacha20_state));
+ explicit_bzero(&b, sizeof(b));
+}
+
+/*
+ * ChaCha20-Poly1305 AEAD decryption.
+ *
+ * src holds src_len bytes: the ciphertext followed by its 16-byte tag. The
+ * tag is recomputed over ad, the ciphertext and both lengths, and compared
+ * with timingsafe_bcmp(). Only when it matches are the src_len -
+ * POLY1305_MAC_SIZE plaintext bytes written to dst.
+ *
+ * Returns true on success; false when src_len is shorter than a tag or the
+ * tag does not match (dst is not written in either failure case).
+ */
+bool
+chacha20poly1305_decrypt(uint8_t *dst, const uint8_t *src, const size_t src_len,
+ const uint8_t *ad, const size_t ad_len,
+ const uint64_t nonce,
+ const uint8_t key[CHACHA20POLY1305_KEY_SIZE])
+{
+ struct poly1305_ctx poly1305_state;
+ struct chacha20_ctx chacha20_state;
+ bool ret;
+ size_t dst_len;
+ /* Scratch reused in sequence: Poly1305 key, then lengths, then the tag. */
+ union {
+ uint8_t block0[POLY1305_KEY_SIZE];
+ uint8_t mac[POLY1305_MAC_SIZE];
+ uint64_t lens[2];
+ } b = { { 0 } };
+
+ if (src_len < POLY1305_MAC_SIZE)
+ return false;
+
+ /*
+ * Encrypting 32 zero bytes yields the one-time Poly1305 key and consumes
+ * keystream block 0, so the payload below starts at block 1.
+ */
+ chacha20_init(&chacha20_state, key, nonce);
+ chacha20(&chacha20_state, b.block0, b.block0, sizeof(b.block0));
+ poly1305_init(&poly1305_state, b.block0);
+
+ /* MAC the associated data, zero-padded to a multiple of 16 bytes. */
+ poly1305_update(&poly1305_state, ad, ad_len);
+ poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf);
+
+ /* MAC the ciphertext (everything before the trailing tag). */
+ dst_len = src_len - POLY1305_MAC_SIZE;
+ poly1305_update(&poly1305_state, src, dst_len);
+ poly1305_update(&poly1305_state, pad0, (0x10 - dst_len) & 0xf);
+
+ b.lens[0] = cpu_to_le64(ad_len);
+ b.lens[1] = cpu_to_le64(dst_len);
+ poly1305_update(&poly1305_state, (uint8_t *)b.lens, sizeof(b.lens));
+
+ /* poly1305_final() also wipes poly1305_state. */
+ poly1305_final(&poly1305_state, b.mac);
+
+ /* Verify first; decrypt only authenticated data. */
+ ret = timingsafe_bcmp(b.mac, src + dst_len, POLY1305_MAC_SIZE) == 0;
+ if (ret)
+ chacha20(&chacha20_state, dst, src, dst_len);
+
+ explicit_bzero(&chacha20_state, sizeof(chacha20_state));
+ explicit_bzero(&b, sizeof(b));
+
+ return ret;
+}
+
+/*
+ * XChaCha20-Poly1305 encryption with an extended nonce. The first 16 nonce
+ * bytes and the key are run through HChaCha20 to obtain a subkey; the
+ * following 8 nonce bytes, read little-endian, become the 64-bit nonce for
+ * chacha20poly1305_encrypt(). The subkey is wiped before returning.
+ */
+void
+xchacha20poly1305_encrypt(uint8_t *dst, const uint8_t *src,
+    const size_t src_len, const uint8_t *ad,
+    const size_t ad_len,
+    const uint8_t nonce[XCHACHA20POLY1305_NONCE_SIZE],
+    const uint8_t key[CHACHA20POLY1305_KEY_SIZE])
+{
+	uint32_t subkey[CHACHA20_KEY_WORDS];
+	uint64_t inner_nonce;
+
+	hchacha20(subkey, nonce, key);
+	/* The AEAD takes the key as bytes, so fix the word byte order. */
+	cpu_to_le32_array(subkey, ARRAY_SIZE(subkey));
+
+	inner_nonce = get_unaligned_le64(nonce + 16);
+	chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, inner_nonce,
+	    (uint8_t *)subkey);
+
+	explicit_bzero(subkey, CHACHA20POLY1305_KEY_SIZE);
+}
+
+/*
+ * XChaCha20-Poly1305 decryption with an extended nonce. The subkey and
+ * inner nonce are derived as in xchacha20poly1305_encrypt(), and the result
+ * of chacha20poly1305_decrypt() is returned unchanged. The subkey is wiped
+ * before returning.
+ */
+bool
+xchacha20poly1305_decrypt(uint8_t *dst, const uint8_t *src,
+    const size_t src_len, const uint8_t *ad,
+    const size_t ad_len,
+    const uint8_t nonce[XCHACHA20POLY1305_NONCE_SIZE],
+    const uint8_t key[CHACHA20POLY1305_KEY_SIZE])
+{
+	uint32_t subkey[CHACHA20_KEY_WORDS];
+	uint64_t inner_nonce;
+	bool ok;
+
+	hchacha20(subkey, nonce, key);
+	/* The AEAD takes the key as bytes, so fix the word byte order. */
+	cpu_to_le32_array(subkey, ARRAY_SIZE(subkey));
+
+	inner_nonce = get_unaligned_le64(nonce + 16);
+	ok = chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len,
+	    inner_nonce, (uint8_t *)subkey);
+
+	explicit_bzero(subkey, CHACHA20POLY1305_KEY_SIZE);
+	return ok;
+}
+
+
+/*
+ * BLAKE2s initialization vector. It seeds the chaining value h in
+ * blake2s_init_param() and fills the upper half of the working vector in
+ * blake2s_compress().
+ */
+static const uint32_t blake2s_iv[8] = {
+ 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+ 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
+};
+
+/*
+ * Message schedule: row r lists, in order of use, the indices of the sixteen
+ * message words consumed by round r of blake2s_compress().
+ */
+static const uint8_t blake2s_sigma[10][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+};
+
+/*
+ * Set the finalization flag f[0] to all ones; blake2s_compress() mixes it
+ * into the working vector, so the next compressed block is treated as the
+ * last one.
+ */
+static inline void blake2s_set_lastblock(struct blake2s_state *state)
+{
+ state->f[0] = -1;
+}
+
+/*
+ * Add inc to the two-word byte counter t, carrying from t[0] into t[1] when
+ * the low word wraps around.
+ */
+static inline void blake2s_increment_counter(struct blake2s_state *state,
+    const uint32_t inc)
+{
+	state->t[0] += inc;
+	if (state->t[0] < inc)
+		state->t[1] += 1;
+}
+
+/*
+ * Reset state to all zeros, load the initialization vector into h, and fold
+ * the parameter word into h[0].
+ */
+static inline void blake2s_init_param(struct blake2s_state *state,
+    const uint32_t param)
+{
+	int word = 0;
+
+	memset(state, 0, sizeof(*state));
+	while (word < 8) {
+		state->h[word] = blake2s_iv[word];
+		++word;
+	}
+	state->h[0] ^= param;
+}
+
+/*
+ * Begin an unkeyed BLAKE2s hash producing outlen bytes. The parameter word
+ * is 0x01010000 with outlen in its low bits.
+ */
+void blake2s_init(struct blake2s_state *state, const size_t outlen)
+{
+	const uint32_t param = 0x01010000 | outlen;
+
+	blake2s_init_param(state, param);
+	state->outlen = outlen;
+}
+
+/*
+ * Begin a keyed BLAKE2s hash producing outlen bytes. keylen is placed in
+ * the second byte of the parameter word, and the key, zero-padded to one
+ * full block, is absorbed as the first block of input.
+ *
+ * keylen is not validated here: the caller must ensure it does not exceed
+ * BLAKE2S_BLOCK_SIZE, the size of the local padding buffer.
+ */
+void blake2s_init_key(struct blake2s_state *state, const size_t outlen,
+    const uint8_t *key, const size_t keylen)
+{
+	uint8_t padded_key[BLAKE2S_BLOCK_SIZE] = { 0 };
+	const uint32_t param = 0x01010000 | keylen << 8 | outlen;
+
+	blake2s_init_param(state, param);
+	state->outlen = outlen;
+
+	memcpy(padded_key, key, keylen);
+	blake2s_update(state, padded_key, BLAKE2S_BLOCK_SIZE);
+
+	/* The padded copy of the key must not linger on the stack. */
+	explicit_bzero(padded_key, BLAKE2S_BLOCK_SIZE);
+}
+
+/*
+ * Compress nblocks consecutive BLAKE2S_BLOCK_SIZE-byte blocks starting at
+ * block into the chaining value state->h.
+ *
+ * For every block the byte counter t is first advanced by inc, the block is
+ * loaded as sixteen little-endian words, a 16-word working vector is built
+ * from h, the IV, the counter and the finalization flags, ten rounds of the
+ * G mixing function are applied, and the two halves of the result are
+ * XORed back into h.
+ */
+static inline void blake2s_compress(struct blake2s_state *state,
+ const uint8_t *block, size_t nblocks,
+ const uint32_t inc)
+{
+ uint32_t m[16];
+ uint32_t v[16];
+ int i;
+
+ while (nblocks > 0) {
+ blake2s_increment_counter(state, inc);
+ memcpy(m, block, BLAKE2S_BLOCK_SIZE);
+ le32_to_cpu_array(m, ARRAY_SIZE(m));
+ /* v[0..7] = h; v[8..15] = IV, with counter and flags folded in. */
+ memcpy(v, state->h, 32);
+ v[ 8] = blake2s_iv[0];
+ v[ 9] = blake2s_iv[1];
+ v[10] = blake2s_iv[2];
+ v[11] = blake2s_iv[3];
+ v[12] = blake2s_iv[4] ^ state->t[0];
+ v[13] = blake2s_iv[5] ^ state->t[1];
+ v[14] = blake2s_iv[6] ^ state->f[0];
+ v[15] = blake2s_iv[7] ^ state->f[1];
+
+/* Mix two message words (chosen by blake2s_sigma[r]) into a, b, c, d. */
+#define G(r, i, a, b, c, d) do { \
+ a += b + m[blake2s_sigma[r][2 * i + 0]]; \
+ d = ror32(d ^ a, 16); \
+ c += d; \
+ b = ror32(b ^ c, 12); \
+ a += b + m[blake2s_sigma[r][2 * i + 1]]; \
+ d = ror32(d ^ a, 8); \
+ c += d; \
+ b = ror32(b ^ c, 7); \
+} while (0)
+
+/* One round: G over the four columns, then over the four diagonals. */
+#define ROUND(r) do { \
+ G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
+ G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
+ G(r, 2, v[2], v[ 6], v[10], v[14]); \
+ G(r, 3, v[3], v[ 7], v[11], v[15]); \
+ G(r, 4, v[0], v[ 5], v[10], v[15]); \
+ G(r, 5, v[1], v[ 6], v[11], v[12]); \
+ G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
+ G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
+} while (0)
+ ROUND(0);
+ ROUND(1);
+ ROUND(2);
+ ROUND(3);
+ ROUND(4);
+ ROUND(5);
+ ROUND(6);
+ ROUND(7);
+ ROUND(8);
+ ROUND(9);
+
+#undef G
+#undef ROUND
+
+ /* Feed-forward: fold both halves of v back into the chaining value. */
+ for (i = 0; i < 8; ++i)
+ state->h[i] ^= v[i] ^ v[i + 8];
+
+ block += BLAKE2S_BLOCK_SIZE;
+ --nblocks;
+ }
+}
+
+/*
+ * Absorb inlen bytes from in. At least one byte (and up to a full block) is
+ * always left in state->buf after a non-empty update, because the final
+ * block has to be compressed by blake2s_final() with the last-block flag
+ * set.
+ */
+void blake2s_update(struct blake2s_state *state, const uint8_t *in, size_t inlen)
+{
+	const size_t room = BLAKE2S_BLOCK_SIZE - state->buflen;
+
+	if (inlen == 0)
+		return;
+
+	/* The buffer would overflow: complete it and compress it. */
+	if (inlen > room) {
+		memcpy(state->buf + state->buflen, in, room);
+		blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
+		state->buflen = 0;
+		in += room;
+		inlen -= room;
+	}
+
+	/* Compress whole blocks directly from the input, keeping the last. */
+	if (inlen > BLAKE2S_BLOCK_SIZE) {
+		const size_t direct =
+		    DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE) - 1;
+
+		blake2s_compress(state, in, direct, BLAKE2S_BLOCK_SIZE);
+		in += BLAKE2S_BLOCK_SIZE * direct;
+		inlen -= BLAKE2S_BLOCK_SIZE * direct;
+	}
+
+	memcpy(state->buf + state->buflen, in, inlen);
+	state->buflen += inlen;
+}
+
+/*
+ * Finish the hash: flag the buffered data as the last block, zero-pad it to
+ * a full block, compress it (advancing the counter only by the number of
+ * real bytes buffered), and copy the first state->outlen bytes of the
+ * little-endian chaining value to out. The state is wiped afterwards and
+ * must be re-initialized before reuse.
+ */
+void blake2s_final(struct blake2s_state *state, uint8_t *out)
+{
+ blake2s_set_lastblock(state);
+ memset(state->buf + state->buflen, 0,
+ BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
+ blake2s_compress(state, state->buf, 1, state->buflen);
+ /* Convert h in place so the bytes copied out are little-endian. */
+ cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
+ memcpy(out, state->h, state->outlen);
+ explicit_bzero(state, sizeof(*state));
+}
+
+/*
+ * One-shot BLAKE2s: hash inlen bytes of in into outlen bytes at out. A
+ * non-zero keylen selects the keyed mode with key; otherwise key is not
+ * used.
+ */
+void blake2s(uint8_t *out, const uint8_t *in, const uint8_t *key,
+    const size_t outlen, const size_t inlen, const size_t keylen)
+{
+	struct blake2s_state state;
+
+	if (keylen == 0)
+		blake2s_init(&state, outlen);
+	else
+		blake2s_init_key(&state, outlen, key, keylen);
+
+	blake2s_update(&state, in, inlen);
+	blake2s_final(&state, out);
+}
+
+/*
+ * HMAC built on unkeyed BLAKE2s with a full-size digest. A key longer than
+ * one block is first hashed down; the inner hash covers (key ^ 0x36) || in
+ * and the outer hash covers (key ^ 0x5c) || inner digest. The first outlen
+ * bytes of the outer digest are copied to out; outlen is not checked against
+ * BLAKE2S_HASH_SIZE here. Key material and digests held on the stack are
+ * wiped before returning.
+ */
+void blake2s_hmac(uint8_t *out, const uint8_t *in, const uint8_t *key, const size_t outlen,
+    const size_t inlen, const size_t keylen)
+{
+	struct blake2s_state state;
+	uint8_t pad_key[BLAKE2S_BLOCK_SIZE] __aligned(sizeof(uint32_t)) = { 0 };
+	uint8_t digest[BLAKE2S_HASH_SIZE] __aligned(sizeof(uint32_t));
+	int idx;
+
+	/* Bring the key to exactly one block: copy it, or hash it if too long. */
+	if (keylen <= BLAKE2S_BLOCK_SIZE) {
+		memcpy(pad_key, key, keylen);
+	} else {
+		blake2s_init(&state, BLAKE2S_HASH_SIZE);
+		blake2s_update(&state, key, keylen);
+		blake2s_final(&state, pad_key);
+	}
+
+	/* Inner hash: H((K ^ ipad) || message). */
+	for (idx = 0; idx < BLAKE2S_BLOCK_SIZE; ++idx)
+		pad_key[idx] ^= 0x36;
+
+	blake2s_init(&state, BLAKE2S_HASH_SIZE);
+	blake2s_update(&state, pad_key, BLAKE2S_BLOCK_SIZE);
+	blake2s_update(&state, in, inlen);
+	blake2s_final(&state, digest);
+
+	/* Turn K ^ ipad into K ^ opad in a single pass. */
+	for (idx = 0; idx < BLAKE2S_BLOCK_SIZE; ++idx)
+		pad_key[idx] ^= 0x5c ^ 0x36;
+
+	/* Outer hash: H((K ^ opad) || inner digest). */
+	blake2s_init(&state, BLAKE2S_HASH_SIZE);
+	blake2s_update(&state, pad_key, BLAKE2S_BLOCK_SIZE);
+	blake2s_update(&state, digest, BLAKE2S_HASH_SIZE);
+	blake2s_final(&state, digest);
+
+	memcpy(out, digest, outlen);
+	explicit_bzero(pad_key, BLAKE2S_BLOCK_SIZE);
+	explicit_bzero(digest, BLAKE2S_HASH_SIZE);
+}
+
+
+/* Below here is fiat's implementation of x25519.
+ *
+ * Copyright (C) 2015-2016 The fiat-crypto Authors.
+ * Copyright (C) 2018-2021 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is a machine-generated formally verified implementation of Curve25519
+ * ECDH from: <https://github.com/mit-plv/fiat-crypto>. Though originally
+ * machine generated, it has been tweaked to be suitable for use in the kernel.
+ * It is optimized for 32-bit machines and machines that cannot work efficiently
+ * with 128-bit integer types.
+ */
+
+/* fe means field element. Here the field is \Z/(2^255-19). An element t,
+ * entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77
+ * t[3]+2^102 t[4]+...+2^230 t[9].
+ * Limbs therefore alternate in width: even-indexed limbs carry 26 bits and
+ * odd-indexed limbs carry 25 bits.
+ * fe limbs are bounded by 1.125*2^26,1.125*2^25,1.125*2^26,1.125*2^25,etc.
+ * Multiplication and carrying produce fe from fe_loose.
+ */
+typedef struct fe { uint32_t v[10]; } fe;
+
+/* fe_loose has the same ten-limb layout as fe but with wider limb bounds:
+ * fe_loose limbs are bounded by 3.375*2^26,3.375*2^25,3.375*2^26,3.375*2^25,etc
+ * Addition and subtraction produce fe_loose from (fe, fe).
+ */
+typedef struct fe_loose { uint32_t v[10]; } fe_loose;
+
+/*
+ * Unpack 32 little-endian bytes at s into ten limbs h[0..9] that alternate
+ * between 26 and 25 bits. The bytes are read as eight 32-bit words and each
+ * limb is assembled from the leftover high bits of one word plus the low
+ * bits of the next; the trailing comment on each line tallies the bit
+ * widths. Bit 255 of the input is discarded.
+ */
+static inline void fe_frombytes_impl(uint32_t h[10], const uint8_t *s)
+{
+ /* Ignores top bit of s. */
+ uint32_t a0 = get_unaligned_le32(s);
+ uint32_t a1 = get_unaligned_le32(s+4);
+ uint32_t a2 = get_unaligned_le32(s+8);
+ uint32_t a3 = get_unaligned_le32(s+12);
+ uint32_t a4 = get_unaligned_le32(s+16);
+ uint32_t a5 = get_unaligned_le32(s+20);
+ uint32_t a6 = get_unaligned_le32(s+24);
+ uint32_t a7 = get_unaligned_le32(s+28);
+ h[0] = a0&((1<<26)-1); /* 26 used, 32-26 left. 26 */
+ h[1] = (a0>>26) | ((a1&((1<<19)-1))<< 6); /* (32-26) + 19 = 6+19 = 25 */
+ h[2] = (a1>>19) | ((a2&((1<<13)-1))<<13); /* (32-19) + 13 = 13+13 = 26 */
+ h[3] = (a2>>13) | ((a3&((1<< 6)-1))<<19); /* (32-13) + 6 = 19+ 6 = 25 */
+ h[4] = (a3>> 6); /* (32- 6) = 26 */
+ h[5] = a4&((1<<25)-1); /* 25 */
+ h[6] = (a4>>25) | ((a5&((1<<19)-1))<< 7); /* (32-25) + 19 = 7+19 = 26 */
+ h[7] = (a5>>19) | ((a6&((1<<12)-1))<<13); /* (32-19) + 12 = 13+12 = 25 */
+ h[8] = (a6>>12) | ((a7&((1<< 6)-1))<<20); /* (32-12) + 6 = 20+ 6 = 26 */
+ h[9] = (a7>> 6)&((1<<25)-1); /* 25 */
+}
+
+/* Typed wrapper: unpack the 32 bytes at s into the field element h. */
+static inline void fe_frombytes(fe *h, const uint8_t *s)
+{
+ fe_frombytes_impl(h->v, s);
+}
+
+/*
+ * 25-bit add with carry: stores the low 25 bits of a + b + c in *low and
+ * returns bit 25 of the sum as the carry out. Only 26 bits of the result
+ * are inspected, so 32-bit arithmetic is wide enough.
+ */
+static inline uint8_t /*bool*/
+addcarryx_u25(uint8_t /*bool*/ c, uint32_t a, uint32_t b, uint32_t *low)
+{
+	const uint32_t sum = a + b + c;
+
+	*low = sum & 0x1ffffff;
+	return (uint8_t)((sum >> 25) & 1);
+}
+
+/*
+ * 26-bit add with carry: stores the low 26 bits of a + b + c in *low and
+ * returns bit 26 of the sum as the carry out. Only 27 bits of the result
+ * are inspected, so 32-bit arithmetic is wide enough.
+ */
+static inline uint8_t /*bool*/
+addcarryx_u26(uint8_t /*bool*/ c, uint32_t a, uint32_t b, uint32_t *low)
+{
+	const uint32_t sum = a + b + c;
+
+	*low = sum & 0x3ffffff;
+	return (uint8_t)((sum >> 26) & 1);
+}
+
+/*
+ * 25-bit subtract with borrow: stores the low 25 bits of a - b - c in *low
+ * and returns the top bit of the 32-bit difference, which is set when the
+ * subtraction wrapped below zero.
+ */
+static inline uint8_t /*bool*/
+subborrow_u25(uint8_t /*bool*/ c, uint32_t a, uint32_t b, uint32_t *low)
+{
+	const uint32_t diff = a - b - c;
+
+	*low = diff & 0x1ffffff;
+	return diff >> 31;
+}
+
+/*
+ * 26-bit subtract with borrow: stores the low 26 bits of a - b - c in *low
+ * and returns the top bit of the 32-bit difference, which is set when the
+ * subtraction wrapped below zero.
+ */
+static inline uint8_t /*bool*/
+subborrow_u26(uint8_t /*bool*/ c, uint32_t a, uint32_t b, uint32_t *low)
+{
+	const uint32_t diff = a - b - c;
+
+	*low = diff & 0x3ffffff;
+	return diff >> 31;
+}
+
+/*
+ * Branch-free select: returns z when t is zero and nz when t is non-zero,
+ * by expanding t into an all-zeros or all-ones mask.
+ */
+static inline uint32_t cmovznz32(uint32_t t, uint32_t z, uint32_t nz)
+{
+	const uint32_t pick_nz = (uint32_t)0 - (uint32_t)(t != 0);
+
+	return (nz & pick_nz) | (z & ~pick_nz);
+}
+
+/*
+ * Conditionally subtract the field prime once: out = in1 - p, or in1 itself
+ * when that subtraction borrows.
+ *
+ * The limb constants 0x3ffffed, 0x1ffffff, 0x3ffffff, ... are the limbs of
+ * p = 2^255 - 19 (the lowest is 2^26 - 19, the rest are all-ones 25- or
+ * 26-bit values). A borrow/carry chain first computes in1 - p; the final
+ * borrow is expanded into a mask x49, and p & mask is added back, which
+ * undoes the subtraction exactly when it went negative. No branch depends
+ * on the value being reduced.
+ */
+static inline void fe_freeze(uint32_t out[10], const uint32_t in1[10])
+{
+ const uint32_t x17 = in1[9];
+ const uint32_t x18 = in1[8];
+ const uint32_t x16 = in1[7];
+ const uint32_t x14 = in1[6];
+ const uint32_t x12 = in1[5];
+ const uint32_t x10 = in1[4];
+ const uint32_t x8 = in1[3];
+ const uint32_t x6 = in1[2];
+ const uint32_t x4 = in1[1];
+ const uint32_t x2 = in1[0];
+ /* Subtract p limb by limb, propagating the borrow upward. */
+ uint32_t x20; uint8_t/*bool*/ x21 = subborrow_u26(0x0, x2, 0x3ffffed, &x20);
+ uint32_t x23; uint8_t/*bool*/ x24 = subborrow_u25(x21, x4, 0x1ffffff, &x23);
+ uint32_t x26; uint8_t/*bool*/ x27 = subborrow_u26(x24, x6, 0x3ffffff, &x26);
+ uint32_t x29; uint8_t/*bool*/ x30 = subborrow_u25(x27, x8, 0x1ffffff, &x29);
+ uint32_t x32; uint8_t/*bool*/ x33 = subborrow_u26(x30, x10, 0x3ffffff, &x32);
+ uint32_t x35; uint8_t/*bool*/ x36 = subborrow_u25(x33, x12, 0x1ffffff, &x35);
+ uint32_t x38; uint8_t/*bool*/ x39 = subborrow_u26(x36, x14, 0x3ffffff, &x38);
+ uint32_t x41; uint8_t/*bool*/ x42 = subborrow_u25(x39, x16, 0x1ffffff, &x41);
+ uint32_t x44; uint8_t/*bool*/ x45 = subborrow_u26(x42, x18, 0x3ffffff, &x44);
+ uint32_t x47; uint8_t/*bool*/ x48 = subborrow_u25(x45, x17, 0x1ffffff, &x47);
+ /* x49 is all ones if the subtraction borrowed (input was below p). */
+ uint32_t x49 = cmovznz32(x48, 0x0, 0xffffffff);
+ /* Add back p & mask, propagating the carry upward. */
+ uint32_t x50 = (x49 & 0x3ffffed);
+ uint32_t x52; uint8_t/*bool*/ x53 = addcarryx_u26(0x0, x20, x50, &x52);
+ uint32_t x54 = (x49 & 0x1ffffff);
+ uint32_t x56; uint8_t/*bool*/ x57 = addcarryx_u25(x53, x23, x54, &x56);
+ uint32_t x58 = (x49 & 0x3ffffff);
+ uint32_t x60; uint8_t/*bool*/ x61 = addcarryx_u26(x57, x26, x58, &x60);
+ uint32_t x62 = (x49 & 0x1ffffff);
+ uint32_t x64; uint8_t/*bool*/ x65 = addcarryx_u25(x61, x29, x62, &x64);
+ uint32_t x66 = (x49 & 0x3ffffff);
+ uint32_t x68; uint8_t/*bool*/ x69 = addcarryx_u26(x65, x32, x66, &x68);
+ uint32_t x70 = (x49 & 0x1ffffff);
+ uint32_t x72; uint8_t/*bool*/ x73 = addcarryx_u25(x69, x35, x70, &x72);
+ uint32_t x74 = (x49 & 0x3ffffff);
+ uint32_t x76; uint8_t/*bool*/ x77 = addcarryx_u26(x73, x38, x74, &x76);
+ uint32_t x78 = (x49 & 0x1ffffff);
+ uint32_t x80; uint8_t/*bool*/ x81 = addcarryx_u25(x77, x41, x78, &x80);
+ uint32_t x82 = (x49 & 0x3ffffff);
+ uint32_t x84; uint8_t/*bool*/ x85 = addcarryx_u26(x81, x44, x82, &x84);
+ uint32_t x86 = (x49 & 0x1ffffff);
+ uint32_t x88; addcarryx_u25(x85, x47, x86, &x88);
+ out[0] = x52;
+ out[1] = x56;
+ out[2] = x60;
+ out[3] = x64;
+ out[4] = x68;
+ out[5] = x72;
+ out[6] = x76;
+ out[7] = x80;
+ out[8] = x84;
+ out[9] = x88;
+}
+
+static inline void fe_tobytes(uint8_t s[32], const fe *f)
+{
+	uint32_t t[10];	/* output of fe_freeze, packed below as 26,25,26,25,... bit fields starting at bit 0 of s[0] */
+	fe_freeze(t, f->v);
+	s[0] = (uint8_t)(t[0] >> 0);
+	s[1] = (uint8_t)(t[0] >> 8);
+	s[2] = (uint8_t)(t[0] >> 16);
+	s[3] = (uint8_t)((t[0] >> 24) | (t[1] << 2));	/* limb 1 starts at bit 26 */
+	s[4] = (uint8_t)(t[1] >> 6);
+	s[5] = (uint8_t)(t[1] >> 14);
+	s[6] = (uint8_t)((t[1] >> 22) | (t[2] << 3));	/* limb 2 starts at bit 51 */
+	s[7] = (uint8_t)(t[2] >> 5);
+	s[8] = (uint8_t)(t[2] >> 13);
+	s[9] = (uint8_t)((t[2] >> 21) | (t[3] << 5));	/* limb 3 starts at bit 77 */
+	s[10] = (uint8_t)(t[3] >> 3);
+	s[11] = (uint8_t)(t[3] >> 11);
+	s[12] = (uint8_t)((t[3] >> 19) | (t[4] << 6));	/* limb 4 starts at bit 102 */
+	s[13] = (uint8_t)(t[4] >> 2);
+	s[14] = (uint8_t)(t[4] >> 10);
+	s[15] = (uint8_t)(t[4] >> 18);
+	s[16] = (uint8_t)(t[5] >> 0);			/* limb 5 starts on a byte boundary (bit 128) */
+	s[17] = (uint8_t)(t[5] >> 8);
+	s[18] = (uint8_t)(t[5] >> 16);
+	s[19] = (uint8_t)((t[5] >> 24) | (t[6] << 1));	/* limb 6 starts at bit 153 */
+	s[20] = (uint8_t)(t[6] >> 7);
+	s[21] = (uint8_t)(t[6] >> 15);
+	s[22] = (uint8_t)((t[6] >> 23) | (t[7] << 3));	/* limb 7 starts at bit 179 */
+	s[23] = (uint8_t)(t[7] >> 5);
+	s[24] = (uint8_t)(t[7] >> 13);
+	s[25] = (uint8_t)((t[7] >> 21) | (t[8] << 4));	/* limb 8 starts at bit 204 */
+	s[26] = (uint8_t)(t[8] >> 4);
+	s[27] = (uint8_t)(t[8] >> 12);
+	s[28] = (uint8_t)((t[8] >> 20) | (t[9] << 6));	/* limb 9 starts at bit 230 */
+	s[29] = (uint8_t)(t[9] >> 2);
+	s[30] = (uint8_t)(t[9] >> 10);
+	s[31] = (uint8_t)(t[9] >> 18);
+}
+
+/* h = f; copied with memmove, so h and f may be the same object */
+static inline void fe_copy(fe *h, const fe *f)
+{
+ memmove(h, f, sizeof(uint32_t) * 10); /* 10 limbs; assumes fe holds exactly uint32_t v[10] -- TODO confirm against the typedef */
+}
+
+static inline void fe_copy_lt(fe_loose *h, const fe *f)
+{ /* h = f: copies the 10 limbs of an fe into an fe_loose */
+ memmove(h, f, sizeof(uint32_t) * 10); /* assumes both types hold exactly uint32_t v[10] -- TODO confirm against the typedefs */
+}
+
+/* h = 0: every limb is cleared */
+static inline void fe_0(fe *h)
+{
+ memset(h, 0, sizeof(uint32_t) * 10); /* 10 limbs of 4 bytes each */
+}
+
+/* h = 1: limb 0 is one, the other nine limbs are zero */
+static inline void fe_1(fe *h)
+{
+	fe_0(h);
+	h->v[0] = 1;
+}
+
+static void fe_add_impl(uint32_t out[10], const uint32_t in1[10], const uint32_t in2[10])
+{ /* out[i] = in1[i] + in2[i] for each of the 10 limbs; no carry is propagated between limbs */
+ const uint32_t x20 = in1[9]; /* every input limb is read into a local before anything is stored to out */
+ const uint32_t x21 = in1[8];
+ const uint32_t x19 = in1[7];
+ const uint32_t x17 = in1[6];
+ const uint32_t x15 = in1[5];
+ const uint32_t x13 = in1[4];
+ const uint32_t x11 = in1[3];
+ const uint32_t x9 = in1[2];
+ const uint32_t x7 = in1[1];
+ const uint32_t x5 = in1[0];
+ const uint32_t x38 = in2[9];
+ const uint32_t x39 = in2[8];
+ const uint32_t x37 = in2[7];
+ const uint32_t x35 = in2[6];
+ const uint32_t x33 = in2[5];
+ const uint32_t x31 = in2[4];
+ const uint32_t x29 = in2[3];
+ const uint32_t x27 = in2[2];
+ const uint32_t x25 = in2[1];
+ const uint32_t x23 = in2[0];
+ out[0] = (x5 + x23);
+ out[1] = (x7 + x25);
+ out[2] = (x9 + x27);
+ out[3] = (x11 + x29);
+ out[4] = (x13 + x31);
+ out[5] = (x15 + x33);
+ out[6] = (x17 + x35);
+ out[7] = (x19 + x37);
+ out[8] = (x21 + x39);
+ out[9] = (x20 + x38);
+}
+
+/* h = f + g, added limb by limb without carrying; the result type is fe_loose.
+ * Can overlap h with f or g.
+ */
+static inline void fe_add(fe_loose *h, const fe *f, const fe *g)
+{
+ fe_add_impl(h->v, f->v, g->v);
+}
+
+static void fe_sub_impl(uint32_t out[10], const uint32_t in1[10], const uint32_t in2[10])
+{ /* out[i] = c[i] + in1[i] - in2[i], where the constants c[i] are twice the limbs of p = 2^255 - 19 */
+ const uint32_t x20 = in1[9]; /* every input limb is read into a local before anything is stored to out */
+ const uint32_t x21 = in1[8];
+ const uint32_t x19 = in1[7];
+ const uint32_t x17 = in1[6];
+ const uint32_t x15 = in1[5];
+ const uint32_t x13 = in1[4];
+ const uint32_t x11 = in1[3];
+ const uint32_t x9 = in1[2];
+ const uint32_t x7 = in1[1];
+ const uint32_t x5 = in1[0];
+ const uint32_t x38 = in2[9];
+ const uint32_t x39 = in2[8];
+ const uint32_t x37 = in2[7];
+ const uint32_t x35 = in2[6];
+ const uint32_t x33 = in2[5];
+ const uint32_t x31 = in2[4];
+ const uint32_t x29 = in2[3];
+ const uint32_t x27 = in2[2];
+ const uint32_t x25 = in2[1];
+ const uint32_t x23 = in2[0];
+ out[0] = ((0x7ffffda + x5) - x23); /* 0x7ffffda = 2 * (2^26 - 19) */
+ out[1] = ((0x3fffffe + x7) - x25); /* 0x3fffffe = 2 * (2^25 - 1) */
+ out[2] = ((0x7fffffe + x9) - x27); /* 0x7fffffe = 2 * (2^26 - 1) */
+ out[3] = ((0x3fffffe + x11) - x29);
+ out[4] = ((0x7fffffe + x13) - x31);
+ out[5] = ((0x3fffffe + x15) - x33);
+ out[6] = ((0x7fffffe + x17) - x35);
+ out[7] = ((0x3fffffe + x19) - x37);
+ out[8] = ((0x7fffffe + x21) - x39);
+ out[9] = ((0x3fffffe + x20) - x38); /* NOTE(review): adding 2p presumably keeps limbs from wrapping below zero for in-range inputs -- input bounds are not visible here */
+}
+
+/* h = f - g; twice p is added limb-wise first, which leaves the value unchanged mod p.
+ * Can overlap h with f or g.
+ */
+static inline void fe_sub(fe_loose *h, const fe *f, const fe *g)
+{
+ fe_sub_impl(h->v, f->v, g->v);
+}
+
+static void fe_mul_impl(uint32_t out[10], const uint32_t in1[10], const uint32_t in2[10])
+{ /* out = in1 * in2, reduced using 2^255 = 19 (mod 2^255 - 19); limbs alternate 26 and 25 bits */
+ const uint32_t x20 = in1[9];
+ const uint32_t x21 = in1[8];
+ const uint32_t x19 = in1[7];
+ const uint32_t x17 = in1[6];
+ const uint32_t x15 = in1[5];
+ const uint32_t x13 = in1[4];
+ const uint32_t x11 = in1[3];
+ const uint32_t x9 = in1[2];
+ const uint32_t x7 = in1[1];
+ const uint32_t x5 = in1[0];
+ const uint32_t x38 = in2[9];
+ const uint32_t x39 = in2[8];
+ const uint32_t x37 = in2[7];
+ const uint32_t x35 = in2[6];
+ const uint32_t x33 = in2[5];
+ const uint32_t x31 = in2[4];
+ const uint32_t x29 = in2[3];
+ const uint32_t x27 = in2[2];
+ const uint32_t x25 = in2[1];
+ const uint32_t x23 = in2[0];
+ uint64_t x40 = ((uint64_t)x23 * x5); /* x40..x58: the 19 columns of the schoolbook product, column k summing in1[i] * in2[j] with i + j = k */
+ uint64_t x41 = (((uint64_t)x23 * x7) + ((uint64_t)x25 * x5));
+ uint64_t x42 = ((((uint64_t)(0x2 * x25) * x7) + ((uint64_t)x23 * x9)) + ((uint64_t)x27 * x5)); /* a product of two odd-index (25-bit) limbs is doubled, here and in the columns below */
+ uint64_t x43 = (((((uint64_t)x25 * x9) + ((uint64_t)x27 * x7)) + ((uint64_t)x23 * x11)) + ((uint64_t)x29 * x5));
+ uint64_t x44 = (((((uint64_t)x27 * x9) + (0x2 * (((uint64_t)x25 * x11) + ((uint64_t)x29 * x7)))) + ((uint64_t)x23 * x13)) + ((uint64_t)x31 * x5));
+ uint64_t x45 = (((((((uint64_t)x27 * x11) + ((uint64_t)x29 * x9)) + ((uint64_t)x25 * x13)) + ((uint64_t)x31 * x7)) + ((uint64_t)x23 * x15)) + ((uint64_t)x33 * x5));
+ uint64_t x46 = (((((0x2 * ((((uint64_t)x29 * x11) + ((uint64_t)x25 * x15)) + ((uint64_t)x33 * x7))) + ((uint64_t)x27 * x13)) + ((uint64_t)x31 * x9)) + ((uint64_t)x23 * x17)) + ((uint64_t)x35 * x5));
+ uint64_t x47 = (((((((((uint64_t)x29 * x13) + ((uint64_t)x31 * x11)) + ((uint64_t)x27 * x15)) + ((uint64_t)x33 * x9)) + ((uint64_t)x25 * x17)) + ((uint64_t)x35 * x7)) + ((uint64_t)x23 * x19)) + ((uint64_t)x37 * x5));
+ uint64_t x48 = (((((((uint64_t)x31 * x13) + (0x2 * (((((uint64_t)x29 * x15) + ((uint64_t)x33 * x11)) + ((uint64_t)x25 * x19)) + ((uint64_t)x37 * x7)))) + ((uint64_t)x27 * x17)) + ((uint64_t)x35 * x9)) + ((uint64_t)x23 * x21)) + ((uint64_t)x39 * x5));
+ uint64_t x49 = (((((((((((uint64_t)x31 * x15) + ((uint64_t)x33 * x13)) + ((uint64_t)x29 * x17)) + ((uint64_t)x35 * x11)) + ((uint64_t)x27 * x19)) + ((uint64_t)x37 * x9)) + ((uint64_t)x25 * x21)) + ((uint64_t)x39 * x7)) + ((uint64_t)x23 * x20)) + ((uint64_t)x38 * x5));
+ uint64_t x50 = (((((0x2 * ((((((uint64_t)x33 * x15) + ((uint64_t)x29 * x19)) + ((uint64_t)x37 * x11)) + ((uint64_t)x25 * x20)) + ((uint64_t)x38 * x7))) + ((uint64_t)x31 * x17)) + ((uint64_t)x35 * x13)) + ((uint64_t)x27 * x21)) + ((uint64_t)x39 * x9));
+ uint64_t x51 = (((((((((uint64_t)x33 * x17) + ((uint64_t)x35 * x15)) + ((uint64_t)x31 * x19)) + ((uint64_t)x37 * x13)) + ((uint64_t)x29 * x21)) + ((uint64_t)x39 * x11)) + ((uint64_t)x27 * x20)) + ((uint64_t)x38 * x9));
+ uint64_t x52 = (((((uint64_t)x35 * x17) + (0x2 * (((((uint64_t)x33 * x19) + ((uint64_t)x37 * x15)) + ((uint64_t)x29 * x20)) + ((uint64_t)x38 * x11)))) + ((uint64_t)x31 * x21)) + ((uint64_t)x39 * x13));
+ uint64_t x53 = (((((((uint64_t)x35 * x19) + ((uint64_t)x37 * x17)) + ((uint64_t)x33 * x21)) + ((uint64_t)x39 * x15)) + ((uint64_t)x31 * x20)) + ((uint64_t)x38 * x13));
+ uint64_t x54 = (((0x2 * ((((uint64_t)x37 * x19) + ((uint64_t)x33 * x20)) + ((uint64_t)x38 * x15))) + ((uint64_t)x35 * x21)) + ((uint64_t)x39 * x17));
+ uint64_t x55 = (((((uint64_t)x37 * x21) + ((uint64_t)x39 * x19)) + ((uint64_t)x35 * x20)) + ((uint64_t)x38 * x17));
+ uint64_t x56 = (((uint64_t)x39 * x21) + (0x2 * (((uint64_t)x37 * x20) + ((uint64_t)x38 * x19))));
+ uint64_t x57 = (((uint64_t)x39 * x20) + ((uint64_t)x38 * x21));
+ uint64_t x58 = ((uint64_t)(0x2 * x38) * x20);
+ uint64_t x59 = (x48 + (x58 << 0x4)); /* fold column 18 into column 8 times 19: (x << 4) + (x << 1) + x = 19 * x */
+ uint64_t x60 = (x59 + (x58 << 0x1));
+ uint64_t x61 = (x60 + x58);
+ uint64_t x62 = (x47 + (x57 << 0x4)); /* the same fold for columns 17..10 into columns 7..0 */
+ uint64_t x63 = (x62 + (x57 << 0x1));
+ uint64_t x64 = (x63 + x57);
+ uint64_t x65 = (x46 + (x56 << 0x4));
+ uint64_t x66 = (x65 + (x56 << 0x1));
+ uint64_t x67 = (x66 + x56);
+ uint64_t x68 = (x45 + (x55 << 0x4));
+ uint64_t x69 = (x68 + (x55 << 0x1));
+ uint64_t x70 = (x69 + x55);
+ uint64_t x71 = (x44 + (x54 << 0x4));
+ uint64_t x72 = (x71 + (x54 << 0x1));
+ uint64_t x73 = (x72 + x54);
+ uint64_t x74 = (x43 + (x53 << 0x4));
+ uint64_t x75 = (x74 + (x53 << 0x1));
+ uint64_t x76 = (x75 + x53);
+ uint64_t x77 = (x42 + (x52 << 0x4));
+ uint64_t x78 = (x77 + (x52 << 0x1));
+ uint64_t x79 = (x78 + x52);
+ uint64_t x80 = (x41 + (x51 << 0x4));
+ uint64_t x81 = (x80 + (x51 << 0x1));
+ uint64_t x82 = (x81 + x51);
+ uint64_t x83 = (x40 + (x50 << 0x4));
+ uint64_t x84 = (x83 + (x50 << 0x1));
+ uint64_t x85 = (x84 + x50);
+ uint64_t x86 = (x85 >> 0x1a); /* carry chain: 26 bits are kept in even limbs and 25 bits in odd limbs, the rest moves up */
+ uint32_t x87 = ((uint32_t)x85 & 0x3ffffff);
+ uint64_t x88 = (x86 + x82);
+ uint64_t x89 = (x88 >> 0x19);
+ uint32_t x90 = ((uint32_t)x88 & 0x1ffffff);
+ uint64_t x91 = (x89 + x79);
+ uint64_t x92 = (x91 >> 0x1a);
+ uint32_t x93 = ((uint32_t)x91 & 0x3ffffff);
+ uint64_t x94 = (x92 + x76);
+ uint64_t x95 = (x94 >> 0x19);
+ uint32_t x96 = ((uint32_t)x94 & 0x1ffffff);
+ uint64_t x97 = (x95 + x73);
+ uint64_t x98 = (x97 >> 0x1a);
+ uint32_t x99 = ((uint32_t)x97 & 0x3ffffff);
+ uint64_t x100 = (x98 + x70);
+ uint64_t x101 = (x100 >> 0x19);
+ uint32_t x102 = ((uint32_t)x100 & 0x1ffffff);
+ uint64_t x103 = (x101 + x67);
+ uint64_t x104 = (x103 >> 0x1a);
+ uint32_t x105 = ((uint32_t)x103 & 0x3ffffff);
+ uint64_t x106 = (x104 + x64);
+ uint64_t x107 = (x106 >> 0x19);
+ uint32_t x108 = ((uint32_t)x106 & 0x1ffffff);
+ uint64_t x109 = (x107 + x61);
+ uint64_t x110 = (x109 >> 0x1a);
+ uint32_t x111 = ((uint32_t)x109 & 0x3ffffff);
+ uint64_t x112 = (x110 + x49);
+ uint64_t x113 = (x112 >> 0x19);
+ uint32_t x114 = ((uint32_t)x112 & 0x1ffffff);
+ uint64_t x115 = (x87 + (0x13 * x113)); /* the carry out of limb 9 wraps around into limb 0 multiplied by 19 */
+ uint32_t x116 = (uint32_t) (x115 >> 0x1a);
+ uint32_t x117 = ((uint32_t)x115 & 0x3ffffff);
+ uint32_t x118 = (x116 + x90);
+ uint32_t x119 = (x118 >> 0x19);
+ uint32_t x120 = (x118 & 0x1ffffff);
+ out[0] = x117;
+ out[1] = x120;
+ out[2] = (x119 + x93); /* the last carry is added without masking, so this limb is not re-normalised to 26 bits */
+ out[3] = x96;
+ out[4] = x99;
+ out[5] = x102;
+ out[6] = x105;
+ out[7] = x108;
+ out[8] = x111;
+ out[9] = x114;
+}
+
+static inline void fe_mul_ttt(fe *h, const fe *f, const fe *g)
+{ /* h = f * g with both inputs of type fe */
+ fe_mul_impl(h->v, f->v, g->v);
+}
+
+static inline void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g)
+{ /* h = f * g with f an fe_loose and g an fe */
+ fe_mul_impl(h->v, f->v, g->v);
+}
+
+static inline void
+fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g)
+{ /* h = f * g with both inputs of type fe_loose */
+ fe_mul_impl(h->v, f->v, g->v);
+}
+
+static void fe_sqr_impl(uint32_t out[10], const uint32_t in1[10])
+{ /* out = in1 * in1, reduced using 2^255 = 19 (mod 2^255 - 19); limbs alternate 26 and 25 bits */
+ const uint32_t x17 = in1[9];
+ const uint32_t x18 = in1[8];
+ const uint32_t x16 = in1[7];
+ const uint32_t x14 = in1[6];
+ const uint32_t x12 = in1[5];
+ const uint32_t x10 = in1[4];
+ const uint32_t x8 = in1[3];
+ const uint32_t x6 = in1[2];
+ const uint32_t x4 = in1[1];
+ const uint32_t x2 = in1[0];
+ uint64_t x19 = ((uint64_t)x2 * x2); /* x19..x37: the 19 product columns; each symmetric pair of cross terms is computed once and doubled */
+ uint64_t x20 = ((uint64_t)(0x2 * x2) * x4);
+ uint64_t x21 = (0x2 * (((uint64_t)x4 * x4) + ((uint64_t)x2 * x6)));
+ uint64_t x22 = (0x2 * (((uint64_t)x4 * x6) + ((uint64_t)x2 * x8)));
+ uint64_t x23 = ((((uint64_t)x6 * x6) + ((uint64_t)(0x4 * x4) * x8)) + ((uint64_t)(0x2 * x2) * x10)); /* factor 4: a doubled cross term of two odd-index (25-bit) limbs */
+ uint64_t x24 = (0x2 * ((((uint64_t)x6 * x8) + ((uint64_t)x4 * x10)) + ((uint64_t)x2 * x12)));
+ uint64_t x25 = (0x2 * (((((uint64_t)x8 * x8) + ((uint64_t)x6 * x10)) + ((uint64_t)x2 * x14)) + ((uint64_t)(0x2 * x4) * x12)));
+ uint64_t x26 = (0x2 * (((((uint64_t)x8 * x10) + ((uint64_t)x6 * x12)) + ((uint64_t)x4 * x14)) + ((uint64_t)x2 * x16)));
+ uint64_t x27 = (((uint64_t)x10 * x10) + (0x2 * ((((uint64_t)x6 * x14) + ((uint64_t)x2 * x18)) + (0x2 * (((uint64_t)x4 * x16) + ((uint64_t)x8 * x12))))));
+ uint64_t x28 = (0x2 * ((((((uint64_t)x10 * x12) + ((uint64_t)x8 * x14)) + ((uint64_t)x6 * x16)) + ((uint64_t)x4 * x18)) + ((uint64_t)x2 * x17)));
+ uint64_t x29 = (0x2 * (((((uint64_t)x12 * x12) + ((uint64_t)x10 * x14)) + ((uint64_t)x6 * x18)) + (0x2 * (((uint64_t)x8 * x16) + ((uint64_t)x4 * x17)))));
+ uint64_t x30 = (0x2 * (((((uint64_t)x12 * x14) + ((uint64_t)x10 * x16)) + ((uint64_t)x8 * x18)) + ((uint64_t)x6 * x17)));
+ uint64_t x31 = (((uint64_t)x14 * x14) + (0x2 * (((uint64_t)x10 * x18) + (0x2 * (((uint64_t)x12 * x16) + ((uint64_t)x8 * x17))))));
+ uint64_t x32 = (0x2 * ((((uint64_t)x14 * x16) + ((uint64_t)x12 * x18)) + ((uint64_t)x10 * x17)));
+ uint64_t x33 = (0x2 * ((((uint64_t)x16 * x16) + ((uint64_t)x14 * x18)) + ((uint64_t)(0x2 * x12) * x17)));
+ uint64_t x34 = (0x2 * (((uint64_t)x16 * x18) + ((uint64_t)x14 * x17)));
+ uint64_t x35 = (((uint64_t)x18 * x18) + ((uint64_t)(0x4 * x16) * x17));
+ uint64_t x36 = ((uint64_t)(0x2 * x18) * x17);
+ uint64_t x37 = ((uint64_t)(0x2 * x17) * x17);
+ uint64_t x38 = (x27 + (x37 << 0x4)); /* fold column 18 into column 8 times 19: (x << 4) + (x << 1) + x = 19 * x */
+ uint64_t x39 = (x38 + (x37 << 0x1));
+ uint64_t x40 = (x39 + x37);
+ uint64_t x41 = (x26 + (x36 << 0x4)); /* the same fold for columns 17..10 into columns 7..0 */
+ uint64_t x42 = (x41 + (x36 << 0x1));
+ uint64_t x43 = (x42 + x36);
+ uint64_t x44 = (x25 + (x35 << 0x4));
+ uint64_t x45 = (x44 + (x35 << 0x1));
+ uint64_t x46 = (x45 + x35);
+ uint64_t x47 = (x24 + (x34 << 0x4));
+ uint64_t x48 = (x47 + (x34 << 0x1));
+ uint64_t x49 = (x48 + x34);
+ uint64_t x50 = (x23 + (x33 << 0x4));
+ uint64_t x51 = (x50 + (x33 << 0x1));
+ uint64_t x52 = (x51 + x33);
+ uint64_t x53 = (x22 + (x32 << 0x4));
+ uint64_t x54 = (x53 + (x32 << 0x1));
+ uint64_t x55 = (x54 + x32);
+ uint64_t x56 = (x21 + (x31 << 0x4));
+ uint64_t x57 = (x56 + (x31 << 0x1));
+ uint64_t x58 = (x57 + x31);
+ uint64_t x59 = (x20 + (x30 << 0x4));
+ uint64_t x60 = (x59 + (x30 << 0x1));
+ uint64_t x61 = (x60 + x30);
+ uint64_t x62 = (x19 + (x29 << 0x4));
+ uint64_t x63 = (x62 + (x29 << 0x1));
+ uint64_t x64 = (x63 + x29);
+ uint64_t x65 = (x64 >> 0x1a); /* carry chain: 26 bits are kept in even limbs and 25 bits in odd limbs, the rest moves up */
+ uint32_t x66 = ((uint32_t)x64 & 0x3ffffff);
+ uint64_t x67 = (x65 + x61);
+ uint64_t x68 = (x67 >> 0x19);
+ uint32_t x69 = ((uint32_t)x67 & 0x1ffffff);
+ uint64_t x70 = (x68 + x58);
+ uint64_t x71 = (x70 >> 0x1a);
+ uint32_t x72 = ((uint32_t)x70 & 0x3ffffff);
+ uint64_t x73 = (x71 + x55);
+ uint64_t x74 = (x73 >> 0x19);
+ uint32_t x75 = ((uint32_t)x73 & 0x1ffffff);
+ uint64_t x76 = (x74 + x52);
+ uint64_t x77 = (x76 >> 0x1a);
+ uint32_t x78 = ((uint32_t)x76 & 0x3ffffff);
+ uint64_t x79 = (x77 + x49);
+ uint64_t x80 = (x79 >> 0x19);
+ uint32_t x81 = ((uint32_t)x79 & 0x1ffffff);
+ uint64_t x82 = (x80 + x46);
+ uint64_t x83 = (x82 >> 0x1a);
+ uint32_t x84 = ((uint32_t)x82 & 0x3ffffff);
+ uint64_t x85 = (x83 + x43);
+ uint64_t x86 = (x85 >> 0x19);
+ uint32_t x87 = ((uint32_t)x85 & 0x1ffffff);
+ uint64_t x88 = (x86 + x40);
+ uint64_t x89 = (x88 >> 0x1a);
+ uint32_t x90 = ((uint32_t)x88 & 0x3ffffff);
+ uint64_t x91 = (x89 + x28);
+ uint64_t x92 = (x91 >> 0x19);
+ uint32_t x93 = ((uint32_t)x91 & 0x1ffffff);
+ uint64_t x94 = (x66 + (0x13 * x92)); /* the carry out of limb 9 wraps around into limb 0 multiplied by 19 */
+ uint32_t x95 = (uint32_t) (x94 >> 0x1a);
+ uint32_t x96 = ((uint32_t)x94 & 0x3ffffff);
+ uint32_t x97 = (x95 + x69);
+ uint32_t x98 = (x97 >> 0x19);
+ uint32_t x99 = (x97 & 0x1ffffff);
+ out[0] = x96;
+ out[1] = x99;
+ out[2] = (x98 + x72); /* the last carry is added without masking, so this limb is not re-normalised to 26 bits */
+ out[3] = x75;
+ out[4] = x78;
+ out[5] = x81;
+ out[6] = x84;
+ out[7] = x87;
+ out[8] = x90;
+ out[9] = x93;
+}
+
+static inline void fe_sq_tl(fe *h, const fe_loose *f)
+{ /* h = f * f for an fe_loose input */
+ fe_sqr_impl(h->v, f->v);
+}
+
+static inline void fe_sq_tt(fe *h, const fe *f)
+{ /* h = f * f for an fe input */
+ fe_sqr_impl(h->v, f->v);
+}
+
+static inline void fe_loose_invert(fe *out, const fe_loose *z)
+{
+	/* Fixed chain of squarings and multiplications computing out = z^(2^255 - 21),
+	 * i.e. z^(p - 2); each trailing comment gives the exponent of z just produced.
+	 */
+	fe small, ones, sq, sq2;
+	int k;
+
+	fe_sq_tl(&small, z);			/* 2 */
+	fe_sq_tt(&ones, &small);		/* 4 */
+	for (k = 0; k < 1; k++)
+		fe_sq_tt(&ones, &ones);		/* 8 */
+	fe_mul_tlt(&ones, z, &ones);		/* 9 */
+	fe_mul_ttt(&small, &small, &ones);	/* 11 */
+	fe_sq_tt(&sq, &small);			/* 22 */
+	fe_mul_ttt(&ones, &ones, &sq);		/* 31 = 2^5 - 1 */
+	fe_sq_tt(&sq, &ones);
+	for (k = 0; k < 4; k++)
+		fe_sq_tt(&sq, &sq);		/* 2^10 - 2^5 after the loop */
+	fe_mul_ttt(&ones, &sq, &ones);		/* 2^10 - 1 */
+	fe_sq_tt(&sq, &ones);
+	for (k = 0; k < 9; k++)
+		fe_sq_tt(&sq, &sq);		/* 2^20 - 2^10 after the loop */
+	fe_mul_ttt(&sq, &sq, &ones);		/* 2^20 - 1 */
+	fe_sq_tt(&sq2, &sq);
+	for (k = 0; k < 19; k++)
+		fe_sq_tt(&sq2, &sq2);		/* 2^40 - 2^20 after the loop */
+	fe_mul_ttt(&sq, &sq2, &sq);		/* 2^40 - 1 */
+	fe_sq_tt(&sq, &sq);
+	for (k = 0; k < 9; k++)
+		fe_sq_tt(&sq, &sq);		/* 2^50 - 2^10 after the loop */
+	fe_mul_ttt(&ones, &sq, &ones);		/* 2^50 - 1 */
+	fe_sq_tt(&sq, &ones);
+	for (k = 0; k < 49; k++)
+		fe_sq_tt(&sq, &sq);		/* 2^100 - 2^50 after the loop */
+	fe_mul_ttt(&sq, &sq, &ones);		/* 2^100 - 1 */
+	fe_sq_tt(&sq2, &sq);
+	for (k = 0; k < 99; k++)
+		fe_sq_tt(&sq2, &sq2);		/* 2^200 - 2^100 after the loop */
+	fe_mul_ttt(&sq, &sq2, &sq);		/* 2^200 - 1 */
+	fe_sq_tt(&sq, &sq);
+	for (k = 0; k < 49; k++)
+		fe_sq_tt(&sq, &sq);		/* 2^250 - 2^50 after the loop */
+	fe_mul_ttt(&ones, &sq, &ones);		/* 2^250 - 1 */
+	fe_sq_tt(&ones, &ones);
+	for (k = 0; k < 4; k++)
+		fe_sq_tt(&ones, &ones);		/* 2^255 - 2^5 after the loop */
+	fe_mul_ttt(out, &ones, &small);		/* 2^255 - 32 + 11 = 2^255 - 21 */
+}
+
+static inline void fe_invert(fe *out, const fe *z)
+{
+	fe_loose widened;	/* fe_loose_invert takes an fe_loose, so z is copied into one first */
+	fe_copy_lt(&widened, z);
+	fe_loose_invert(out, &widened);
+}
+
+/* Branch-free conditional swap: (f,g) becomes (g,f) when b == 1 and is
+ * left as (f,g) when b == 0.
+ *
+ * Preconditions: b in {0,1}
+ */
+static inline void fe_cswap(fe *f, fe *g, unsigned int b)
+{
+	const uint32_t mask = (uint32_t)(0 - b);	/* all ones when b == 1, zero when b == 0 */
+	unsigned limb;
+
+	for (limb = 0; limb < 10; limb++) {
+		const uint32_t diff = (f->v[limb] ^ g->v[limb]) & mask;
+		f->v[limb] ^= diff;
+		g->v[limb] ^= diff;
+	}
+}
+
+/* NOTE: based on fiat-crypto fe_mul, edited for in2=121666, 0, 0, ... (limb 0 is the constant 121666, all other limbs are 0). */
+static inline void fe_mul_121666_impl(uint32_t out[10], const uint32_t in1[10])
+{
+ const uint32_t x20 = in1[9];
+ const uint32_t x21 = in1[8];
+ const uint32_t x19 = in1[7];
+ const uint32_t x17 = in1[6];
+ const uint32_t x15 = in1[5];
+ const uint32_t x13 = in1[4];
+ const uint32_t x11 = in1[3];
+ const uint32_t x9 = in1[2];
+ const uint32_t x7 = in1[1];
+ const uint32_t x5 = in1[0];
+ const uint32_t x38 = 0; /* x38..x25 take the place of the second operand's limbs 9..1 and are all zero */
+ const uint32_t x39 = 0;
+ const uint32_t x37 = 0;
+ const uint32_t x35 = 0;
+ const uint32_t x33 = 0;
+ const uint32_t x31 = 0;
+ const uint32_t x29 = 0;
+ const uint32_t x27 = 0;
+ const uint32_t x25 = 0;
+ const uint32_t x23 = 121666; /* limb 0 of the constant operand */
+ uint64_t x40 = ((uint64_t)x23 * x5); /* with the zero limbs, x40..x49 evaluate to 121666 * in1[0..9] and x50..x58 to 0 */
+ uint64_t x41 = (((uint64_t)x23 * x7) + ((uint64_t)x25 * x5));
+ uint64_t x42 = ((((uint64_t)(0x2 * x25) * x7) + ((uint64_t)x23 * x9)) + ((uint64_t)x27 * x5));
+ uint64_t x43 = (((((uint64_t)x25 * x9) + ((uint64_t)x27 * x7)) + ((uint64_t)x23 * x11)) + ((uint64_t)x29 * x5));
+ uint64_t x44 = (((((uint64_t)x27 * x9) + (0x2 * (((uint64_t)x25 * x11) + ((uint64_t)x29 * x7)))) + ((uint64_t)x23 * x13)) + ((uint64_t)x31 * x5));
+ uint64_t x45 = (((((((uint64_t)x27 * x11) + ((uint64_t)x29 * x9)) + ((uint64_t)x25 * x13)) + ((uint64_t)x31 * x7)) + ((uint64_t)x23 * x15)) + ((uint64_t)x33 * x5));
+ uint64_t x46 = (((((0x2 * ((((uint64_t)x29 * x11) + ((uint64_t)x25 * x15)) + ((uint64_t)x33 * x7))) + ((uint64_t)x27 * x13)) + ((uint64_t)x31 * x9)) + ((uint64_t)x23 * x17)) + ((uint64_t)x35 * x5));
+ uint64_t x47 = (((((((((uint64_t)x29 * x13) + ((uint64_t)x31 * x11)) + ((uint64_t)x27 * x15)) + ((uint64_t)x33 * x9)) + ((uint64_t)x25 * x17)) + ((uint64_t)x35 * x7)) + ((uint64_t)x23 * x19)) + ((uint64_t)x37 * x5));
+ uint64_t x48 = (((((((uint64_t)x31 * x13) + (0x2 * (((((uint64_t)x29 * x15) + ((uint64_t)x33 * x11)) + ((uint64_t)x25 * x19)) + ((uint64_t)x37 * x7)))) + ((uint64_t)x27 * x17)) + ((uint64_t)x35 * x9)) + ((uint64_t)x23 * x21)) + ((uint64_t)x39 * x5));
+ uint64_t x49 = (((((((((((uint64_t)x31 * x15) + ((uint64_t)x33 * x13)) + ((uint64_t)x29 * x17)) + ((uint64_t)x35 * x11)) + ((uint64_t)x27 * x19)) + ((uint64_t)x37 * x9)) + ((uint64_t)x25 * x21)) + ((uint64_t)x39 * x7)) + ((uint64_t)x23 * x20)) + ((uint64_t)x38 * x5));
+ uint64_t x50 = (((((0x2 * ((((((uint64_t)x33 * x15) + ((uint64_t)x29 * x19)) + ((uint64_t)x37 * x11)) + ((uint64_t)x25 * x20)) + ((uint64_t)x38 * x7))) + ((uint64_t)x31 * x17)) + ((uint64_t)x35 * x13)) + ((uint64_t)x27 * x21)) + ((uint64_t)x39 * x9));
+ uint64_t x51 = (((((((((uint64_t)x33 * x17) + ((uint64_t)x35 * x15)) + ((uint64_t)x31 * x19)) + ((uint64_t)x37 * x13)) + ((uint64_t)x29 * x21)) + ((uint64_t)x39 * x11)) + ((uint64_t)x27 * x20)) + ((uint64_t)x38 * x9));
+ uint64_t x52 = (((((uint64_t)x35 * x17) + (0x2 * (((((uint64_t)x33 * x19) + ((uint64_t)x37 * x15)) + ((uint64_t)x29 * x20)) + ((uint64_t)x38 * x11)))) + ((uint64_t)x31 * x21)) + ((uint64_t)x39 * x13));
+ uint64_t x53 = (((((((uint64_t)x35 * x19) + ((uint64_t)x37 * x17)) + ((uint64_t)x33 * x21)) + ((uint64_t)x39 * x15)) + ((uint64_t)x31 * x20)) + ((uint64_t)x38 * x13));
+ uint64_t x54 = (((0x2 * ((((uint64_t)x37 * x19) + ((uint64_t)x33 * x20)) + ((uint64_t)x38 * x15))) + ((uint64_t)x35 * x21)) + ((uint64_t)x39 * x17));
+ uint64_t x55 = (((((uint64_t)x37 * x21) + ((uint64_t)x39 * x19)) + ((uint64_t)x35 * x20)) + ((uint64_t)x38 * x17));
+ uint64_t x56 = (((uint64_t)x39 * x21) + (0x2 * (((uint64_t)x37 * x20) + ((uint64_t)x38 * x19))));
+ uint64_t x57 = (((uint64_t)x39 * x20) + ((uint64_t)x38 * x21));
+ uint64_t x58 = ((uint64_t)(0x2 * x38) * x20);
+ uint64_t x59 = (x48 + (x58 << 0x4)); /* fold of the high columns times 19, (x << 4) + (x << 1) + x; the folded columns are all 0 here */
+ uint64_t x60 = (x59 + (x58 << 0x1));
+ uint64_t x61 = (x60 + x58);
+ uint64_t x62 = (x47 + (x57 << 0x4));
+ uint64_t x63 = (x62 + (x57 << 0x1));
+ uint64_t x64 = (x63 + x57);
+ uint64_t x65 = (x46 + (x56 << 0x4));
+ uint64_t x66 = (x65 + (x56 << 0x1));
+ uint64_t x67 = (x66 + x56);
+ uint64_t x68 = (x45 + (x55 << 0x4));
+ uint64_t x69 = (x68 + (x55 << 0x1));
+ uint64_t x70 = (x69 + x55);
+ uint64_t x71 = (x44 + (x54 << 0x4));
+ uint64_t x72 = (x71 + (x54 << 0x1));
+ uint64_t x73 = (x72 + x54);
+ uint64_t x74 = (x43 + (x53 << 0x4));
+ uint64_t x75 = (x74 + (x53 << 0x1));
+ uint64_t x76 = (x75 + x53);
+ uint64_t x77 = (x42 + (x52 << 0x4));
+ uint64_t x78 = (x77 + (x52 << 0x1));
+ uint64_t x79 = (x78 + x52);
+ uint64_t x80 = (x41 + (x51 << 0x4));
+ uint64_t x81 = (x80 + (x51 << 0x1));
+ uint64_t x82 = (x81 + x51);
+ uint64_t x83 = (x40 + (x50 << 0x4));
+ uint64_t x84 = (x83 + (x50 << 0x1));
+ uint64_t x85 = (x84 + x50);
+ uint64_t x86 = (x85 >> 0x1a); /* carry chain: 26 bits are kept in even limbs and 25 bits in odd limbs, the rest moves up */
+ uint32_t x87 = ((uint32_t)x85 & 0x3ffffff);
+ uint64_t x88 = (x86 + x82);
+ uint64_t x89 = (x88 >> 0x19);
+ uint32_t x90 = ((uint32_t)x88 & 0x1ffffff);
+ uint64_t x91 = (x89 + x79);
+ uint64_t x92 = (x91 >> 0x1a);
+ uint32_t x93 = ((uint32_t)x91 & 0x3ffffff);
+ uint64_t x94 = (x92 + x76);
+ uint64_t x95 = (x94 >> 0x19);
+ uint32_t x96 = ((uint32_t)x94 & 0x1ffffff);
+ uint64_t x97 = (x95 + x73);
+ uint64_t x98 = (x97 >> 0x1a);
+ uint32_t x99 = ((uint32_t)x97 & 0x3ffffff);
+ uint64_t x100 = (x98 + x70);
+ uint64_t x101 = (x100 >> 0x19);
+ uint32_t x102 = ((uint32_t)x100 & 0x1ffffff);
+ uint64_t x103 = (x101 + x67);
+ uint64_t x104 = (x103 >> 0x1a);
+ uint32_t x105 = ((uint32_t)x103 & 0x3ffffff);
+ uint64_t x106 = (x104 + x64);
+ uint64_t x107 = (x106 >> 0x19);
+ uint32_t x108 = ((uint32_t)x106 & 0x1ffffff);
+ uint64_t x109 = (x107 + x61);
+ uint64_t x110 = (x109 >> 0x1a);
+ uint32_t x111 = ((uint32_t)x109 & 0x3ffffff);
+ uint64_t x112 = (x110 + x49);
+ uint64_t x113 = (x112 >> 0x19);
+ uint32_t x114 = ((uint32_t)x112 & 0x1ffffff);
+ uint64_t x115 = (x87 + (0x13 * x113)); /* the carry out of limb 9 wraps around into limb 0 multiplied by 19 */
+ uint32_t x116 = (uint32_t) (x115 >> 0x1a);
+ uint32_t x117 = ((uint32_t)x115 & 0x3ffffff);
+ uint32_t x118 = (x116 + x90);
+ uint32_t x119 = (x118 >> 0x19);
+ uint32_t x120 = (x118 & 0x1ffffff);
+ out[0] = x117;
+ out[1] = x120;
+ out[2] = (x119 + x93); /* the last carry is added without masking, so this limb is not re-normalised to 26 bits */
+ out[3] = x96;
+ out[4] = x99;
+ out[5] = x102;
+ out[6] = x105;
+ out[7] = x108;
+ out[8] = x111;
+ out[9] = x114;
+}
+
+static inline void fe_mul121666(fe *h, const fe_loose *f)
+{ /* h = 121666 * f for an fe_loose input */
+ fe_mul_121666_impl(h->v, f->v);
+}
+
+static const uint8_t curve25519_null_point[CURVE25519_KEY_SIZE]; /* no initializer: static storage makes every byte zero; curve25519() compares its output against this */
+
+/* out = clamp(scalar) * point: X25519 on u-coordinates via a Montgomery ladder.
+ * scalar is copied and clamped locally, so the caller's buffer is not modified;
+ * every secret-dependent temporary is wiped before returning.
+ * Returns false iff the CURVE25519_KEY_SIZE output bytes are all zero.
+ */
+bool curve25519(uint8_t out[CURVE25519_KEY_SIZE],
+	const uint8_t scalar[CURVE25519_KEY_SIZE],
+	const uint8_t point[CURVE25519_KEY_SIZE])
+{
+	fe x1, x2, z2, x3, z3, tmp0, tmp1;	/* ladder temporaries live at function scope so they can be wiped below */
+	fe_loose x2l, z2l, x3l, tmp0l, tmp1l;
+	unsigned swap = 0;
+	int pos;
+	uint8_t e[32];
+
+	memcpy(e, scalar, 32);
+	curve25519_clamp_secret(e);
+
+	/* The following implementation was transcribed to Coq and proven to
+	 * correspond to unary scalar multiplication in affine coordinates given
+	 * that x1 != 0 is the x coordinate of some point on the curve. It was
+	 * also checked in Coq that doing a ladderstep with x1 = x3 = 0 gives
+	 * z2' = z3' = 0, and z2 = z3 = 0 gives z2' = z3' = 0. The statement was
+	 * quantified over the underlying field, so it applies to Curve25519
+	 * itself and the quadratic twist of Curve25519. It was not proven in
+	 * Coq that prime-field arithmetic correctly simulates extension-field
+	 * arithmetic on prime-field values. The decoding of the byte array
+	 * representation of e was not considered.
+	 *
+	 * Specification of Montgomery curves in affine coordinates:
+	 * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27>
+	 *
+	 * Proof that these form a group that is isomorphic to a Weierstrass curve:
+	 * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35>
+	 *
+	 * Coq transcription and correctness proof of the loop (where scalarbits=255):
+	 * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118>
+	 * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278>
+	 * preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0
+	 */
+	fe_frombytes(&x1, point);
+	fe_1(&x2);
+	fe_0(&z2);
+	fe_copy(&x3, &x1);
+	fe_1(&z3);
+
+	for (pos = 254; pos >= 0; --pos) {
+		/* loop invariant as of right before the test, for the case where x1 != 0:
+		 * pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero
+		 * let r := e >> (pos+1) in the following equalities of
+		 * projective points:
+		 * to_xz (r*P) === if swap then (x3, z3) else (x2, z2)
+		 * to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3)
+		 * x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P)
+		 */
+		unsigned b = 1 & (e[pos / 8] >> (pos & 7));
+		swap ^= b;
+		fe_cswap(&x2, &x3, swap);
+		fe_cswap(&z2, &z3, swap);
+		swap = b;
+		/* Coq transcription of ladderstep formula (called from transcribed loop):
+		 * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89>
+		 * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131>
+		 * x1 != 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217>
+		 * x1 = 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147>
+		 */
+		fe_sub(&tmp0l, &x3, &z3);
+		fe_sub(&tmp1l, &x2, &z2);
+		fe_add(&x2l, &x2, &z2);
+		fe_add(&z2l, &x3, &z3);
+		fe_mul_tll(&z3, &tmp0l, &x2l);
+		fe_mul_tll(&z2, &z2l, &tmp1l);
+		fe_sq_tl(&tmp0, &tmp1l);
+		fe_sq_tl(&tmp1, &x2l);
+		fe_add(&x3l, &z3, &z2);
+		fe_sub(&z2l, &z3, &z2);
+		fe_mul_ttt(&x2, &tmp1, &tmp0);
+		fe_sub(&tmp1l, &tmp1, &tmp0);
+		fe_sq_tl(&z2, &z2l);
+		fe_mul121666(&z3, &tmp1l);
+		fe_sq_tl(&x3, &x3l);
+		fe_add(&tmp0l, &tmp0, &z3);
+		fe_mul_ttt(&z3, &x1, &z2);
+		fe_mul_tll(&z2, &tmp1l, &tmp0l);
+	}
+	/* here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3)
+	 * else (x2, z2)
+	 */
+	fe_cswap(&x2, &x3, swap);
+	fe_cswap(&z2, &z3, swap);
+
+	fe_invert(&z2, &z2);
+	fe_mul_ttt(&x2, &x2, &z2);
+	fe_tobytes(out, &x2);
+
+	explicit_bzero(&x1, sizeof(x1));
+	explicit_bzero(&x2, sizeof(x2));
+	explicit_bzero(&z2, sizeof(z2));
+	explicit_bzero(&x3, sizeof(x3));
+	explicit_bzero(&z3, sizeof(z3));
+	explicit_bzero(&x2l, sizeof(x2l));
+	explicit_bzero(&z2l, sizeof(z2l));
+	explicit_bzero(&x3l, sizeof(x3l));
+	explicit_bzero(&tmp0, sizeof(tmp0));	/* the per-step temporaries also hold values derived from e */
+	explicit_bzero(&tmp1, sizeof(tmp1));
+	explicit_bzero(&tmp0l, sizeof(tmp0l));
+	explicit_bzero(&tmp1l, sizeof(tmp1l));
+	explicit_bzero(&e, sizeof(e));
+
+	return timingsafe_bcmp(out, curve25519_null_point, CURVE25519_KEY_SIZE) != 0;
+}
diff --git a/sys/dev/if_wg/crypto.h b/sys/dev/if_wg/crypto.h
new file mode 100644
index 000000000000..6e045c2fe0bf
--- /dev/null
+++ b/sys/dev/if_wg/crypto.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2015-2021 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef _WG_CRYPTO
+#define _WG_CRYPTO
+
+#include <sys/types.h>
+
+enum chacha20poly1305_lengths {
+ XCHACHA20POLY1305_NONCE_SIZE = 24, /* bytes; length of the nonce[] array taken by the xchacha20poly1305_* functions */
+ CHACHA20POLY1305_KEY_SIZE = 32, /* bytes; length of the key[] array taken by all four AEAD functions */
+ CHACHA20POLY1305_AUTHTAG_SIZE = 16 /* NOTE(review): presumably the authentication tag length in bytes -- not referenced elsewhere in this header, confirm in crypto.c */
+};
+
+void
+chacha20poly1305_encrypt(uint8_t *dst, const uint8_t *src, const size_t src_len,
+ const uint8_t *ad, const size_t ad_len,
+ const uint64_t nonce,
+ const uint8_t key[CHACHA20POLY1305_KEY_SIZE]); /* NOTE(review): dst presumably needs room for src_len + CHACHA20POLY1305_AUTHTAG_SIZE bytes -- confirm in crypto.c */
+
+bool
+chacha20poly1305_decrypt(uint8_t *dst, const uint8_t *src, const size_t src_len,
+ const uint8_t *ad, const size_t ad_len,
+ const uint64_t nonce,
+ const uint8_t key[CHACHA20POLY1305_KEY_SIZE]); /* NOTE(review): presumably returns false when authentication fails -- confirm in crypto.c */
+
+void
+xchacha20poly1305_encrypt(uint8_t *dst, const uint8_t *src,
+ const size_t src_len, const uint8_t *ad,
+ const size_t ad_len,
+ const uint8_t nonce[XCHACHA20POLY1305_NONCE_SIZE],
+ const uint8_t key[CHACHA20POLY1305_KEY_SIZE]); /* takes an XCHACHA20POLY1305_NONCE_SIZE-byte nonce array where chacha20poly1305_encrypt takes a uint64_t */
+
+bool
+xchacha20poly1305_decrypt(uint8_t *dst, const uint8_t *src,
+ const size_t src_len, const uint8_t *ad,
+ const size_t ad_len,
+ const uint8_t nonce[XCHACHA20POLY1305_NONCE_SIZE],
+ const uint8_t key[CHACHA20POLY1305_KEY_SIZE]); /* takes an XCHACHA20POLY1305_NONCE_SIZE-byte nonce array where chacha20poly1305_decrypt takes a uint64_t */
+
+
+enum blake2s_lengths {
+ BLAKE2S_BLOCK_SIZE = 64, /* bytes; sizes blake2s_state.buf */
+ BLAKE2S_HASH_SIZE = 32, /* NOTE(review): presumably the largest digest length in bytes -- not referenced elsewhere in this header */
+ BLAKE2S_KEY_SIZE = 32 /* NOTE(review): presumably the largest key length in bytes -- not referenced elsewhere in this header */
+};
+
+struct blake2s_state { /* NOTE(review): the field roles below are inferred from BLAKE2s naming; the code that uses them is not in this header */
+ uint32_t h[8]; /* presumably the chaining value */
+ uint32_t t[2]; /* presumably the input byte counter */
+ uint32_t f[2]; /* presumably the finalization flags */
+ uint8_t buf[BLAKE2S_BLOCK_SIZE]; /* one block's worth of bytes */
+ unsigned int buflen; /* presumably the number of valid bytes in buf */
+ unsigned int outlen; /* presumably the digest length chosen at init time */
+};
+
+void blake2s_init(struct blake2s_state *state, const size_t outlen); /* unkeyed variant of blake2s_init_key: takes no key or keylen */
+
+void blake2s_init_key(struct blake2s_state *state, const size_t outlen,
+ const uint8_t *key, const size_t keylen);
+
+void blake2s_update(struct blake2s_state *state, const uint8_t *in, size_t inlen);
+
+void blake2s_final(struct blake2s_state *state, uint8_t *out); /* NOTE(review): out presumably receives the outlen bytes given at init -- confirm in crypto.c */
+
+void blake2s(uint8_t *out, const uint8_t *in, const uint8_t *key,
+ const size_t outlen, const size_t inlen, const size_t keylen); /* the three lengths come in the same order as the three pointers: out, in, key */
+
+void blake2s_hmac(uint8_t *out, const uint8_t *in, const uint8_t *key,
+ const size_t outlen, const size_t inlen, const size_t keylen); /* same parameter order as blake2s() */
+
+enum curve25519_lengths {
+ CURVE25519_KEY_SIZE = 32 /* bytes; length of every secret, public key and point array in the declarations below */
+};
+
+bool curve25519(uint8_t mypublic[static CURVE25519_KEY_SIZE], /* [static N]: callers must pass a non-null pointer to at least N bytes */
+ const uint8_t secret[static CURVE25519_KEY_SIZE],
+ const uint8_t basepoint[static CURVE25519_KEY_SIZE]); /* the definition in crypto.c returns false when the result bytes are all zero */
+
+static inline bool
+curve25519_generate_public(uint8_t pub[static CURVE25519_KEY_SIZE],
+	const uint8_t secret[static CURVE25519_KEY_SIZE])
+{
+	static const uint8_t base_u[CURVE25519_KEY_SIZE] = { 9 };	/* first byte 9, remaining bytes zero-initialized */
+
+	return curve25519(pub, secret, base_u);	/* pub = secret times the fixed base point; result forwarded unchanged */
+}
+
+static inline void curve25519_clamp_secret(uint8_t secret[static CURVE25519_KEY_SIZE])
+{
+	secret[0] &= 0xf8;				/* clear the low three bits of the first byte */
+	secret[31] = (secret[31] | 0x40) & 0x7f;	/* set bit 6 and clear bit 7 of the last byte */
+}
+
+static inline void curve25519_generate_secret(uint8_t secret[static CURVE25519_KEY_SIZE])
+{
+	arc4random_buf(secret, CURVE25519_KEY_SIZE);	/* fill all CURVE25519_KEY_SIZE bytes from the system RNG ... */
+	curve25519_clamp_secret(secret);		/* ... then clamp; same [static] bound as the callee now */
+}
+
+#endif
diff --git a/sys/dev/if_wg/if_wg.c b/sys/dev/if_wg/if_wg.c
new file mode 100644
index 000000000000..ba2eb3221fac
--- /dev/null
+++ b/sys/dev/if_wg/if_wg.c
@@ -0,0 +1,3454 @@
+/*
+ * Copyright (C) 2015-2021 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2019-2021 Matt Dunwoodie <ncon@noconroy.net>
+ * Copyright (c) 2019-2020 Rubicon Communications, LLC (Netgate)
+ * Copyright (c) 2021 Kyle Evans <kevans@FreeBSD.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/* TODO audit imports */
+#include "opt_inet.h"
+#include "opt_inet6.h"
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <vm/uma.h>
+
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/kernel.h>
+
+#include <sys/sockio.h>
+#include <sys/socketvar.h>
+#include <sys/errno.h>
+#include <sys/jail.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/rwlock.h>
+#include <sys/rmlock.h>
+#include <sys/protosw.h>
+#include <sys/module.h>
+#include <sys/endian.h>
+#include <sys/kdb.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/gtaskqueue.h>
+#include <sys/smp.h>
+#include <sys/nv.h>
+
+#include <net/bpf.h>
+
+#include <sys/syslog.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_clone.h>
+#include <net/if_types.h>
+#include <net/ethernet.h>
+#include <net/radix.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/scope6_var.h>
+#include <netinet/udp.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/icmp6.h>
+#include <netinet/in_pcb.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet/udp_var.h>
+
+#include <machine/in_cksum.h>
+
+#include "support.h"
+#include "wg_noise.h"
+#include "wg_cookie.h"
+#include "if_wg.h"
+
+/* It'd be nice to use IF_MAXMTU, but that means more complicated mbuf allocations,
+ * so instead just do the biggest mbuf we can easily allocate minus the usual maximum
+ * IPv6 overhead of 80 bytes. If somebody wants bigger frames, we can revisit this. */
+#define MAX_MTU (MJUM16BYTES - 80)
+
+#define DEFAULT_MTU 1420
+
+#define MAX_STAGED_PKT 128
+#define MAX_QUEUED_PKT 1024
+#define MAX_QUEUED_PKT_MASK (MAX_QUEUED_PKT - 1)
+
+#define MAX_QUEUED_HANDSHAKES 4096
+
+#define HASHTABLE_PEER_SIZE (1 << 11)
+#define HASHTABLE_INDEX_SIZE (1 << 13)
+#define MAX_PEERS_PER_IFACE (1 << 20)
+
+#define REKEY_TIMEOUT 5
+#define REKEY_TIMEOUT_JITTER 334 /* 1/3 sec, round for arc4random_uniform */
+#define KEEPALIVE_TIMEOUT 10
+#define MAX_TIMER_HANDSHAKES (90 / REKEY_TIMEOUT)
+#define NEW_HANDSHAKE_TIMEOUT (REKEY_TIMEOUT + KEEPALIVE_TIMEOUT)
+#define UNDERLOAD_TIMEOUT 1
+
+#define DPRINTF(sc, ...) if (wireguard_debug) if_printf(sc->sc_ifp, ##__VA_ARGS__)
+
+/* First byte indicating packet type on the wire */
+#define WG_PKT_INITIATION htole32(1)
+#define WG_PKT_RESPONSE htole32(2)
+#define WG_PKT_COOKIE htole32(3)
+#define WG_PKT_DATA htole32(4)
+
+#define WG_PKT_WITH_PADDING(n) (((n) + (16-1)) & (~(16-1)))
+#define WG_KEY_SIZE 32
+
+struct wg_pkt_initiation {
+ uint32_t t;
+ uint32_t s_idx;
+ uint8_t ue[NOISE_PUBLIC_KEY_LEN];
+ uint8_t es[NOISE_PUBLIC_KEY_LEN + NOISE_AUTHTAG_LEN];
+ uint8_t ets[NOISE_TIMESTAMP_LEN + NOISE_AUTHTAG_LEN];
+ struct cookie_macs m;
+};
+
+struct wg_pkt_response {
+ uint32_t t;
+ uint32_t s_idx;
+ uint32_t r_idx;
+ uint8_t ue[NOISE_PUBLIC_KEY_LEN];
+ uint8_t en[0 + NOISE_AUTHTAG_LEN];
+ struct cookie_macs m;
+};
+
+struct wg_pkt_cookie {
+ uint32_t t;
+ uint32_t r_idx;
+ uint8_t nonce[COOKIE_NONCE_SIZE];
+ uint8_t ec[COOKIE_ENCRYPTED_SIZE];
+};
+
+struct wg_pkt_data {
+ uint32_t t;
+ uint32_t r_idx;
+ uint8_t nonce[sizeof(uint64_t)];
+ uint8_t buf[];
+};
+
+struct wg_endpoint { /* Where to send packets for a peer, and which local address to send them from. */
+ union {
+ struct sockaddr r_sa; /* generic view; wg_send() dispatches on r_sa.sa_family */
+ struct sockaddr_in r_sin;
+#ifdef INET6
+ struct sockaddr_in6 r_sin6;
+#endif
+ } e_remote;
+ union {
+ struct in_addr l_in; /* passed as IP_SENDSRCADDR by wg_send() unless it is INADDR_ANY */
+#ifdef INET6
+ struct in6_pktinfo l_pktinfo6; /* passed as IPV6_PKTINFO by wg_send() unless the address is unspecified */
+#define l_in6 l_pktinfo6.ipi6_addr
+#endif
+ } e_local;
+};
+
+struct wg_tag {
+ struct m_tag t_tag;
+ struct wg_endpoint t_endpoint;
+ struct wg_peer *t_peer;
+ struct mbuf *t_mbuf;
+ int t_done;
+ int t_mtu;
+};
+
+struct wg_index {
+ LIST_ENTRY(wg_index) i_entry;
+ SLIST_ENTRY(wg_index) i_unused_entry;
+ uint32_t i_key;
+ struct noise_remote *i_value;
+};
+
+struct wg_timers {
+ /* t_lock is for blocking wg_timers_event_* when setting t_disabled. */
+ struct rwlock t_lock;
+
+ int t_disabled;
+ int t_need_another_keepalive;
+ uint16_t t_persistent_keepalive_interval;
+ struct callout t_new_handshake;
+ struct callout t_send_keepalive;
+ struct callout t_retry_handshake;
+ struct callout t_zero_key_material;
+ struct callout t_persistent_keepalive;
+
+ struct mtx t_handshake_mtx;
+ struct timespec t_handshake_last_sent;
+ struct timespec t_handshake_complete;
+ volatile int t_handshake_retries;
+};
+
+struct wg_aip { /* One allowed-IP prefix stored in a wg_aip_table radix tree. */
+ struct radix_node r_nodes[2]; /* must stay first: radix_node pointers are cast back to struct wg_aip */
+ CK_LIST_ENTRY(wg_aip) r_entry; /* linkage on the owning peer's p_aips list */
+ struct sockaddr_storage r_addr; /* network address, already ANDed with r_mask */
+ struct sockaddr_storage r_mask; /* netmask built from the CIDR prefix length */
+ struct wg_peer *r_peer; /* peer this prefix currently routes to */
+};
+
+struct wg_queue {
+ struct mtx q_mtx;
+ struct mbufq q;
+};
+
+struct wg_peer { /* State kept for one remote peer of a wg interface. */
+ CK_LIST_ENTRY(wg_peer) p_hash_entry; /* linkage in a wg_hashtable bucket */
+ CK_LIST_ENTRY(wg_peer) p_entry; /* linkage on wg_hashtable.h_peers_list */
+ uint64_t p_id; /* taken from the global peer_counter in wg_peer_alloc(); printed in debug logs */
+ struct wg_softc *p_sc; /* owning interface */
+
+ struct noise_remote p_remote; /* p_remote.r_public is the key the peer is hashed and looked up by */
+ struct cookie_maker p_cookie;
+ struct wg_timers p_timers;
+
+ struct rwlock p_endpoint_lock; /* initialised in wg_peer_alloc(); intended to guard p_endpoint */
+ struct wg_endpoint p_endpoint; /* current remote address and local source address */
+
+ SLIST_HEAD(,wg_index) p_unused_index; /* free entries of p_index[] */
+ struct wg_index p_index[3]; /* all three start out on p_unused_index */
+
+ struct wg_queue p_stage_queue;
+ struct wg_queue p_encap_queue;
+ struct wg_queue p_decap_queue;
+
+ struct grouptask p_clear_secrets; /* runs noise_remote_clear(&p_remote) */
+ struct grouptask p_send_initiation; /* runs wg_send_initiation(peer) */
+ struct grouptask p_send_keepalive; /* runs wg_send_keepalive(peer) */
+ struct grouptask p_send; /* runs wg_deliver_out(peer) */
+ struct grouptask p_recv; /* runs wg_deliver_in(peer) */
+
+ counter_u64_t p_tx_bytes;
+ counter_u64_t p_rx_bytes;
+
+ CK_LIST_HEAD(, wg_aip) p_aips; /* allowed-IP entries owned by this peer */
+ struct mtx p_lock;
+ struct epoch_context p_ctx; /* used by NET_EPOCH_CALL() to defer the final free */
+};
+
+enum route_direction {
+ /* TODO OpenBSD doesn't use IN/OUT, instead passes the address buffer
+ * directly to route_lookup. */
+ IN,
+ OUT,
+};
+
+struct wg_aip_table { /* Allowed-IP lookup tables of one interface. */
+ size_t t_count; /* number of prefixes currently stored, both trees combined */
+ struct radix_node_head *t_ip; /* IPv4 radix tree */
+ struct radix_node_head *t_ip6; /* IPv6 radix tree; wg_aip_init() only creates it when INET6 is defined */
+};
+
+struct wg_allowedip {
+ uint16_t family;
+ union {
+ struct in_addr ip4;
+ struct in6_addr ip6;
+ };
+ uint8_t cidr;
+};
+
+struct wg_hashtable { /* Peers of one interface, indexed by public key. */
+ struct mtx h_mtx; /* taken around every insert, remove and lookup */
+ SIPHASH_KEY h_secret; /* random siphash24 key chosen in wg_hashtable_init() */
+ CK_LIST_HEAD(, wg_peer) h_peers_list; /* every peer, linked through p_entry */
+ CK_LIST_HEAD(, wg_peer) *h_peers; /* bucket array from hashinit(), linked through p_hash_entry */
+ u_long h_peers_mask; /* bucket index mask filled in by hashinit() */
+ size_t h_num_peers; /* number of peers currently inserted */
+};
+
+struct wg_socket { /* The UDP sockets of one interface and the settings they share. */
+ struct mtx so_mtx;
+ struct socket *so_so4; /* AF_INET socket; replaced atomically by wg_socket_set(), NULL when none */
+ struct socket *so_so6; /* AF_INET6 socket; same rules as so_so4 */
+ uint32_t so_user_cookie; /* copied into so_user_cookie of both sockets */
+ in_port_t so_port; /* bound port in host byte order, recorded by wg_socket_init() */
+};
+
+struct wg_softc {
+ LIST_ENTRY(wg_softc) sc_entry;
+ struct ifnet *sc_ifp;
+ int sc_flags;
+
+ struct ucred *sc_ucred;
+ struct wg_socket sc_socket;
+ struct wg_hashtable sc_hashtable;
+ struct wg_aip_table sc_aips;
+
+ struct mbufq sc_handshake_queue;
+ struct grouptask sc_handshake;
+
+ struct noise_local sc_local;
+ struct cookie_checker sc_cookie;
+
+ struct buf_ring *sc_encap_ring;
+ struct buf_ring *sc_decap_ring;
+
+ struct grouptask *sc_encrypt;
+ struct grouptask *sc_decrypt;
+
+ struct rwlock sc_index_lock;
+ LIST_HEAD(,wg_index) *sc_index;
+ u_long sc_index_mask;
+
+ struct sx sc_lock;
+ volatile u_int sc_peer_count;
+};
+
+#define WGF_DYING 0x0001
+
+/* TODO the following defines are freebsd specific, we should see what is
+ * necessary and cleanup from there (i suspect a lot can be junked). */
+
+#ifndef ENOKEY
+#define ENOKEY ENOTCAPABLE
+#endif
+
+#if __FreeBSD_version > 1300000
+typedef void timeout_t (void *);
+#endif
+
+#define GROUPTASK_DRAIN(gtask) \
+ gtaskqueue_drain((gtask)->gt_taskqueue, &(gtask)->gt_task)
+
+#define MTAG_WIREGUARD 0xBEAD
+#define M_ENQUEUED M_PROTO1
+
+static int clone_count;
+static uma_zone_t ratelimit_zone;
+static int wireguard_debug;
+static volatile unsigned long peer_counter = 0;
+static const char wgname[] = "wg";
+static unsigned wg_osd_jail_slot;
+
+static struct sx wg_sx;
+SX_SYSINIT(wg_sx, &wg_sx, "wg_sx");
+
+static LIST_HEAD(, wg_softc) wg_list = LIST_HEAD_INITIALIZER(wg_list);
+
+SYSCTL_NODE(_net, OID_AUTO, wg, CTLFLAG_RW, 0, "WireGuard");
+SYSCTL_INT(_net_wg, OID_AUTO, debug, CTLFLAG_RWTUN, &wireguard_debug, 0,
+ "enable debug logging");
+
+TASKQGROUP_DECLARE(if_io_tqg);
+
+MALLOC_DEFINE(M_WG, "WG", "wireguard");
+VNET_DEFINE_STATIC(struct if_clone *, wg_cloner);
+
+
+#define V_wg_cloner VNET(wg_cloner)
+#define WG_CAPS IFCAP_LINKSTATE
+#define ph_family PH_loc.eight[5]
+
+struct wg_timespec64 {
+ uint64_t tv_sec;
+ uint64_t tv_nsec;
+};
+
+struct wg_peer_export {
+ struct sockaddr_storage endpoint;
+ struct timespec last_handshake;
+ uint8_t public_key[WG_KEY_SIZE];
+ uint8_t preshared_key[NOISE_SYMMETRIC_KEY_LEN];
+ size_t endpoint_sz;
+ struct wg_allowedip *aip;
+ uint64_t rx_bytes;
+ uint64_t tx_bytes;
+ int aip_count;
+ uint16_t persistent_keepalive;
+};
+
+static struct wg_tag *wg_tag_get(struct mbuf *);
+static struct wg_endpoint *wg_mbuf_endpoint_get(struct mbuf *);
+static int wg_socket_init(struct wg_softc *, in_port_t);
+static int wg_socket_bind(struct socket *, struct socket *, in_port_t *);
+static void wg_socket_set(struct wg_softc *, struct socket *, struct socket *);
+static void wg_socket_uninit(struct wg_softc *);
+static void wg_socket_set_cookie(struct wg_softc *, uint32_t);
+static int wg_send(struct wg_softc *, struct wg_endpoint *, struct mbuf *);
+static void wg_timers_event_data_sent(struct wg_timers *);
+static void wg_timers_event_data_received(struct wg_timers *);
+static void wg_timers_event_any_authenticated_packet_sent(struct wg_timers *);
+static void wg_timers_event_any_authenticated_packet_received(struct wg_timers *);
+static void wg_timers_event_any_authenticated_packet_traversal(struct wg_timers *);
+static void wg_timers_event_handshake_initiated(struct wg_timers *);
+static void wg_timers_event_handshake_responded(struct wg_timers *);
+static void wg_timers_event_handshake_complete(struct wg_timers *);
+static void wg_timers_event_session_derived(struct wg_timers *);
+static void wg_timers_event_want_initiation(struct wg_timers *);
+static void wg_timers_event_reset_handshake_last_sent(struct wg_timers *);
+static void wg_timers_run_send_initiation(struct wg_timers *, int);
+static void wg_timers_run_retry_handshake(struct wg_timers *);
+static void wg_timers_run_send_keepalive(struct wg_timers *);
+static void wg_timers_run_new_handshake(struct wg_timers *);
+static void wg_timers_run_zero_key_material(struct wg_timers *);
+static void wg_timers_run_persistent_keepalive(struct wg_timers *);
+static void wg_timers_init(struct wg_timers *);
+static void wg_timers_enable(struct wg_timers *);
+static void wg_timers_disable(struct wg_timers *);
+static void wg_timers_set_persistent_keepalive(struct wg_timers *, uint16_t);
+static void wg_timers_get_last_handshake(struct wg_timers *, struct timespec *);
+static int wg_timers_expired_handshake_last_sent(struct wg_timers *);
+static int wg_timers_check_handshake_last_sent(struct wg_timers *);
+static void wg_queue_init(struct wg_queue *, const char *);
+static void wg_queue_deinit(struct wg_queue *);
+static void wg_queue_purge(struct wg_queue *);
+static struct mbuf *wg_queue_dequeue(struct wg_queue *, struct wg_tag **);
+static int wg_queue_len(struct wg_queue *);
+static int wg_queue_in(struct wg_peer *, struct mbuf *);
+static void wg_queue_out(struct wg_peer *);
+static void wg_queue_stage(struct wg_peer *, struct mbuf *);
+static int wg_aip_init(struct wg_aip_table *);
+static void wg_aip_destroy(struct wg_aip_table *);
+static void wg_aip_populate_aip4(struct wg_aip *, const struct in_addr *, uint8_t);
+static void wg_aip_populate_aip6(struct wg_aip *, const struct in6_addr *, uint8_t);
+static int wg_aip_add(struct wg_aip_table *, struct wg_peer *, const struct wg_allowedip *);
+static int wg_peer_remove(struct radix_node *, void *);
+static void wg_peer_remove_all(struct wg_softc *);
+static int wg_aip_delete(struct wg_aip_table *, struct wg_peer *);
+static struct wg_peer *wg_aip_lookup(struct wg_aip_table *, struct mbuf *, enum route_direction);
+static void wg_hashtable_init(struct wg_hashtable *);
+static void wg_hashtable_destroy(struct wg_hashtable *);
+static void wg_hashtable_peer_insert(struct wg_hashtable *, struct wg_peer *);
+static struct wg_peer *wg_peer_lookup(struct wg_softc *, const uint8_t [32]);
+static void wg_hashtable_peer_remove(struct wg_hashtable *, struct wg_peer *);
+static int wg_cookie_validate_packet(struct cookie_checker *, struct mbuf *, int);
+static struct wg_peer *wg_peer_alloc(struct wg_softc *);
+static void wg_peer_free_deferred(epoch_context_t);
+static void wg_peer_destroy(struct wg_peer *);
+static void wg_peer_send_buf(struct wg_peer *, uint8_t *, size_t);
+static void wg_send_initiation(struct wg_peer *);
+static void wg_send_response(struct wg_peer *);
+static void wg_send_cookie(struct wg_softc *, struct cookie_macs *, uint32_t, struct mbuf *);
+static void wg_peer_set_endpoint_from_tag(struct wg_peer *, struct wg_tag *);
+static void wg_peer_clear_src(struct wg_peer *);
+static void wg_peer_get_endpoint(struct wg_peer *, struct wg_endpoint *);
+static void wg_deliver_out(struct wg_peer *);
+static void wg_deliver_in(struct wg_peer *);
+static void wg_send_buf(struct wg_softc *, struct wg_endpoint *, uint8_t *, size_t);
+static void wg_send_keepalive(struct wg_peer *);
+static void wg_handshake(struct wg_softc *, struct mbuf *);
+static void wg_encap(struct wg_softc *, struct mbuf *);
+static void wg_decap(struct wg_softc *, struct mbuf *);
+static void wg_softc_handshake_receive(struct wg_softc *);
+static void wg_softc_decrypt(struct wg_softc *);
+static void wg_softc_encrypt(struct wg_softc *);
+static struct noise_remote *wg_remote_get(struct wg_softc *, uint8_t [NOISE_PUBLIC_KEY_LEN]);
+static uint32_t wg_index_set(struct wg_softc *, struct noise_remote *);
+static struct noise_remote *wg_index_get(struct wg_softc *, uint32_t);
+static void wg_index_drop(struct wg_softc *, uint32_t);
+static int wg_update_endpoint_addrs(struct wg_endpoint *, const struct sockaddr *, struct ifnet *);
+static void wg_input(struct mbuf *, int, struct inpcb *, const struct sockaddr *, void *);
+static void wg_encrypt_dispatch(struct wg_softc *);
+static void wg_decrypt_dispatch(struct wg_softc *);
+static void crypto_taskq_setup(struct wg_softc *);
+static void crypto_taskq_destroy(struct wg_softc *);
+static int wg_clone_create(struct if_clone *, int, caddr_t);
+static void wg_qflush(struct ifnet *);
+static int wg_transmit(struct ifnet *, struct mbuf *);
+static int wg_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *);
+static void wg_clone_destroy(struct ifnet *);
+static int wg_peer_to_export(struct wg_peer *, struct wg_peer_export *);
+static bool wgc_privileged(struct wg_softc *);
+static int wgc_get(struct wg_softc *, struct wg_data_io *);
+static int wgc_set(struct wg_softc *, struct wg_data_io *);
+static int wg_up(struct wg_softc *);
+static void wg_down(struct wg_softc *);
+static void wg_reassign(struct ifnet *, struct vnet *, char *unused);
+static void wg_init(void *);
+static int wg_ioctl(struct ifnet *, u_long, caddr_t);
+static void vnet_wg_init(const void *);
+static void vnet_wg_uninit(const void *);
+static void wg_module_init(void);
+static void wg_module_deinit(void);
+
+/* TODO Peer */
+/*
+ * Allocate and initialise a peer belonging to interface `sc`.
+ *
+ * The caller must hold sc->sc_lock exclusively.  The allocation uses
+ * M_WAITOK, so this function may sleep and never returns NULL.  The new
+ * peer is zero-filled, gets a unique p_id, has its endpoint lock, three
+ * packet queues, five group tasks, timers and byte counters set up, and
+ * has all of its p_index[] slots placed on p_unused_index.  It is NOT
+ * inserted into the hashtable here.
+ */
+static struct wg_peer *
+wg_peer_alloc(struct wg_softc *sc)
+{
+	struct wg_peer *peer;
+	size_t i;
+
+	sx_assert(&sc->sc_lock, SX_XLOCKED);
+
+	peer = malloc(sizeof(*peer), M_WG, M_WAITOK|M_ZERO);
+	peer->p_sc = sc;
+	/*
+	 * peer_counter is shared by every wg interface whereas sc_lock is
+	 * per-interface, so two interfaces can allocate peers concurrently.
+	 * Fetch-and-add keeps the ids unique; it returns the old value,
+	 * exactly like the post-increment it replaces.
+	 */
+	peer->p_id = atomic_fetchadd_long(&peer_counter, 1);
+	CK_LIST_INIT(&peer->p_aips);
+
+	rw_init(&peer->p_endpoint_lock, "wg_peer_endpoint");
+	wg_queue_init(&peer->p_stage_queue, "stageq");
+	wg_queue_init(&peer->p_encap_queue, "txq");
+	wg_queue_init(&peer->p_decap_queue, "rxq");
+
+	/* Handshake, keepalive and key-wiping tasks. */
+	GROUPTASK_INIT(&peer->p_send_initiation, 0,
+	    (gtask_fn_t *)wg_send_initiation, peer);
+	taskqgroup_attach(qgroup_if_io_tqg, &peer->p_send_initiation, peer,
+	    NULL, NULL, "wg initiation");
+	GROUPTASK_INIT(&peer->p_send_keepalive, 0,
+	    (gtask_fn_t *)wg_send_keepalive, peer);
+	taskqgroup_attach(qgroup_if_io_tqg, &peer->p_send_keepalive, peer,
+	    NULL, NULL, "wg keepalive");
+	GROUPTASK_INIT(&peer->p_clear_secrets, 0,
+	    (gtask_fn_t *)noise_remote_clear, &peer->p_remote);
+	taskqgroup_attach(qgroup_if_io_tqg, &peer->p_clear_secrets,
+	    &peer->p_remote, NULL, NULL, "wg clear secrets");
+
+	/* Data-path delivery tasks. */
+	GROUPTASK_INIT(&peer->p_send, 0, (gtask_fn_t *)wg_deliver_out, peer);
+	taskqgroup_attach(qgroup_if_io_tqg, &peer->p_send, peer, NULL, NULL,
+	    "wg send");
+	GROUPTASK_INIT(&peer->p_recv, 0, (gtask_fn_t *)wg_deliver_in, peer);
+	taskqgroup_attach(qgroup_if_io_tqg, &peer->p_recv, peer, NULL, NULL,
+	    "wg recv");
+
+	wg_timers_init(&peer->p_timers);
+
+	peer->p_tx_bytes = counter_u64_alloc(M_WAITOK);
+	peer->p_rx_bytes = counter_u64_alloc(M_WAITOK);
+
+	/* Every index slot starts out unused (inserted in order 0, 1, 2). */
+	SLIST_INIT(&peer->p_unused_index);
+	for (i = 0; i < nitems(peer->p_index); i++)
+		SLIST_INSERT_HEAD(&peer->p_unused_index, &peer->p_index[i],
+		    i_unused_entry);
+
+	return (peer);
+}
+
+/*
+ * Iterate over every peer in every bucket of hashtable `ht`; `i` is the
+ * caller-supplied bucket index variable.  The buckets are CK_LIST heads,
+ * so both variants must use the CK_LIST iterators (plain LIST_FOREACH
+ * does not match a CK_LIST head).  The _SAFE variant additionally allows
+ * the current peer to be unlinked inside the loop body.
+ */
+#define WG_HASHTABLE_PEER_FOREACH(peer, i, ht)				\
+	for (i = 0; i < HASHTABLE_PEER_SIZE; i++)			\
+		CK_LIST_FOREACH(peer, &(ht)->h_peers[i], p_hash_entry)
+#define WG_HASHTABLE_PEER_FOREACH_SAFE(peer, i, ht, tpeer)		\
+	for (i = 0; i < HASHTABLE_PEER_SIZE; i++)			\
+		CK_LIST_FOREACH_SAFE(peer, &(ht)->h_peers[i], p_hash_entry, tpeer)
+/*
+ * Prepare an empty peer hashtable: create its mutex, pick a random
+ * siphash key, allocate HASHTABLE_PEER_SIZE buckets with hashinit() and
+ * initialise the all-peers list.
+ */
+static void
+wg_hashtable_init(struct wg_hashtable *ht)
+{
+	mtx_init(&ht->h_mtx, "hash lock", NULL, MTX_DEF);
+	arc4random_buf(&ht->h_secret, sizeof(ht->h_secret));
+	ht->h_num_peers = 0;
+	/*
+	 * Initialise the list head explicitly instead of depending on the
+	 * enclosing structure having been zero-filled by its allocator.
+	 */
+	CK_LIST_INIT(&ht->h_peers_list);
+	ht->h_peers = hashinit(HASHTABLE_PEER_SIZE, M_DEVBUF,
+	    &ht->h_peers_mask);
+}
+
+/*
+ * Release the resources of a peer hashtable.  Every peer must already
+ * have been removed (asserted through h_num_peers).
+ */
+static void
+wg_hashtable_destroy(struct wg_hashtable *ht)
+{
+	MPASS(ht->h_num_peers == 0);
+
+	hashdestroy(ht->h_peers, M_DEVBUF, ht->h_peers_mask);
+	mtx_destroy(&ht->h_mtx);
+}
+
+/*
+ * Link `peer` into the hashtable: onto the bucket selected by the keyed
+ * siphash24 of its public key and onto the all-peers list.  The hash is
+ * computed before h_mtx is taken; both insertions and the peer count
+ * update happen under the mutex.
+ */
+static void
+wg_hashtable_peer_insert(struct wg_hashtable *ht, struct wg_peer *peer)
+{
+	const uint64_t hash = siphash24(&ht->h_secret,
+	    peer->p_remote.r_public, sizeof(peer->p_remote.r_public));
+	const uint64_t slot = hash & ht->h_peers_mask;
+
+	mtx_lock(&ht->h_mtx);
+	ht->h_num_peers++;
+	CK_LIST_INSERT_HEAD(&ht->h_peers[slot], peer, p_hash_entry);
+	CK_LIST_INSERT_HEAD(&ht->h_peers_list, peer, p_entry);
+	mtx_unlock(&ht->h_mtx);
+}
+
+/*
+ * Find the peer of `sc` whose public key equals `pubkey`.
+ *
+ * Only the bucket selected by the keyed hash is scanned, under h_mtx,
+ * and keys are compared with timingsafe_bcmp().  Returns the peer, or
+ * NULL when no peer has that key.  No reference is taken on the result.
+ */
+static struct wg_peer *
+wg_peer_lookup(struct wg_softc *sc,
+    const uint8_t pubkey[WG_KEY_SIZE])
+{
+	struct wg_hashtable *ht = &sc->sc_hashtable;
+	struct wg_peer *cur, *found = NULL;
+	uint64_t slot;
+
+	slot = siphash24(&ht->h_secret, pubkey, WG_KEY_SIZE) &
+	    ht->h_peers_mask;
+
+	mtx_lock(&ht->h_mtx);
+	CK_LIST_FOREACH(cur, &ht->h_peers[slot], p_hash_entry) {
+		if (timingsafe_bcmp(cur->p_remote.r_public, pubkey,
+		    WG_KEY_SIZE) != 0)
+			continue;
+		found = cur;
+		break;
+	}
+	mtx_unlock(&ht->h_mtx);
+
+	return (found);
+}
+
+/*
+ * Unlink `peer` from its hash bucket and from the all-peers list, and
+ * decrement the peer count, all under h_mtx.  The peer itself is not
+ * freed here.
+ */
+static void
+wg_hashtable_peer_remove(struct wg_hashtable *ht, struct wg_peer *peer)
+{
+	mtx_lock(&ht->h_mtx);
+	CK_LIST_REMOVE(peer, p_hash_entry);
+	CK_LIST_REMOVE(peer, p_entry);
+	ht->h_num_peers--;
+	mtx_unlock(&ht->h_mtx);
+}
+
+/*
+ * Epoch callback scheduled by wg_peer_destroy(): recover the peer from
+ * its embedded epoch context, destroy its two rwlocks, free its byte
+ * counters and finally free the peer itself.
+ */
+static void
+wg_peer_free_deferred(epoch_context_t ctx)
+{
+	struct wg_peer *dead;
+
+	dead = __containerof(ctx, struct wg_peer, p_ctx);
+
+	rw_destroy(&dead->p_endpoint_lock);
+	rw_destroy(&dead->p_timers.t_lock);
+
+	counter_u64_free(dead->p_rx_bytes);
+	counter_u64_free(dead->p_tx_bytes);
+
+	free(dead, M_WG);
+}
+
+static void
+wg_peer_destroy(struct wg_peer *peer)
+{
+ /* Tear down a peer and schedule its memory to be freed.
+ * The peer is NOT unlinked from the hashtable here; callers must
+ * already have called:
+ * wg_hashtable_peer_remove(&sc->sc_hashtable, peer);
+ */
+ wg_aip_delete(&peer->p_sc->sc_aips, peer); /* drop every allowed-IP owned by this peer */
+ MPASS(CK_LIST_EMPTY(&peer->p_aips));
+
+ /* Disable all timers first, so the tasks drained below are not re-queued by a timer. */
+ wg_timers_disable(&peer->p_timers);
+
+ /* Wait for any task instance that is already queued or running to finish. */
+ GROUPTASK_DRAIN(&peer->p_clear_secrets);
+ GROUPTASK_DRAIN(&peer->p_send_initiation);
+ GROUPTASK_DRAIN(&peer->p_send_keepalive);
+ GROUPTASK_DRAIN(&peer->p_recv);
+ GROUPTASK_DRAIN(&peer->p_send);
+
+ taskqgroup_detach(qgroup_if_io_tqg, &peer->p_clear_secrets); /* undo the attaches made in wg_peer_alloc() */
+ taskqgroup_detach(qgroup_if_io_tqg, &peer->p_send_initiation);
+ taskqgroup_detach(qgroup_if_io_tqg, &peer->p_send_keepalive);
+ taskqgroup_detach(qgroup_if_io_tqg, &peer->p_recv);
+ taskqgroup_detach(qgroup_if_io_tqg, &peer->p_send);
+
+ wg_queue_deinit(&peer->p_decap_queue);
+ wg_queue_deinit(&peer->p_encap_queue);
+ wg_queue_deinit(&peer->p_stage_queue);
+
+ /* Final cleanup: update the interface's peer count, wipe key material, log. */
+ --peer->p_sc->sc_peer_count; /* NOTE(review): plain decrement; presumably serialised by sc_lock -- confirm in callers */
+ noise_remote_clear(&peer->p_remote);
+ DPRINTF(peer->p_sc, "Peer %llu destroyed\n", (unsigned long long)peer->p_id);
+ NET_EPOCH_CALL(wg_peer_free_deferred, &peer->p_ctx); /* the actual free happens in wg_peer_free_deferred() */
+}
+
+/*
+ * Adopt the endpoint recorded in mbuf tag `t` as the peer's current
+ * endpoint.  Nothing is written when the two are already byte-identical.
+ *
+ * The compare and the update both run under the write side of
+ * p_endpoint_lock so that concurrent readers and writers of p_endpoint
+ * never observe a half-written endpoint.
+ */
+static void
+wg_peer_set_endpoint_from_tag(struct wg_peer *peer, struct wg_tag *t)
+{
+	struct wg_endpoint *e = &t->t_endpoint;
+
+	MPASS(e->e_remote.r_sa.sa_family != 0);
+
+	rw_wlock(&peer->p_endpoint_lock);
+	if (memcmp(e, &peer->p_endpoint, sizeof(*e)) != 0)
+		peer->p_endpoint = *e;
+	rw_wunlock(&peer->p_endpoint_lock);
+}
+
+/*
+ * Forget the local source address cached in the peer's endpoint; the
+ * remote address is left untouched.
+ *
+ * This modifies p_endpoint, so the lock must be taken for writing: a read
+ * lock would let several threads run the bzero() concurrently with each
+ * other and with readers.
+ */
+static void
+wg_peer_clear_src(struct wg_peer *peer)
+{
+	rw_wlock(&peer->p_endpoint_lock);
+	bzero(&peer->p_endpoint.e_local, sizeof(peer->p_endpoint.e_local));
+	rw_wunlock(&peer->p_endpoint_lock);
+}
+
+/*
+ * Copy the peer's current endpoint into the caller-provided `e`.
+ *
+ * The copy is made under the read side of p_endpoint_lock so the caller
+ * gets a consistent snapshot even while another thread is updating the
+ * endpoint under the write lock.
+ */
+static void
+wg_peer_get_endpoint(struct wg_peer *p, struct wg_endpoint *e)
+{
+	rw_rlock(&p->p_endpoint_lock);
+	memcpy(e, &p->p_endpoint, sizeof(*e));
+	rw_runlock(&p->p_endpoint_lock);
+}
+
+/* Allowed IP */
+/*
+ * Create the allowed-IP radix trees of `tbl`: always the IPv4 tree, and
+ * the IPv6 tree when the kernel is built with INET6.  The key offset
+ * handed to rn_inithead() is the bit offset of the address inside the
+ * respective sockaddr.
+ *
+ * Returns 0 on success or ENOMEM when a tree cannot be allocated.  On
+ * failure nothing is left allocated: an already-created IPv4 tree has its
+ * lock destroyed and is released with rn_detachhead(), which frees the
+ * tree head together with the mask head that rn_inithead() allocated (a
+ * plain free() of the head would leak the latter).
+ */
+static int
+wg_aip_init(struct wg_aip_table *tbl)
+{
+	tbl->t_count = 0;
+
+	if (rn_inithead((void **)&tbl->t_ip,
+	    offsetof(struct sockaddr_in, sin_addr) * NBBY) == 0)
+		return (ENOMEM);
+	RADIX_NODE_HEAD_LOCK_INIT(tbl->t_ip);
+#ifdef INET6
+	if (rn_inithead((void **)&tbl->t_ip6,
+	    offsetof(struct sockaddr_in6, sin6_addr) * NBBY) == 0) {
+		RADIX_NODE_HEAD_DESTROY(tbl->t_ip);
+		rn_detachhead((void **)&tbl->t_ip);
+		return (ENOMEM);
+	}
+	RADIX_NODE_HEAD_LOCK_INIT(tbl->t_ip6);
+#endif
+	return (0);
+}
+
+/*
+ * Release the radix trees created by wg_aip_init().
+ *
+ * Each tree has its lock destroyed and is then released with
+ * rn_detachhead(), the counterpart of rn_inithead(): it frees the mask
+ * head as well as the tree head and resets the pointer to NULL.  Freeing
+ * only the head with free() leaks the mask head.
+ */
+static void
+wg_aip_destroy(struct wg_aip_table *tbl)
+{
+	RADIX_NODE_HEAD_DESTROY(tbl->t_ip);
+	rn_detachhead((void **)&tbl->t_ip);
+#ifdef INET6
+	RADIX_NODE_HEAD_DESTROY(tbl->t_ip6);
+	rn_detachhead((void **)&tbl->t_ip6);
+#endif
+}
+
+/*
+ * Fill in the IPv4 address and netmask of `aip` from `addr` and a prefix
+ * length of `mask` bits.  The mask is written byte by byte starting at
+ * sin_addr, and the stored address is ANDed with it so that only the
+ * network part remains.  `mask` is not range-checked here.
+ */
+static void
+wg_aip_populate_aip4(struct wg_aip *aip, const struct in_addr *addr,
+    uint8_t mask)
+{
+	struct sockaddr_in *net = (struct sockaddr_in *)&aip->r_addr;
+	struct sockaddr_in *msk = (struct sockaddr_in *)&aip->r_mask;
+	uint8_t *bytes = (uint8_t *)&msk->sin_addr.s_addr;
+	const unsigned int whole = mask / NBBY;	/* fully-set mask bytes */
+	const unsigned int rem = mask % NBBY;	/* bits set in the next byte */
+
+	net->sin_len = sizeof(*net);
+	net->sin_family = AF_INET;
+	net->sin_addr = *addr;
+
+	msk->sin_len = sizeof(*msk);
+	memset(bytes, 0xff, whole);
+	if (rem != 0)
+		bytes[whole] = (uint8_t)(0xff00 >> rem);
+
+	net->sin_addr.s_addr &= msk->sin_addr.s_addr;
+}
+
+/*
+ * Fill in the IPv6 address and netmask of `aip` from `addr` and a prefix
+ * length of `mask` bits.  The mask comes from in6_prefixlen2mask(); the
+ * stored address is ANDed with it one 32-bit word at a time.
+ */
+static void
+wg_aip_populate_aip6(struct wg_aip *aip, const struct in6_addr *addr,
+    uint8_t mask)
+{
+	struct sockaddr_in6 *net = (struct sockaddr_in6 *)&aip->r_addr;
+	struct sockaddr_in6 *msk = (struct sockaddr_in6 *)&aip->r_mask;
+	int word;
+
+	net->sin6_len = sizeof(*net);
+	net->sin6_family = AF_INET6;
+	net->sin6_addr = *addr;
+
+	msk->sin6_len = sizeof(*msk);
+	in6_prefixlen2mask(&msk->sin6_addr, mask);
+
+	for (word = 0; word < 4; word++) {
+		net->sin6_addr.__u6_addr.__u6_addr32[word] &=
+		    msk->sin6_addr.__u6_addr.__u6_addr32[word];
+	}
+}
+
+/*
+ * Hand an allowed-IP prefix that is already in the tree over to `peer`.
+ *
+ * `route` only supplies the address/mask to look up; the entry that is
+ * modified is the one found in `root`.  The caller must hold the tree
+ * lock and guarantees that the prefix exists.  When the entry already
+ * belongs to `peer` nothing changes; otherwise it is re-pointed at `peer`
+ * and moved onto that peer's p_aips list.
+ */
+static void
+wg_aip_take(struct radix_node_head *root, struct wg_peer *peer,
+    struct wg_aip *route)
+{
+	struct wg_aip *existing;
+
+	RADIX_NODE_HEAD_LOCK_ASSERT(root);
+
+	existing = (struct wg_aip *)root->rnh_lookup(&route->r_addr,
+	    &route->r_mask, &root->rh);
+	MPASS(existing != NULL);
+
+	if (existing->r_peer == peer)
+		return;
+
+	existing->r_peer = peer;
+	CK_LIST_REMOVE(existing, r_entry);
+	CK_LIST_INSERT_HEAD(&peer->p_aips, existing, r_entry);
+}
+
+static int
+wg_aip_add(struct wg_aip_table *tbl, struct wg_peer *peer,
+ const struct wg_allowedip *aip)
+{
+ struct radix_node *node;
+ struct radix_node_head *root;
+ struct wg_aip *route;
+ sa_family_t family;
+ bool needfree = false;
+
+ family = aip->family;
+ if (family != AF_INET && family != AF_INET6) {
+ return (EINVAL);
+ }
+
+ route = malloc(sizeof(*route), M_WG, M_WAITOK|M_ZERO);
+ switch (family) {
+ case AF_INET:
+ root = tbl->t_ip;
+
+ wg_aip_populate_aip4(route, &aip->ip4, aip->cidr);
+ break;
+ case AF_INET6:
+ root = tbl->t_ip6;
+
+ wg_aip_populate_aip6(route, &aip->ip6, aip->cidr);
+ break;
+ }
+
+ route->r_peer = peer;
+
+ RADIX_NODE_HEAD_LOCK(root);
+ node = root->rnh_addaddr(&route->r_addr, &route->r_mask, &root->rh,
+ route->r_nodes);
+ if (node == route->r_nodes) {
+ tbl->t_count++;
+ CK_LIST_INSERT_HEAD(&peer->p_aips, route, r_entry);
+ } else {
+ needfree = true;
+ wg_aip_take(root, peer, route);
+ }
+ RADIX_NODE_HEAD_UNLOCK(root);
+ if (needfree) {
+ free(route, M_WG);
+ }
+ return (0);
+}
+
+/*
+ * Find the peer whose allowed-IPs cover the packet in `m`.
+ *
+ * The IP version is read from the first byte of the packet.  With
+ * dir == IN the source address is matched, with dir == OUT the
+ * destination address.  Must be called inside a network epoch section.
+ * Returns the owning peer, or NULL when the direction is invalid, the
+ * packet is neither IPv4 nor IPv6, or no prefix matches.
+ */
+static struct wg_peer *
+wg_aip_lookup(struct wg_aip_table *tbl, struct mbuf *m,
+    enum route_direction dir)
+{
+	RADIX_NODE_HEAD_RLOCK_TRACKER;
+	struct radix_node_head *root;
+	struct radix_node *match;
+	struct wg_peer *owner = NULL;
+	struct sockaddr_in sin;
+	struct sockaddr_in6 sin6;
+	struct ip *ip4;
+	struct ip6_hdr *ip6;
+	void *key;
+
+	NET_EPOCH_ASSERT();
+	ip4 = mtod(m, struct ip *);
+
+	if (__predict_false(dir != IN && dir != OUT))
+		return NULL;
+
+	/* Build the lookup key in the sockaddr layout the trees use. */
+	switch (ip4->ip_v) {
+	case 4:
+		memset(&sin, 0, sizeof(sin));
+		sin.sin_len = sizeof(struct sockaddr_in);
+		sin.sin_addr = (dir == IN) ? ip4->ip_src : ip4->ip_dst;
+		root = tbl->t_ip;
+		key = &sin;
+		break;
+	case 6:
+		ip6 = mtod(m, struct ip6_hdr *);
+		memset(&sin6, 0, sizeof(sin6));
+		sin6.sin6_len = sizeof(struct sockaddr_in6);
+		memcpy(&sin6.sin6_addr,
+		    (dir == IN) ? &ip6->ip6_src : &ip6->ip6_dst,
+		    sizeof(sin6.sin6_addr));
+		root = tbl->t_ip6;
+		key = &sin6;
+		break;
+	default:
+		return (NULL);
+	}
+
+	RADIX_NODE_HEAD_RLOCK(root);
+	match = root->rnh_matchaddr(key, &root->rh);
+	if (match != NULL)
+		owner = ((struct wg_aip *)match)->r_peer;
+	RADIX_NODE_HEAD_RUNLOCK(root);
+
+	return (owner);
+}
+
+struct peer_del_arg { /* Argument bundle handed to wg_peer_remove() through rn_walktree(). */
+ struct radix_node_head * pda_head; /* tree currently being walked */
+ struct wg_peer *pda_peer; /* peer whose prefixes are to be removed */
+ struct wg_aip_table *pda_tbl; /* table whose t_count is decremented per removal */
+};
+
+/*
+ * rn_walktree() callback used by wg_aip_delete().
+ *
+ * `arg` is a struct peer_del_arg.  Entries that belong to another peer
+ * are skipped.  An entry of the target peer is deleted from the tree
+ * and, if the deletion succeeded, unlinked from the peer's list,
+ * uncounted and freed.  Always returns 0 so that the walk continues.
+ */
+static int
+wg_peer_remove(struct radix_node *rn, void *arg)
+{
+	struct peer_del_arg *pda = arg;
+	struct radix_node_head *head = pda->pda_head;
+	struct wg_aip *entry = (struct wg_aip *)rn;
+
+	if (entry->r_peer != pda->pda_peer)
+		return (0);
+
+	if (head->rnh_deladdr(&entry->r_addr, &entry->r_mask,
+	    &head->rh) == NULL)
+		return (0);
+
+	pda->pda_tbl->t_count--;
+	CK_LIST_REMOVE(entry, r_entry);
+	free(entry, M_WG);
+	return (0);
+}
+
+/*
+ * Remove and destroy every peer of interface `sc`.  The caller must hold
+ * sc->sc_lock exclusively.  The successor is fetched before each peer is
+ * unlinked, because wg_peer_destroy() gives the peer up for freeing.
+ */
+static void
+wg_peer_remove_all(struct wg_softc *sc)
+{
+	struct wg_hashtable *ht = &sc->sc_hashtable;
+	struct wg_peer *cur, *next;
+
+	sx_assert(&sc->sc_lock, SX_XLOCKED);
+
+	for (cur = CK_LIST_FIRST(&ht->h_peers_list); cur != NULL;
+	    cur = next) {
+		next = CK_LIST_NEXT(cur, p_entry);
+		wg_hashtable_peer_remove(ht, cur);
+		wg_peer_destroy(cur);
+	}
+}
+
+static int
+wg_aip_delete(struct wg_aip_table *tbl, struct wg_peer *peer)
+{
+ struct peer_del_arg pda;
+
+ pda.pda_peer = peer;
+ pda.pda_tbl = tbl;
+ RADIX_NODE_HEAD_LOCK(tbl->t_ip);
+ pda.pda_head = tbl->t_ip;
+ rn_walktree(&tbl->t_ip->rh, wg_peer_remove, &pda);
+ RADIX_NODE_HEAD_UNLOCK(tbl->t_ip);
+
+ RADIX_NODE_HEAD_LOCK(tbl->t_ip6);
+ pda.pda_head = tbl->t_ip6;
+ rn_walktree(&tbl->t_ip6->rh, wg_peer_remove, &pda);
+ RADIX_NODE_HEAD_UNLOCK(tbl->t_ip6);
+ return (0);
+}
+
+/*
+ * Create the IPv4 and IPv6 UDP sockets of interface `sc`, hook wg_input()
+ * into both as the kernel tunnelling callback, bind them to `port` (0 lets
+ * the stack choose one) and install them with wg_socket_set().
+ *
+ * The caller must hold sc->sc_lock exclusively.  Returns 0 on success,
+ * EBUSY when the interface has no stored credentials, or the error from
+ * socreate()/wg_socket_bind().  On any failure every socket created here
+ * is closed again with soclose(), so nothing is leaked -- this includes
+ * the case where both sockets exist but binding fails.
+ */
+static int
+wg_socket_init(struct wg_softc *sc, in_port_t port)
+{
+	struct thread *td;
+	struct ucred *cred;
+	struct socket *so4 = NULL, *so6 = NULL;
+	int rc;
+
+	sx_assert(&sc->sc_lock, SX_XLOCKED);
+
+	td = curthread;
+	if (sc->sc_ucred == NULL)
+		return (EBUSY);
+	cred = crhold(sc->sc_ucred);
+
+	/*
+	 * For socket creation, we use the creds of the thread that created the
+	 * tunnel rather than the current thread to maintain the semantics that
+	 * WireGuard has on Linux with network namespaces -- that the sockets
+	 * are created in their home vnet so that they can be configured and
+	 * functionally attached to a foreign vnet as the jail's only interface
+	 * to the network.
+	 */
+	rc = socreate(AF_INET, &so4, SOCK_DGRAM, IPPROTO_UDP, cred, td);
+	if (rc != 0)
+		goto out;
+
+	rc = udp_set_kernel_tunneling(so4, wg_input, NULL, sc);
+	/*
+	 * udp_set_kernel_tunneling can only fail if there is already a
+	 * tunneling function set.  This should never happen with a new socket.
+	 */
+	MPASS(rc == 0);
+
+	rc = socreate(AF_INET6, &so6, SOCK_DGRAM, IPPROTO_UDP, cred, td);
+	if (rc != 0)
+		goto out;
+	rc = udp_set_kernel_tunneling(so6, wg_input, NULL, sc);
+	MPASS(rc == 0);
+
+	so4->so_user_cookie = so6->so_user_cookie = sc->sc_socket.so_user_cookie;
+
+	rc = wg_socket_bind(so4, so6, &port);
+	if (rc == 0) {
+		sc->sc_socket.so_port = port;
+		/* Ownership of both sockets passes to sc->sc_socket. */
+		wg_socket_set(sc, so4, so6);
+	}
+out:
+	if (rc != 0) {
+		/* Nothing was installed: release whatever was created. */
+		if (so4 != NULL)
+			soclose(so4);
+		if (so6 != NULL)
+			soclose(so6);
+	}
+	crfree(cred);
+	return (rc);
+}
+
+/*
+ * Record `user_cookie` for the interface and copy it into whichever of
+ * the two UDP sockets currently exist.  The caller must hold sc->sc_lock
+ * exclusively.
+ */
+static void
+wg_socket_set_cookie(struct wg_softc *sc, uint32_t user_cookie)
+{
+	struct wg_socket *ws = &sc->sc_socket;
+
+	sx_assert(&sc->sc_lock, SX_XLOCKED);
+
+	ws->so_user_cookie = user_cookie;
+	if (ws->so_so4 != NULL)
+		ws->so_so4->so_user_cookie = user_cookie;
+	if (ws->so_so6 != NULL)
+		ws->so_so6->so_user_cookie = user_cookie;
+}
+
+static void
+wg_socket_uninit(struct wg_softc *sc)
+{ /* Detach and close the interface's sockets: installing NULL for both makes wg_socket_set() close the old ones. */
+ wg_socket_set(sc, NULL, NULL);
+}
+
+static void
+wg_socket_set(struct wg_softc *sc, struct socket *new_so4, struct socket *new_so6)
+{ /* Replace the interface's socket pair with new_so4/new_so6 (either may be NULL) and close the previous pair. */
+ struct wg_socket *so = &sc->sc_socket;
+ struct socket *so4, *so6;
+
+ sx_assert(&sc->sc_lock, SX_XLOCKED); /* caller must hold sc_lock exclusively */
+
+ so4 = atomic_load_ptr(&so->so_so4); /* remember the old sockets ... */
+ so6 = atomic_load_ptr(&so->so_so6);
+ atomic_store_ptr(&so->so_so4, new_so4); /* ... and publish the new ones */
+ atomic_store_ptr(&so->so_so6, new_so6);
+
+ if (!so4 && !so6) /* nothing was installed before, so nothing to close */
+ return;
+ NET_EPOCH_WAIT(); /* wg_send() reads the pointers inside an epoch section; wait for it to leave before closing */
+ if (so4)
+ soclose(so4);
+ if (so6)
+ soclose(so6);
+}
+
+union wg_sockaddr { /* Scratch storage large enough for either address family; used by wg_socket_bind(). */
+ struct sockaddr sa; /* generic view passed to sobind() */
+ struct sockaddr_in in4;
+ struct sockaddr_in6 in6;
+};
+
+/*
+ * Bind `so4` and `so6` to the wildcard address on the same UDP port.
+ *
+ * *requested_port is the wanted port in host byte order; 0 asks the stack
+ * to choose.  In that case the port assigned to the IPv4 socket is read
+ * back and reused for the IPv6 socket.  On success the port actually used
+ * is stored in *requested_port and 0 is returned; otherwise the error
+ * from sobind()/sogetsockaddr() is returned and *requested_port is left
+ * unchanged.
+ */
+static int
+wg_socket_bind(struct socket *so4, struct socket *so6, in_port_t *requested_port)
+{
+	union wg_sockaddr addr;
+	struct sockaddr_in *bound;
+	struct thread *td = curthread;
+	in_port_t port = *requested_port;
+	int error;
+
+	bzero(&addr, sizeof(addr));
+	addr.in4.sin_len = sizeof(addr.in4);
+	addr.in4.sin_family = AF_INET;
+	addr.in4.sin_port = htons(port);
+	addr.in4.sin_addr = (struct in_addr) { 0 };
+
+	error = sobind(so4, &addr.sa, td);
+	if (error != 0)
+		return (error);
+
+	if (port == 0) {
+		/* Learn which port the stack picked for the IPv4 socket. */
+		bound = &addr.in4;
+		error = sogetsockaddr(so4, (struct sockaddr **)&bound);
+		if (error != 0)
+			return (error);
+		port = ntohs(bound->sin_port);
+		free(bound, M_SONAME);
+	}
+
+	/* Reuse the same storage for the IPv6 wildcard address. */
+	addr.in6.sin6_len = sizeof(addr.in6);
+	addr.in6.sin6_family = AF_INET6;
+	addr.in6.sin6_port = htons(port);
+	addr.in6.sin6_addr = (struct in6_addr) { .s6_addr = { 0 } };
+
+	error = sobind(so6, &addr.sa, td);
+	if (error != 0)
+		return (error);
+
+	*requested_port = port;
+	return (0);
+}
+
+/*
+ * Transmit mbuf chain `m` to endpoint `e` over the interface's UDP socket
+ * of the matching address family.
+ *
+ * When the endpoint carries a specific local address it is attached as a
+ * control message (IP_SENDSRCADDR / IPV6_PKTINFO).  `m` is always
+ * consumed.  Returns 0 on success (output counters are then bumped),
+ * EAFNOSUPPORT for an endpoint that is neither IPv4 nor IPv6, ENOTCONN
+ * when the needed socket does not exist, or the error from sosend().
+ */
+static int
+wg_send(struct wg_softc *sc, struct wg_endpoint *e, struct mbuf *m)
+{
+	struct epoch_tracker et;
+	struct wg_socket *ws = &sc->sc_socket;
+	struct socket *sock;
+	struct mbuf *control = NULL;
+	const sa_family_t af = e->e_remote.r_sa.sa_family;
+	const size_t len = m->m_pkthdr.len;
+	int ret;
+
+	/* Build the source-address control message before entering the epoch. */
+	switch (af) {
+	case AF_INET:
+		if (e->e_local.l_in.s_addr != INADDR_ANY)
+			control = sbcreatecontrol((caddr_t)&e->e_local.l_in,
+			    sizeof(struct in_addr), IP_SENDSRCADDR,
+			    IPPROTO_IP);
+		break;
+	case AF_INET6:
+		if (!IN6_IS_ADDR_UNSPECIFIED(&e->e_local.l_in6))
+			control = sbcreatecontrol((caddr_t)&e->e_local.l_pktinfo6,
+			    sizeof(struct in6_pktinfo), IPV6_PKTINFO,
+			    IPPROTO_IPV6);
+		break;
+	default:
+		m_freem(m);
+		return (EAFNOSUPPORT);
+	}
+
+	NET_EPOCH_ENTER(et);
+	if (af == AF_INET)
+		sock = atomic_load_ptr(&ws->so_so4);
+	else
+		sock = atomic_load_ptr(&ws->so_so6);
+
+	if (sock != NULL) {
+		ret = sosend(sock, &e->e_remote.r_sa, NULL, m, control, 0,
+		    curthread);
+	} else {
+		/* No socket for this family: drop everything ourselves. */
+		ret = ENOTCONN;
+		m_freem(control);
+		m_freem(m);
+	}
+	NET_EPOCH_EXIT(et);
+
+	if (ret == 0) {
+		if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1);
+		if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, len);
+	}
+	return (ret);
+}
+
+/*
+ * Copy len bytes from buf into a freshly allocated mbuf and send it to
+ * endpoint e. If the first attempt fails with EADDRNOTAVAIL the cached local
+ * address in e is cleared and the send is repeated exactly once. A final
+ * failure is only logged.
+ */
+static void
+wg_send_buf(struct wg_softc *sc, struct wg_endpoint *e, uint8_t *buf,
+    size_t len)
+{
+	struct mbuf *pkt;
+	int attempt, error;
+
+	for (attempt = 0; ; attempt++) {
+		/* Each attempt needs its own mbuf; wg_send() consumes it. */
+		pkt = m_gethdr(M_WAITOK, MT_DATA);
+		pkt->m_len = 0;
+		m_copyback(pkt, 0, len, buf);
+
+		error = wg_send(sc, e, pkt);
+		/* Only the first attempt may be retried, and only for this error. */
+		if (attempt != 0 || error != EADDRNOTAVAIL)
+			break;
+		/* Retry if we couldn't bind to e->e_local */
+		bzero(&e->e_local, sizeof(e->e_local));
+	}
+	if (error)
+		DPRINTF(sc, "Unable to send packet: %d\n", error);
+}
+
+/* TODO Tag */
+/*
+ * Return the WireGuard tag attached to mbuf m, attaching a new zero-filled
+ * one if m does not carry a tag yet.
+ *
+ * Returns NULL if a new tag is needed but cannot be allocated (the
+ * allocation uses M_NOWAIT); callers must handle that.
+ */
+static struct wg_tag *
+wg_tag_get(struct mbuf *m)
+{
+	struct m_tag *tag;
+
+	tag = m_tag_find(m, MTAG_WIREGUARD, NULL);
+	if (tag != NULL)
+		return ((struct wg_tag *)tag);
+
+	tag = m_tag_get(MTAG_WIREGUARD, sizeof(struct wg_tag), M_NOWAIT|M_ZERO);
+	/* M_NOWAIT allocations can fail; never prepend a NULL tag. */
+	if (tag == NULL)
+		return (NULL);
+	m_tag_prepend(m, tag);
+	MPASS(!SLIST_EMPTY(&m->m_pkthdr.tags));
+	MPASS(m_tag_locate(m, MTAG_ABI_COMPAT, MTAG_WIREGUARD, NULL) == tag);
+	return ((struct wg_tag *)tag);
+}
+
+/*
+ * Return the endpoint stored in m's WireGuard tag, or NULL when
+ * wg_tag_get() yields no tag.
+ */
+static struct wg_endpoint *
+wg_mbuf_endpoint_get(struct mbuf *m)
+{
+	struct wg_tag *tag = wg_tag_get(m);
+
+	return (tag != NULL ? &tag->t_endpoint : NULL);
+}
+
+/* Timers */
+/*
+ * Put a peer's timer state into its initial condition: everything zeroed,
+ * timers marked disabled, the lock initialised and every callout set up.
+ */
+static void
+wg_timers_init(struct wg_timers *t)
+{
+	struct callout *all[5];
+	size_t i;
+
+	bzero(t, sizeof(*t));
+
+	t->t_disabled = 1;
+	rw_init(&t->t_lock, "wg peer timers");
+
+	all[0] = &t->t_retry_handshake;
+	all[1] = &t->t_send_keepalive;
+	all[2] = &t->t_new_handshake;
+	all[3] = &t->t_zero_key_material;
+	all[4] = &t->t_persistent_keepalive;
+	for (i = 0; i < sizeof(all) / sizeof(all[0]); i++)
+		callout_init(all[i], true);
+}
+
+/*
+ * Mark the timers as enabled and then run the persistent-keepalive handler
+ * once, which queues a keepalive if an interval is configured.
+ */
+static void
+wg_timers_enable(struct wg_timers *t)
+{
+ rw_wlock(&t->t_lock);
+ t->t_disabled = 0;
+ rw_wunlock(&t->t_lock);
+ /* Called after the lock has been dropped. */
+ wg_timers_run_persistent_keepalive(t);
+}
+
+/*
+ * Mark the timers as disabled, forget any pending "another keepalive"
+ * request, and stop all five callouts.
+ */
+static void
+wg_timers_disable(struct wg_timers *t)
+{
+ /*
+  * The event functions check t_disabled under this lock before arming a
+  * callout, so set it first.
+  */
+ rw_wlock(&t->t_lock);
+ t->t_disabled = 1;
+ t->t_need_another_keepalive = 0;
+ rw_wunlock(&t->t_lock);
+
+ /*
+  * NOTE(review): callout_stop() is used rather than a draining variant;
+  * confirm that a handler already running at this point is acceptable.
+  */
+ callout_stop(&t->t_retry_handshake);
+ callout_stop(&t->t_send_keepalive);
+ callout_stop(&t->t_new_handshake);
+ callout_stop(&t->t_zero_key_material);
+ callout_stop(&t->t_persistent_keepalive);
+}
+
+/*
+ * Set the persistent keepalive interval and immediately run the
+ * persistent-keepalive handler. Does nothing while the timers are disabled.
+ *
+ * NOTE(review): t_persistent_keepalive_interval is written while only the
+ * read lock is held; confirm this is intentional.
+ */
+static void
+wg_timers_set_persistent_keepalive(struct wg_timers *t, uint16_t interval)
+{
+ rw_rlock(&t->t_lock);
+ if (!t->t_disabled) {
+ t->t_persistent_keepalive_interval = interval;
+ wg_timers_run_persistent_keepalive(t);
+ }
+ rw_runlock(&t->t_lock);
+}
+
+/*
+ * Copy the time of the last completed handshake into *time, under the
+ * timers read lock.
+ */
+static void
+wg_timers_get_last_handshake(struct wg_timers *t, struct timespec *time)
+{
+	rw_rlock(&t->t_lock);
+	*time = t->t_handshake_complete;
+	rw_runlock(&t->t_lock);
+}
+
+/*
+ * Report whether more than REKEY_TIMEOUT seconds of uptime have passed since
+ * t_handshake_last_sent. Returns ETIMEDOUT if so, 0 otherwise.
+ * Takes no lock itself.
+ */
+static int
+wg_timers_expired_handshake_last_sent(struct wg_timers *t)
+{
+	struct timespec now, deadline;
+	struct timespec timeout = { .tv_sec = REKEY_TIMEOUT, .tv_nsec = 0 };
+
+	getnanouptime(&now);
+	/* deadline = last sent + REKEY_TIMEOUT */
+	timespecadd(&t->t_handshake_last_sent, &timeout, &deadline);
+	if (timespeccmp(&now, &deadline, >))
+		return (ETIMEDOUT);
+	return (0);
+}
+
+/*
+ * Check, under the write lock, whether the last-sent handshake timestamp has
+ * expired; if it has, restart it from the current uptime.
+ * Returns ETIMEDOUT when it had expired, 0 otherwise.
+ */
+static int
+wg_timers_check_handshake_last_sent(struct wg_timers *t)
+{
+	int expired;
+
+	rw_wlock(&t->t_lock);
+	expired = wg_timers_expired_handshake_last_sent(t);
+	if (expired == ETIMEDOUT)
+		getnanouptime(&t->t_handshake_last_sent);
+	rw_wunlock(&t->t_lock);
+
+	return (expired);
+}
+
+/*
+ * Should be called after an authenticated data packet is sent.
+ *
+ * Unless the timers are disabled or the new-handshake callout is already
+ * pending, arms it to fire after NEW_HANDSHAKE_TIMEOUT seconds plus a random
+ * jitter below REKEY_TIMEOUT_JITTER milliseconds.
+ */
+static void
+wg_timers_event_data_sent(struct wg_timers *t)
+{
+ rw_rlock(&t->t_lock);
+ if (!t->t_disabled && !callout_pending(&t->t_new_handshake))
+ callout_reset(&t->t_new_handshake, MSEC_2_TICKS(
+ NEW_HANDSHAKE_TIMEOUT * 1000 +
+ arc4random_uniform(REKEY_TIMEOUT_JITTER)),
+ (timeout_t *)wg_timers_run_new_handshake, t);
+ rw_runlock(&t->t_lock);
+}
+
+/*
+ * Should be called after an authenticated data packet is received.
+ *
+ * Arms the send-keepalive callout for KEEPALIVE_TIMEOUT seconds if it is not
+ * already pending; if it is pending, records that another keepalive is
+ * wanted once the current one has run. No-op while timers are disabled.
+ */
+static void
+wg_timers_event_data_received(struct wg_timers *t)
+{
+ rw_rlock(&t->t_lock);
+ if (!t->t_disabled) {
+ if (!callout_pending(&t->t_send_keepalive)) {
+ callout_reset(&t->t_send_keepalive,
+ MSEC_2_TICKS(KEEPALIVE_TIMEOUT * 1000),
+ (timeout_t *)wg_timers_run_send_keepalive, t);
+ } else {
+ /* Consumed by wg_timers_run_send_keepalive(). */
+ t->t_need_another_keepalive = 1;
+ }
+ }
+ rw_runlock(&t->t_lock);
+}
+
+/*
+ * Should be called after any type of authenticated packet is sent, whether
+ * keepalive, data, or handshake.
+ *
+ * Cancels the pending send-keepalive callout.
+ */
+static void
+wg_timers_event_any_authenticated_packet_sent(struct wg_timers *t)
+{
+ callout_stop(&t->t_send_keepalive);
+}
+
+/*
+ * Should be called after any type of authenticated packet is received, whether
+ * keepalive, data, or handshake.
+ *
+ * Cancels the pending new-handshake callout (armed by
+ * wg_timers_event_data_sent()).
+ */
+static void
+wg_timers_event_any_authenticated_packet_received(struct wg_timers *t)
+{
+ callout_stop(&t->t_new_handshake);
+}
+
+/*
+ * Should be called before a packet with authentication, whether
+ * keepalive, data, or handshake is sent, or after one is received.
+ *
+ * Restarts the persistent-keepalive callout for the configured interval (in
+ * seconds). Does nothing if the timers are disabled or no interval is set.
+ */
+static void
+wg_timers_event_any_authenticated_packet_traversal(struct wg_timers *t)
+{
+ rw_rlock(&t->t_lock);
+ if (!t->t_disabled && t->t_persistent_keepalive_interval > 0)
+ callout_reset(&t->t_persistent_keepalive,
+ MSEC_2_TICKS(t->t_persistent_keepalive_interval * 1000),
+ (timeout_t *)wg_timers_run_persistent_keepalive, t);
+ rw_runlock(&t->t_lock);
+}
+
+/*
+ * Should be called after a handshake initiation message is sent.
+ *
+ * Arms the retry-handshake callout for REKEY_TIMEOUT seconds plus a random
+ * jitter below REKEY_TIMEOUT_JITTER milliseconds, unless timers are disabled.
+ */
+static void
+wg_timers_event_handshake_initiated(struct wg_timers *t)
+{
+ rw_rlock(&t->t_lock);
+ if (!t->t_disabled)
+ callout_reset(&t->t_retry_handshake, MSEC_2_TICKS(
+ REKEY_TIMEOUT * 1000 +
+ arc4random_uniform(REKEY_TIMEOUT_JITTER)),
+ (timeout_t *)wg_timers_run_retry_handshake, t);
+ rw_runlock(&t->t_lock);
+}
+
+/*
+ * Record the current uptime as the moment a handshake message was last
+ * sent. wg_send_response() calls this just before sending a response.
+ */
+static void
+wg_timers_event_handshake_responded(struct wg_timers *t)
+{
+ rw_wlock(&t->t_lock);
+ getnanouptime(&t->t_handshake_last_sent);
+ rw_wunlock(&t->t_lock);
+}
+
+/*
+ * Should be called after a handshake response message is received and processed
+ * or when getting key confirmation via the first data message.
+ *
+ * Stops the retry callout, resets the retry counter, records the wall-clock
+ * completion time and queues a keepalive. No-op while timers are disabled.
+ */
+static void
+wg_timers_event_handshake_complete(struct wg_timers *t)
+{
+ rw_wlock(&t->t_lock);
+ if (!t->t_disabled) {
+ callout_stop(&t->t_retry_handshake);
+ t->t_handshake_retries = 0;
+ /* Reported to callers of wg_timers_get_last_handshake(). */
+ getnanotime(&t->t_handshake_complete);
+ wg_timers_run_send_keepalive(t);
+ }
+ rw_wunlock(&t->t_lock);
+}
+
+/*
+ * Should be called after an ephemeral key is created, which is before sending a
+ * handshake response or after receiving a handshake response.
+ *
+ * (Re)arms the zero-key-material callout to fire after
+ * REJECT_AFTER_TIME * 3 seconds, unless timers are disabled.
+ */
+static void
+wg_timers_event_session_derived(struct wg_timers *t)
+{
+ rw_rlock(&t->t_lock);
+ if (!t->t_disabled) {
+ callout_reset(&t->t_zero_key_material,
+ MSEC_2_TICKS(REJECT_AFTER_TIME * 3 * 1000),
+ (timeout_t *)wg_timers_run_zero_key_material, t);
+ }
+ rw_runlock(&t->t_lock);
+}
+
+/*
+ * Request a fresh (non-retry) handshake initiation for this peer, unless
+ * the timers are disabled. The initiation itself is rate limited inside
+ * wg_timers_run_send_initiation().
+ */
+static void
+wg_timers_event_want_initiation(struct wg_timers *t)
+{
+ rw_rlock(&t->t_lock);
+ if (!t->t_disabled)
+ wg_timers_run_send_initiation(t, 0);
+ rw_runlock(&t->t_lock);
+}
+
+/*
+ * Move the last-sent handshake timestamp REKEY_TIMEOUT + 1 seconds into the
+ * past, so that the REKEY_TIMEOUT comparison in
+ * wg_timers_expired_handshake_last_sent() no longer holds back a new
+ * initiation.
+ */
+static void
+wg_timers_event_reset_handshake_last_sent(struct wg_timers *t)
+{
+ rw_wlock(&t->t_lock);
+ t->t_handshake_last_sent.tv_sec -= (REKEY_TIMEOUT + 1);
+ rw_wunlock(&t->t_lock);
+}
+
+/*
+ * Queue the peer's send-initiation task if the last handshake was sent more
+ * than REKEY_TIMEOUT seconds ago. A non-retry request (is_retry == 0) also
+ * resets the retry counter. This function takes no lock itself.
+ */
+static void
+wg_timers_run_send_initiation(struct wg_timers *t, int is_retry)
+{
+ /* The timers are embedded in the peer; recover the enclosing peer. */
+ struct wg_peer *peer = __containerof(t, struct wg_peer, p_timers);
+ if (!is_retry)
+ t->t_handshake_retries = 0;
+ if (wg_timers_expired_handshake_last_sent(t) == ETIMEDOUT)
+ GROUPTASK_ENQUEUE(&peer->p_send_initiation);
+}
+
+/*
+ * Callout handler for t_retry_handshake: the handshake did not complete in
+ * time.
+ *
+ * While the retry budget (MAX_TIMER_HANDSHAKES) is not exhausted, bumps the
+ * retry counter, clears the peer's cached source address and requests
+ * another initiation. Otherwise gives up: stops the keepalive callout,
+ * purges the staged packets and makes sure the zero-key-material callout is
+ * armed.
+ */
+static void
+wg_timers_run_retry_handshake(struct wg_timers *t)
+{
+ struct wg_peer *peer = __containerof(t, struct wg_peer, p_timers);
+
+ rw_wlock(&t->t_lock);
+ if (t->t_handshake_retries <= MAX_TIMER_HANDSHAKES) {
+ t->t_handshake_retries++;
+ rw_wunlock(&t->t_lock);
+
+ /*
+  * NOTE(review): t_handshake_retries is read for the log message
+  * after the lock has been dropped.
+  */
+ DPRINTF(peer->p_sc, "Handshake for peer %llu did not complete "
+ "after %d seconds, retrying (try %d)\n",
+ (unsigned long long)peer->p_id,
+ REKEY_TIMEOUT, t->t_handshake_retries + 1);
+ wg_peer_clear_src(peer);
+ wg_timers_run_send_initiation(t, 1);
+ } else {
+ rw_wunlock(&t->t_lock);
+
+ DPRINTF(peer->p_sc, "Handshake for peer %llu did not complete "
+ "after %d retries, giving up\n",
+ (unsigned long long) peer->p_id, MAX_TIMER_HANDSHAKES + 2);
+
+ callout_stop(&t->t_send_keepalive);
+ wg_queue_purge(&peer->p_stage_queue);
+ /* Do not push back a zeroing that is already scheduled. */
+ if (!callout_pending(&t->t_zero_key_material))
+ callout_reset(&t->t_zero_key_material,
+ MSEC_2_TICKS(REJECT_AFTER_TIME * 3 * 1000),
+ (timeout_t *)wg_timers_run_zero_key_material, t);
+ }
+}
+
+/*
+ * Queue the peer's send-keepalive task. If data arrived while the keepalive
+ * callout was pending (t_need_another_keepalive), clear that flag and re-arm
+ * the callout for another KEEPALIVE_TIMEOUT seconds.
+ *
+ * Used both as the t_send_keepalive callout handler and called directly from
+ * wg_timers_event_handshake_complete().
+ */
+static void
+wg_timers_run_send_keepalive(struct wg_timers *t)
+{
+ struct wg_peer *peer = __containerof(t, struct wg_peer, p_timers);
+
+ GROUPTASK_ENQUEUE(&peer->p_send_keepalive);
+ if (t->t_need_another_keepalive) {
+ t->t_need_another_keepalive = 0;
+ callout_reset(&t->t_send_keepalive,
+ MSEC_2_TICKS(KEEPALIVE_TIMEOUT * 1000),
+ (timeout_t *)wg_timers_run_send_keepalive, t);
+ }
+}
+
+/*
+ * Callout handler for t_new_handshake: data was sent but nothing
+ * authenticated came back in time. Clears the peer's cached source address
+ * and requests a fresh (non-retry) handshake initiation.
+ */
+static void
+wg_timers_run_new_handshake(struct wg_timers *t)
+{
+ struct wg_peer *peer = __containerof(t, struct wg_peer, p_timers);
+
+ DPRINTF(peer->p_sc, "Retrying handshake with peer %llu because we "
+ "stopped hearing back after %d seconds\n",
+ (unsigned long long)peer->p_id, NEW_HANDSHAKE_TIMEOUT);
+ wg_peer_clear_src(peer);
+
+ wg_timers_run_send_initiation(t, 0);
+}
+
+/*
+ * Callout handler for t_zero_key_material: log the event and queue the
+ * peer's clear-secrets task. The clearing itself happens in that task, not
+ * here.
+ */
+static void
+wg_timers_run_zero_key_material(struct wg_timers *t)
+{
+ struct wg_peer *peer = __containerof(t, struct wg_peer, p_timers);
+
+ DPRINTF(peer->p_sc, "Zeroing out all keys for peer %llu, since we "
+ "haven't received a new one in %d seconds\n",
+ (unsigned long long)peer->p_id, REJECT_AFTER_TIME * 3);
+ GROUPTASK_ENQUEUE(&peer->p_clear_secrets);
+}
+
+/*
+ * Queue the peer's send-keepalive task, but only when a persistent
+ * keepalive interval is configured (non-zero).
+ */
+static void
+wg_timers_run_persistent_keepalive(struct wg_timers *t)
+{
+	struct wg_peer *owner;
+
+	if (t->t_persistent_keepalive_interval == 0)
+		return;
+
+	owner = __containerof(t, struct wg_peer, p_timers);
+	GROUPTASK_ENQUEUE(&owner->p_send_keepalive);
+}
+
+/* TODO Handshake */
+/*
+ * Send a raw buffer (a handshake message) to the peer's current endpoint.
+ * Accounts len against the peer's tx byte counter and fires the
+ * "authenticated packet traversal" and "authenticated packet sent" timer
+ * events before the send is attempted.
+ */
+static void
+wg_peer_send_buf(struct wg_peer *peer, uint8_t *buf, size_t len)
+{
+ struct wg_endpoint endpoint;
+
+ counter_u64_add(peer->p_tx_bytes, len);
+ wg_timers_event_any_authenticated_packet_traversal(&peer->p_timers);
+ wg_timers_event_any_authenticated_packet_sent(&peer->p_timers);
+ /* Work on a private copy of the endpoint. */
+ wg_peer_get_endpoint(peer, &endpoint);
+ wg_send_buf(peer->p_sc, &endpoint, buf, len);
+}
+
+/*
+ * Build and send a handshake initiation to the peer.
+ *
+ * Rate limited: returns without sending unless more than REKEY_TIMEOUT
+ * seconds have passed since the last handshake message (the check also
+ * restarts that timestamp). If the noise layer fails to create the
+ * initiation, nothing is sent.
+ */
+static void
+wg_send_initiation(struct wg_peer *peer)
+{
+ struct wg_pkt_initiation pkt;
+ struct epoch_tracker et;
+
+ if (wg_timers_check_handshake_last_sent(&peer->p_timers) != ETIMEDOUT)
+ return;
+ DPRINTF(peer->p_sc, "Sending handshake initiation to peer %llu\n",
+ (unsigned long long)peer->p_id);
+
+ NET_EPOCH_ENTER(et);
+ if (noise_create_initiation(&peer->p_remote, &pkt.s_idx, pkt.ue,
+ pkt.es, pkt.ets) != 0)
+ goto out;
+ pkt.t = WG_PKT_INITIATION;
+ /* The MACs cover every byte of the message that precedes them. */
+ cookie_maker_mac(&peer->p_cookie, &pkt.m, &pkt,
+ sizeof(pkt)-sizeof(pkt.m));
+ wg_peer_send_buf(peer, (uint8_t *)&pkt, sizeof(pkt));
+ /* Arms the retry callout in case no response arrives. */
+ wg_timers_event_handshake_initiated(&peer->p_timers);
+out:
+ NET_EPOCH_EXIT(et);
+}
+
+/*
+ * Build and send a handshake response to the peer and begin the new
+ * session. If creating the response or beginning the session fails,
+ * nothing is sent.
+ */
+static void
+wg_send_response(struct wg_peer *peer)
+{
+ struct wg_pkt_response pkt;
+ struct epoch_tracker et;
+
+ NET_EPOCH_ENTER(et);
+
+ DPRINTF(peer->p_sc, "Sending handshake response to peer %llu\n",
+ (unsigned long long)peer->p_id);
+
+ if (noise_create_response(&peer->p_remote, &pkt.s_idx, &pkt.r_idx,
+ pkt.ue, pkt.en) != 0)
+ goto out;
+ if (noise_remote_begin_session(&peer->p_remote) != 0)
+ goto out;
+
+ /* A session now exists: schedule eventual key zeroing. */
+ wg_timers_event_session_derived(&peer->p_timers);
+ pkt.t = WG_PKT_RESPONSE;
+ /* The MACs cover every byte of the message that precedes them. */
+ cookie_maker_mac(&peer->p_cookie, &pkt.m, &pkt,
+ sizeof(pkt)-sizeof(pkt.m));
+ wg_timers_event_handshake_responded(&peer->p_timers);
+ wg_peer_send_buf(peer, (uint8_t*)&pkt, sizeof(pkt));
+out:
+ NET_EPOCH_EXIT(et);
+}
+
+/*
+ * Send a cookie reply for a handshake message that was refused because the
+ * interface is under load.
+ *
+ * cm  - the MACs of the refused handshake message.
+ * idx - the sender index of that message, echoed back as r_idx.
+ * m   - the refused message; its tag supplies the endpoint to reply to.
+ *
+ * NOTE(review): the result of wg_mbuf_endpoint_get() is dereferenced
+ * without a NULL check; this relies on m already carrying a tag.
+ */
+static void
+wg_send_cookie(struct wg_softc *sc, struct cookie_macs *cm, uint32_t idx,
+ struct mbuf *m)
+{
+ struct wg_pkt_cookie pkt;
+ struct wg_endpoint *e;
+
+ DPRINTF(sc, "Sending cookie response for denied handshake message\n");
+
+ pkt.t = WG_PKT_COOKIE;
+ pkt.r_idx = idx;
+
+ e = wg_mbuf_endpoint_get(m);
+ cookie_checker_create_payload(&sc->sc_cookie, cm, pkt.nonce,
+ pkt.ec, &e->e_remote.r_sa);
+ wg_send_buf(sc, e, (uint8_t *)&pkt, sizeof(pkt));
+}
+
+/*
+ * Send a keepalive to the peer.
+ *
+ * If packets are already staged, flushing them serves as the keepalive.
+ * Otherwise an empty (zero-length) packet is staged first. If the mbuf or
+ * its tag cannot be allocated the keepalive is silently skipped.
+ */
+static void
+wg_send_keepalive(struct wg_peer *peer)
+{
+ struct mbuf *m = NULL;
+ struct wg_tag *t;
+ struct epoch_tracker et;
+
+ if (wg_queue_len(&peer->p_stage_queue) != 0) {
+ /* The send label below expects the epoch to be entered. */
+ NET_EPOCH_ENTER(et);
+ goto send;
+ }
+ if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
+ return;
+ if ((t = wg_tag_get(m)) == NULL) {
+ m_freem(m);
+ return;
+ }
+ t->t_peer = peer;
+ t->t_mbuf = NULL;
+ t->t_done = 0;
+ t->t_mtu = 0; /* MTU == 0 OK for keepalive */
+
+ NET_EPOCH_ENTER(et);
+ wg_queue_stage(peer, m);
+send:
+ wg_queue_out(peer);
+ NET_EPOCH_EXIT(et);
+}
+
+/*
+ * Validate the MACs of a handshake initiation or response held in m.
+ *
+ * Returns the result of cookie_checker_validate_macs() for those two
+ * message types and 0 for every other type (no MACs are checked).
+ */
+static int
+wg_cookie_validate_packet(struct cookie_checker *checker, struct mbuf *m,
+    int under_load)
+{
+	struct cookie_macs *macs;
+	struct wg_endpoint *src;
+	void *msg;
+	int msg_type, covered;
+
+	msg_type = *mtod(m, uint32_t *);
+	msg = m->m_data;
+	src = wg_mbuf_endpoint_get(m);
+
+	/* Locate the MACs and work out how many bytes precede them. */
+	switch (msg_type) {
+	case WG_PKT_INITIATION:
+		macs = &mtod(m, struct wg_pkt_initiation *)->m;
+		covered = sizeof(struct wg_pkt_initiation) - sizeof(*macs);
+		break;
+	case WG_PKT_RESPONSE:
+		macs = &mtod(m, struct wg_pkt_response *)->m;
+		covered = sizeof(struct wg_pkt_response) - sizeof(*macs);
+		break;
+	default:
+		return 0;
+	}
+
+	return (cookie_checker_validate_macs(checker, macs, msg, covered,
+	    under_load, &src->e_remote.r_sa));
+}
+
+
+/*
+ * Process one handshake-class message (initiation, response or cookie
+ * reply) taken from the handshake queue. The mbuf is always freed before
+ * returning.
+ *
+ * The MACs are validated first. Under load a message with a valid mac1 but
+ * no valid cookie is answered with a cookie reply instead of being
+ * processed. Messages that fail validation are dropped with a debug message.
+ */
+static void
+wg_handshake(struct wg_softc *sc, struct mbuf *m)
+{
+	struct wg_pkt_initiation *init;
+	struct wg_pkt_response *resp;
+	struct noise_remote *remote;
+	struct wg_pkt_cookie *cook;
+	struct wg_peer *peer;
+	struct wg_tag *t;
+
+	/* This is global, so that our load calculation applies to the whole
+	 * system. We don't care about races with it at all.
+	 */
+	static struct timeval wg_last_underload;
+	static const struct timeval underload_interval = { UNDERLOAD_TIMEOUT, 0 };
+	bool packet_needs_cookie = false;
+	int underload, res;
+
+	/*
+	 * We count as under load while the handshake queue is at least 1/8
+	 * full, and for UNDERLOAD_TIMEOUT seconds after it last was.
+	 */
+	underload = mbufq_len(&sc->sc_handshake_queue) >=
+	    MAX_QUEUED_HANDSHAKES / 8;
+	if (underload)
+		getmicrouptime(&wg_last_underload);
+	else if (wg_last_underload.tv_sec != 0) {
+		if (!ratecheck(&wg_last_underload, &underload_interval))
+			underload = 1;
+		else
+			bzero(&wg_last_underload, sizeof(wg_last_underload));
+	}
+
+	res = wg_cookie_validate_packet(&sc->sc_cookie, m, underload);
+
+	/*
+	 * Failures are reported through DPRINTF only: these messages come
+	 * from the network, so an unconditional console printf here would
+	 * let remote hosts flood the log.
+	 */
+	if (res == EINVAL) {
+		DPRINTF(sc, "Invalid initiation MAC\n");
+		goto free;
+	} else if (res == ECONNREFUSED) {
+		DPRINTF(sc, "Handshake ratelimited\n");
+		goto free;
+	} else if (res == EAGAIN) {
+		packet_needs_cookie = true;
+	} else if (res != 0) {
+		DPRINTF(sc, "Unexpected handshake ratelimit response: %d\n", res);
+		goto free;
+	}
+
+	t = wg_tag_get(m);
+	switch (*mtod(m, uint32_t *)) {
+	case WG_PKT_INITIATION:
+		init = mtod(m, struct wg_pkt_initiation *);
+
+		if (packet_needs_cookie) {
+			wg_send_cookie(sc, &init->m, init->s_idx, m);
+			goto free;
+		}
+		if (noise_consume_initiation(&sc->sc_local, &remote,
+		    init->s_idx, init->ue, init->es, init->ets) != 0) {
+			DPRINTF(sc, "Invalid handshake initiation\n");
+			goto free;
+		}
+
+		peer = __containerof(remote, struct wg_peer, p_remote);
+		DPRINTF(sc, "Receiving handshake initiation from peer %llu\n",
+		    (unsigned long long)peer->p_id);
+		counter_u64_add(peer->p_rx_bytes, sizeof(*init));
+		if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1);
+		if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, sizeof(*init));
+		/* Reply to wherever this initiation came from. */
+		wg_peer_set_endpoint_from_tag(peer, t);
+		wg_send_response(peer);
+		break;
+	case WG_PKT_RESPONSE:
+		resp = mtod(m, struct wg_pkt_response *);
+
+		if (packet_needs_cookie) {
+			wg_send_cookie(sc, &resp->m, resp->s_idx, m);
+			goto free;
+		}
+
+		if ((remote = wg_index_get(sc, resp->r_idx)) == NULL) {
+			DPRINTF(sc, "Unknown handshake response\n");
+			goto free;
+		}
+		peer = __containerof(remote, struct wg_peer, p_remote);
+		if (noise_consume_response(remote, resp->s_idx, resp->r_idx,
+		    resp->ue, resp->en) != 0) {
+			DPRINTF(sc, "Invalid handshake response\n");
+			goto free;
+		}
+
+		DPRINTF(sc, "Receiving handshake response from peer %llu\n",
+		    (unsigned long long)peer->p_id);
+		counter_u64_add(peer->p_rx_bytes, sizeof(*resp));
+		if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1);
+		if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, sizeof(*resp));
+		wg_peer_set_endpoint_from_tag(peer, t);
+		if (noise_remote_begin_session(&peer->p_remote) == 0) {
+			wg_timers_event_session_derived(&peer->p_timers);
+			wg_timers_event_handshake_complete(&peer->p_timers);
+		}
+		break;
+	case WG_PKT_COOKIE:
+		cook = mtod(m, struct wg_pkt_cookie *);
+
+		if ((remote = wg_index_get(sc, cook->r_idx)) == NULL) {
+			DPRINTF(sc, "Unknown cookie index\n");
+			goto free;
+		}
+
+		peer = __containerof(remote, struct wg_peer, p_remote);
+
+		if (cookie_maker_consume_payload(&peer->p_cookie,
+		    cook->nonce, cook->ec) != 0) {
+			DPRINTF(sc, "Could not decrypt cookie response\n");
+			goto free;
+		}
+
+		DPRINTF(sc, "Receiving cookie response\n");
+		/* Cookie replies do not trigger the timer events below. */
+		goto free;
+	default:
+		goto free;
+	}
+	/* Only a successfully consumed initiation or response reaches here. */
+	MPASS(peer != NULL);
+	wg_timers_event_any_authenticated_packet_received(&peer->p_timers);
+	wg_timers_event_any_authenticated_packet_traversal(&peer->p_timers);
+
+free:
+	m_freem(m);
+}
+
+/*
+ * Drain the interface's handshake queue, handing every queued message to
+ * wg_handshake().
+ */
+static void
+wg_softc_handshake_receive(struct wg_softc *sc)
+{
+	struct mbuf *pkt;
+
+	for (;;) {
+		pkt = mbufq_dequeue(&sc->sc_handshake_queue);
+		if (pkt == NULL)
+			break;
+		wg_handshake(sc, pkt);
+	}
+}
+
+/* TODO Encrypt */
+/*
+ * Encrypt one outgoing packet.
+ *
+ * The plaintext in m is copied into a newly allocated mbuf, padded with
+ * zeroes, and encrypted in place behind a data-message header. On success
+ * the new mbuf is stored in the tag's t_mbuf. On any failure t_mbuf is left
+ * as it was. In both cases the tag is marked done and the peer's send task
+ * is queued.
+ *
+ * Must be called inside a net epoch section.
+ */
+static void
+wg_encap(struct wg_softc *sc, struct mbuf *m)
+{
+ struct wg_pkt_data *data;
+ size_t padding_len, plaintext_len, out_len;
+ struct mbuf *mc;
+ struct wg_peer *peer;
+ struct wg_tag *t;
+ uint64_t nonce;
+ int res, allocation_order;
+
+ NET_EPOCH_ASSERT();
+ t = wg_tag_get(m);
+ peer = t->t_peer;
+
+ /*
+  * Pad the plaintext, but never beyond the MTU recorded in the tag.
+  * NOTE(review): if m->m_pkthdr.len were larger than t->t_mtu the
+  * subtraction below would wrap (size_t); confirm callers prevent that.
+  */
+ plaintext_len = MIN(WG_PKT_WITH_PADDING(m->m_pkthdr.len), t->t_mtu);
+ padding_len = plaintext_len - m->m_pkthdr.len;
+ out_len = sizeof(struct wg_pkt_data) + plaintext_len + NOISE_AUTHTAG_LEN;
+
+ /* Pick the smallest cluster size that holds the whole message. */
+ if (out_len <= MCLBYTES)
+ allocation_order = MCLBYTES;
+ else if (out_len <= MJUMPAGESIZE)
+ allocation_order = MJUMPAGESIZE;
+ else if (out_len <= MJUM9BYTES)
+ allocation_order = MJUM9BYTES;
+ else if (out_len <= MJUM16BYTES)
+ allocation_order = MJUM16BYTES;
+ else
+ goto error;
+
+ if ((mc = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, allocation_order)) == NULL)
+ goto error;
+
+ data = mtod(mc, struct wg_pkt_data *);
+ m_copydata(m, 0, m->m_pkthdr.len, data->buf);
+ bzero(data->buf + m->m_pkthdr.len, padding_len);
+
+ data->t = WG_PKT_DATA;
+
+ res = noise_remote_encrypt(&peer->p_remote, &data->r_idx, &nonce,
+ data->buf, plaintext_len);
+ nonce = htole64(nonce); /* Wire format is little endian. */
+ memcpy(data->nonce, &nonce, sizeof(data->nonce));
+
+ if (__predict_false(res)) {
+ if (res == EINVAL) {
+ /* Not encrypted: ask for a handshake and drop. */
+ wg_timers_event_want_initiation(&peer->p_timers);
+ m_freem(mc);
+ goto error;
+ } else if (res == ESTALE) {
+ /* Still sent, but a new handshake is requested. */
+ wg_timers_event_want_initiation(&peer->p_timers);
+ } else {
+ m_freem(mc);
+ goto error;
+ }
+ }
+
+ /* A packet with length 0 is a keepalive packet */
+ if (m->m_pkthdr.len == 0)
+ DPRINTF(sc, "Sending keepalive packet to peer %llu\n",
+ (unsigned long long)peer->p_id);
+ /*
+ * Set the correct output value here since it will be copied
+ * when we move the pkthdr in send.
+ */
+ mc->m_len = mc->m_pkthdr.len = out_len;
+ mc->m_flags &= ~(M_MCAST | M_BCAST);
+
+ t->t_mbuf = mc;
+ error:
+ /* XXX membar ? */
+ /* Reached on success too: mark done so wg_deliver_out() can dequeue. */
+ t->t_done = 1;
+ GROUPTASK_ENQUEUE(&peer->p_send);
+}
+
+/*
+ * Decrypt one incoming data message in place.
+ *
+ * On success the header and authentication tag are stripped and m itself is
+ * stored in the tag's t_mbuf; keepalives (empty payload) are accepted as is.
+ * Packets that fail decryption, are neither IPv4 nor IPv6, or whose inner
+ * source address is not routed to the sending peer leave t_mbuf untouched.
+ * In every case the tag is marked done and the peer's receive task is
+ * queued.
+ *
+ * Must be called inside a net epoch section.
+ */
+static void
+wg_decap(struct wg_softc *sc, struct mbuf *m)
+{
+ struct wg_pkt_data *data;
+ struct wg_peer *peer, *routed_peer;
+ struct wg_tag *t;
+ size_t plaintext_len;
+ uint8_t version;
+ uint64_t nonce;
+ int res;
+
+ NET_EPOCH_ASSERT();
+ data = mtod(m, struct wg_pkt_data *);
+ /* Everything after the data header (still includes the auth tag). */
+ plaintext_len = m->m_pkthdr.len - sizeof(struct wg_pkt_data);
+
+ t = wg_tag_get(m);
+ peer = t->t_peer;
+
+ memcpy(&nonce, data->nonce, sizeof(nonce));
+ nonce = le64toh(nonce); /* Wire format is little endian. */
+
+ res = noise_remote_decrypt(&peer->p_remote, data->r_idx, nonce,
+ data->buf, plaintext_len);
+
+ if (__predict_false(res)) {
+ if (res == EINVAL) {
+ goto error;
+ } else if (res == ECONNRESET) {
+ /* Treated as handshake completion; the packet is kept. */
+ wg_timers_event_handshake_complete(&peer->p_timers);
+ } else if (res == ESTALE) {
+ /* Packet is kept, but a new handshake is requested. */
+ wg_timers_event_want_initiation(&peer->p_timers);
+ } else {
+ panic("unexpected response: %d\n", res);
+ }
+ }
+ /* The packet authenticated: remember where the peer sent it from. */
+ wg_peer_set_endpoint_from_tag(peer, t);
+
+ /* Remove the data header, and crypto mac tail from the packet */
+ m_adj(m, sizeof(struct wg_pkt_data));
+ m_adj(m, -NOISE_AUTHTAG_LEN);
+
+ /* A packet with length 0 is a keepalive packet */
+ if (m->m_pkthdr.len == 0) {
+ DPRINTF(peer->p_sc, "Receiving keepalive packet from peer "
+ "%llu\n", (unsigned long long)peer->p_id);
+ goto done;
+ }
+
+ version = mtod(m, struct ip *)->ip_v;
+ if (!((version == 4 && m->m_pkthdr.len >= sizeof(struct ip)) ||
+ (version == 6 && m->m_pkthdr.len >= sizeof(struct ip6_hdr)))) {
+ DPRINTF(peer->p_sc, "Packet is neither ipv4 nor ipv6 from peer "
+ "%llu\n", (unsigned long long)peer->p_id);
+ goto error;
+ }
+
+ /* The inner source address must be routed to the sending peer. */
+ routed_peer = wg_aip_lookup(&peer->p_sc->sc_aips, m, IN);
+ if (routed_peer != peer) {
+ DPRINTF(peer->p_sc, "Packet has unallowed src IP from peer "
+ "%llu\n", (unsigned long long)peer->p_id);
+ goto error;
+ }
+
+done:
+ t->t_mbuf = m;
+error:
+ /* Reached on success too: mark done so wg_deliver_in() can dequeue. */
+ t->t_done = 1;
+ GROUPTASK_ENQUEUE(&peer->p_recv);
+}
+
+/*
+ * Decrypt worker: inside one net epoch section, pull packets off the shared
+ * decap ring until it is empty and run wg_decap() on each.
+ */
+static void
+wg_softc_decrypt(struct wg_softc *sc)
+{
+	struct epoch_tracker et;
+	struct mbuf *pkt;
+
+	NET_EPOCH_ENTER(et);
+	for (;;) {
+		pkt = buf_ring_dequeue_mc(sc->sc_decap_ring);
+		if (pkt == NULL)
+			break;
+		wg_decap(sc, pkt);
+	}
+	NET_EPOCH_EXIT(et);
+}
+
+/*
+ * Encrypt worker: inside one net epoch section, pull packets off the shared
+ * encap ring until it is empty and run wg_encap() on each.
+ */
+static void
+wg_softc_encrypt(struct wg_softc *sc)
+{
+	struct epoch_tracker et;
+	struct mbuf *pkt;
+
+	NET_EPOCH_ENTER(et);
+	for (;;) {
+		pkt = buf_ring_dequeue_mc(sc->sc_encap_ring);
+		if (pkt == NULL)
+			break;
+		wg_encap(sc, pkt);
+	}
+	NET_EPOCH_EXIT(et);
+}
+
+/*
+ * Queue the encrypt task of every CPU (one per mp_ncpus) whose task is not
+ * already flagged as enqueued.
+ */
+static void
+wg_encrypt_dispatch(struct wg_softc *sc)
+{
+	int cpu;
+
+	for (cpu = 0; cpu < mp_ncpus; cpu++) {
+		if ((sc->sc_encrypt[cpu].gt_task.ta_flags & TASK_ENQUEUED) == 0)
+			GROUPTASK_ENQUEUE(&sc->sc_encrypt[cpu]);
+	}
+}
+
+/*
+ * Queue the decrypt task of every CPU (one per mp_ncpus) whose task is not
+ * already flagged as enqueued.
+ */
+static void
+wg_decrypt_dispatch(struct wg_softc *sc)
+{
+	int cpu;
+
+	for (cpu = 0; cpu < mp_ncpus; cpu++) {
+		if ((sc->sc_decrypt[cpu].gt_task.ta_flags & TASK_ENQUEUED) == 0)
+			GROUPTASK_ENQUEUE(&sc->sc_decrypt[cpu]);
+	}
+}
+
+/*
+ * Send task for a peer: transmit, in order, every packet at the head of the
+ * peer's encap queue whose encryption has finished.
+ *
+ * Does nothing while the interface link state is down. Packets whose
+ * encryption failed (t_mbuf == NULL) are counted as output errors and
+ * dropped. The original plaintext mbuf is always freed here; the encrypted
+ * mbuf is consumed by wg_send().
+ */
+static void
+wg_deliver_out(struct wg_peer *peer)
+{
+ struct epoch_tracker et;
+ struct wg_tag *t;
+ struct mbuf *m;
+ struct wg_endpoint endpoint;
+ size_t len;
+ int ret;
+
+ NET_EPOCH_ENTER(et);
+ if (peer->p_sc->sc_ifp->if_link_state == LINK_STATE_DOWN)
+ goto done;
+
+ wg_peer_get_endpoint(peer, &endpoint);
+
+ while ((m = wg_queue_dequeue(&peer->p_encap_queue, &t)) != NULL) {
+ /* t_mbuf will contain the encrypted packet */
+ if (t->t_mbuf == NULL) {
+ if_inc_counter(peer->p_sc->sc_ifp, IFCOUNTER_OERRORS, 1);
+ m_freem(m);
+ continue;
+ }
+ /* Read the length now; the mbuf is gone once wg_send() returns. */
+ len = t->t_mbuf->m_pkthdr.len;
+ ret = wg_send(peer->p_sc, &endpoint, t->t_mbuf);
+
+ if (ret == 0) {
+ wg_timers_event_any_authenticated_packet_traversal(
+ &peer->p_timers);
+ wg_timers_event_any_authenticated_packet_sent(
+ &peer->p_timers);
+
+ /* Keepalives (empty plaintext) do not count as data. */
+ if (m->m_pkthdr.len != 0)
+ wg_timers_event_data_sent(&peer->p_timers);
+ counter_u64_add(peer->p_tx_bytes, len);
+ } else if (ret == EADDRNOTAVAIL) {
+ /*
+  * Forget the cached source address and refresh our copy of
+  * the endpoint for the packets that follow. This packet is
+  * not resent.
+  */
+ wg_peer_clear_src(peer);
+ wg_peer_get_endpoint(peer, &endpoint);
+ }
+ m_freem(m);
+ }
+done:
+ NET_EPOCH_EXIT(et);
+}
+
+/*
+ * Receive task for a peer: hand, in order, every packet at the head of the
+ * peer's decap queue whose decryption has finished to the IP stack.
+ *
+ * Packets whose decryption or validation failed (t_mbuf == NULL) are
+ * counted as input errors and dropped. Keepalives (empty payload) update
+ * the timers and counters and are then freed.
+ */
+static void
+wg_deliver_in(struct wg_peer *peer)
+{
+ struct mbuf *m;
+ struct ifnet *ifp;
+ struct wg_softc *sc;
+ struct epoch_tracker et;
+ struct wg_tag *t;
+ uint32_t af;
+ int version;
+
+ NET_EPOCH_ENTER(et);
+ sc = peer->p_sc;
+ ifp = sc->sc_ifp;
+
+ while ((m = wg_queue_dequeue(&peer->p_decap_queue, &t)) != NULL) {
+ /* t_mbuf will contain the decrypted packet */
+ if (t->t_mbuf == NULL) {
+ if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
+ m_freem(m);
+ continue;
+ }
+ /* wg_decap() decrypts in place, so both refer to the same mbuf. */
+ MPASS(m == t->t_mbuf);
+
+ wg_timers_event_any_authenticated_packet_received(
+ &peer->p_timers);
+ wg_timers_event_any_authenticated_packet_traversal(
+ &peer->p_timers);
+
+ /* Account for the on-wire size: payload + header + auth tag. */
+ counter_u64_add(peer->p_rx_bytes, m->m_pkthdr.len + sizeof(struct wg_pkt_data) + NOISE_AUTHTAG_LEN);
+ if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1);
+ if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len + sizeof(struct wg_pkt_data) + NOISE_AUTHTAG_LEN);
+
+ /* Keepalive: nothing to deliver. */
+ if (m->m_pkthdr.len == 0) {
+ m_freem(m);
+ continue;
+ }
+
+ m->m_flags &= ~(M_MCAST | M_BCAST);
+ m->m_pkthdr.rcvif = ifp;
+ version = mtod(m, struct ip *)->ip_v;
+ if (version == IPVERSION) {
+ af = AF_INET;
+ BPF_MTAP2(ifp, &af, sizeof(af), m);
+ CURVNET_SET(ifp->if_vnet);
+ ip_input(m);
+ CURVNET_RESTORE();
+ } else if (version == 6) {
+ af = AF_INET6;
+ BPF_MTAP2(ifp, &af, sizeof(af), m);
+ CURVNET_SET(ifp->if_vnet);
+ ip6_input(m);
+ CURVNET_RESTORE();
+ } else
+ m_freem(m);
+
+ /* m must not be touched past this point. */
+ wg_timers_event_data_received(&peer->p_timers);
+ }
+ NET_EPOCH_EXIT(et);
+}
+
+/*
+ * Queue a received data message for decryption.
+ *
+ * The mbuf is placed on the peer's serial decap queue (which preserves
+ * delivery order) and on the interface-wide parallel decap ring (from which
+ * the decrypt workers pull). m must already carry a WireGuard tag.
+ *
+ * Returns 0 on success. If the serial queue is full the packet is freed,
+ * counted as an input queue drop, and ENOBUFS is returned. If only the
+ * parallel ring is full, the packet stays on the serial queue marked as
+ * done so that it is discarded in order, and the ring's error is returned.
+ */
+static int
+wg_queue_in(struct wg_peer *peer, struct mbuf *m)
+{
+	struct buf_ring *parallel = peer->p_sc->sc_decap_ring;
+	struct wg_queue *serial = &peer->p_decap_queue;
+	struct wg_tag *t;
+	int rc;
+
+	MPASS(wg_tag_get(m) != NULL);
+
+	mtx_lock(&serial->q_mtx);
+	if ((rc = mbufq_enqueue(&serial->q, m)) == ENOBUFS) {
+		m_freem(m);
+		/* This is the receive path, so account an *input* drop. */
+		if_inc_counter(peer->p_sc->sc_ifp, IFCOUNTER_IQDROPS, 1);
+	} else {
+		m->m_flags |= M_ENQUEUED;
+		rc = buf_ring_enqueue(parallel, m);
+		if (rc == ENOBUFS) {
+			/* No worker will see it: let the receive task drop it. */
+			t = wg_tag_get(m);
+			t->t_done = 1;
+		}
+	}
+	mtx_unlock(&serial->q_mtx);
+	return (rc);
+}
+
+/*
+ * Append m to the peer's staging queue. While the queue is full afterwards,
+ * the oldest staged packets are dropped and counted as output queue drops.
+ */
+static void
+wg_queue_stage(struct wg_peer *peer, struct mbuf *m)
+{
+	struct wg_queue *stage = &peer->p_stage_queue;
+	struct mbuf *oldest;
+
+	mtx_lock(&stage->q_mtx);
+
+	/* Unconditional append; the length is maintained by hand. */
+	STAILQ_INSERT_TAIL(&stage->q.mq_head, m, m_stailqpkt);
+	stage->q.mq_len++;
+
+	while (mbufq_full(&stage->q)) {
+		oldest = mbufq_dequeue(&stage->q);
+		if (oldest == NULL)
+			continue;
+		m_freem(oldest);
+		if_inc_counter(peer->p_sc->sc_ifp, IFCOUNTER_OQDROPS, 1);
+	}
+
+	mtx_unlock(&stage->q_mtx);
+}
+
+/*
+ * Move every staged packet of the peer into the encryption pipeline.
+ *
+ * If the peer has no usable session yet, the packets stay staged and, when
+ * there are any, a handshake initiation is requested instead. Otherwise
+ * each staged packet is put on the peer's serial encap queue (preserving
+ * order) and on the interface-wide parallel encap ring, and the encrypt
+ * workers are kicked.
+ */
+static void
+wg_queue_out(struct wg_peer *peer)
+{
+ struct buf_ring *parallel = peer->p_sc->sc_encap_ring;
+ struct wg_queue *serial = &peer->p_encap_queue;
+ struct wg_tag *t;
+ struct mbufq staged;
+ struct mbuf *m;
+
+ if (noise_remote_ready(&peer->p_remote) != 0) {
+ if (wg_queue_len(&peer->p_stage_queue))
+ wg_timers_event_want_initiation(&peer->p_timers);
+ return;
+ }
+
+ /* We first "steal" the staged queue to a local queue, so that we can do these
+ * remaining operations without having to hold the staged queue mutex. */
+ STAILQ_INIT(&staged.mq_head);
+ mtx_lock(&peer->p_stage_queue.q_mtx);
+ STAILQ_SWAP(&staged.mq_head, &peer->p_stage_queue.q.mq_head, mbuf);
+ staged.mq_len = peer->p_stage_queue.q.mq_len;
+ peer->p_stage_queue.q.mq_len = 0;
+ staged.mq_maxlen = peer->p_stage_queue.q.mq_maxlen;
+ mtx_unlock(&peer->p_stage_queue.q_mtx);
+
+ while ((m = mbufq_dequeue(&staged)) != NULL) {
+ if ((t = wg_tag_get(m)) == NULL) {
+ m_freem(m);
+ continue;
+ }
+ t->t_peer = peer;
+ mtx_lock(&serial->q_mtx);
+ if (mbufq_enqueue(&serial->q, m) != 0) {
+ /* Serial queue full: drop. */
+ m_freem(m);
+ if_inc_counter(peer->p_sc->sc_ifp, IFCOUNTER_OQDROPS, 1);
+ } else {
+ m->m_flags |= M_ENQUEUED;
+ if (buf_ring_enqueue(parallel, m)) {
+ /*
+  * Ring full: no worker will encrypt it. Marking it done
+  * lets wg_deliver_out() discard it in order.
+  */
+ t = wg_tag_get(m);
+ t->t_done = 1;
+ }
+ }
+ mtx_unlock(&serial->q_mtx);
+ }
+ wg_encrypt_dispatch(peer->p_sc);
+}
+
+/*
+ * Remove and return the packet at the head of serial queue q, but only if
+ * its tag is marked done; otherwise return NULL and leave the queue alone.
+ * Because only the head is ever examined, packets leave in the order they
+ * were queued.
+ *
+ * *t is set to the head packet's tag whenever the queue is non-empty, even
+ * if NULL is returned; it is left unmodified when the queue is empty.
+ */
+static struct mbuf *
+wg_queue_dequeue(struct wg_queue *q, struct wg_tag **t)
+{
+ struct mbuf *m_, *m;
+
+ m = NULL;
+ mtx_lock(&q->q_mtx);
+ m_ = mbufq_first(&q->q);
+ if (m_ != NULL && (*t = wg_tag_get(m_))->t_done) {
+ m = mbufq_dequeue(&q->q);
+ m->m_flags &= ~M_ENQUEUED;
+ }
+ mtx_unlock(&q->q_mtx);
+ return (m);
+}
+
+/*
+ * Return the number of packets currently on q. The queue mutex is not
+ * taken, so the value is only a snapshot.
+ */
+static int
+wg_queue_len(struct wg_queue *q)
+{
+	int count;
+
+	/* This access races. We might consider adding locking here. */
+	count = mbufq_len(&q->q);
+	return (count);
+}
+
+/*
+ * Prepare queue q for use: an empty mbuf queue limited to MAX_QUEUED_PKT
+ * entries, protected by a default mutex called name.
+ */
+static void
+wg_queue_init(struct wg_queue *q, const char *name)
+{
+	mbufq_init(&q->q, MAX_QUEUED_PKT);
+	mtx_init(&q->q_mtx, name, NULL, MTX_DEF);
+}
+
+/*
+ * Tear down queue q: free any packets still queued, then destroy its mutex.
+ */
+static void
+wg_queue_deinit(struct wg_queue *q)
+{
+ /* Purge first: wg_queue_purge() takes the mutex destroyed below. */
+ wg_queue_purge(q);
+ mtx_destroy(&q->q_mtx);
+}
+
+/*
+ * Drain every packet queued on q while holding the queue mutex.
+ */
+static void
+wg_queue_purge(struct wg_queue *q)
+{
+ mtx_lock(&q->q_mtx);
+ mbufq_drain(&q->q);
+ mtx_unlock(&q->q_mtx);
+}
+
+/* TODO Indexes */
+/*
+ * Look up the peer with the given public key and return its noise remote,
+ * or NULL when no such peer exists.
+ */
+static struct noise_remote *
+wg_remote_get(struct wg_softc *sc, uint8_t public[NOISE_PUBLIC_KEY_LEN])
+{
+	struct wg_peer *found = wg_peer_lookup(sc, public);
+
+	return (found != NULL ? &found->p_remote : NULL);
+}
+
+/*
+ * Allocate a fresh random 32-bit session index for remote, register it in
+ * the interface's index hash table, and return it.
+ *
+ * The index entry is taken from the peer's list of unused entries; the list
+ * is asserted (MPASS) to be non-empty.
+ */
+static uint32_t
+wg_index_set(struct wg_softc *sc, struct noise_remote *remote)
+{
+ struct wg_index *index, *iter;
+ struct wg_peer *peer;
+ uint32_t key;
+
+ /* We can modify this without a lock as wg_index_set, wg_index_drop are
+ * guaranteed to be serialised (per remote). */
+ peer = __containerof(remote, struct wg_peer, p_remote);
+ index = SLIST_FIRST(&peer->p_unused_index);
+ MPASS(index != NULL);
+ SLIST_REMOVE_HEAD(&peer->p_unused_index, i_unused_entry);
+
+ index->i_value = remote;
+
+ rw_wlock(&sc->sc_index_lock);
+assign_id:
+ /* Draw a random key; the masked value selects the hash bucket. */
+ key = index->i_key = arc4random();
+ key &= sc->sc_index_mask;
+ /* Draw again if that exact key is already present in the bucket. */
+ LIST_FOREACH(iter, &sc->sc_index[key], i_entry)
+ if (iter->i_key == index->i_key)
+ goto assign_id;
+
+ LIST_INSERT_HEAD(&sc->sc_index[key], index, i_entry);
+
+ rw_wunlock(&sc->sc_index_lock);
+
+ /* Likewise, no need to lock for index here. */
+ return index->i_key;
+}
+
+/*
+ * Look up session index key0 in the interface's index hash table and return
+ * the noise remote registered for it, or NULL if the index is unknown.
+ */
+static struct noise_remote *
+wg_index_get(struct wg_softc *sc, uint32_t key0)
+{
+	struct noise_remote *found = NULL;
+	struct wg_index *entry;
+	uint32_t bucket;
+
+	/* The masked key selects the bucket; the full key must match. */
+	bucket = key0 & sc->sc_index_mask;
+
+	rw_enter_read(&sc->sc_index_lock);
+	LIST_FOREACH(entry, &sc->sc_index[bucket], i_entry) {
+		if (entry->i_key != key0)
+			continue;
+		found = entry->i_value;
+		break;
+	}
+	rw_exit_read(&sc->sc_index_lock);
+
+	return found;
+}
+
+/*
+ * Remove session index key0 from the interface's index hash table and
+ * return its entry to the owning peer's list of unused entries. Unknown
+ * indexes are ignored.
+ */
+static void
+wg_index_drop(struct wg_softc *sc, uint32_t key0)
+{
+ struct wg_index *iter;
+ struct wg_peer *peer = NULL;
+ uint32_t key = key0 & sc->sc_index_mask;
+
+ rw_enter_write(&sc->sc_index_lock);
+ LIST_FOREACH(iter, &sc->sc_index[key], i_entry)
+ if (iter->i_key == key0) {
+ LIST_REMOVE(iter, i_entry);
+ break;
+ }
+ rw_exit_write(&sc->sc_index_lock);
+
+ /* iter is NULL here when the loop ran to the end without a match. */
+ if (iter == NULL)
+ return;
+
+ /* We expect a peer */
+ peer = __containerof(iter->i_value, struct wg_peer, p_remote);
+ MPASS(peer != NULL);
+ SLIST_INSERT_HEAD(&peer->p_unused_index, iter, i_unused_entry);
+}
+
+/*
+ * Fill endpoint e from the address pair handed to the UDP input hook.
+ *
+ * srcsa points at a 2-element sockaddr array: element 0 is the packet's
+ * source address/port (stored as the remote endpoint), element 1 its
+ * destination (stored as our local address). rcvif is not used.
+ *
+ * Returns 0 on success, EAFNOSUPPORT for any family other than
+ * AF_INET/AF_INET6 (e is then left untouched).
+ */
+static int
+wg_update_endpoint_addrs(struct wg_endpoint *e, const struct sockaddr *srcsa,
+    struct ifnet *rcvif)
+{
+	const struct sockaddr_in *pair4;
+	const struct sockaddr_in6 *pair6;
+
+	switch (srcsa->sa_family) {
+	case AF_INET:
+		pair4 = (const struct sockaddr_in *)srcsa;
+		e->e_remote.r_sin = pair4[0];
+		e->e_local.l_in = pair4[1].sin_addr;
+		return (0);
+	case AF_INET6:
+		pair6 = (const struct sockaddr_in6 *)srcsa;
+		e->e_remote.r_sin6 = pair6[0];
+		e->e_local.l_in6 = pair6[1].sin6_addr;
+		return (0);
+	default:
+		return (EAFNOSUPPORT);
+	}
+}
+
+/*
+ * UDP input hook: classify a received WireGuard message and queue it.
+ *
+ * m0     - the received packet; offset + sizeof(struct udphdr) bytes are
+ *          trimmed from its front before the WireGuard message is read.
+ * srcsa  - source/destination address pair (see wg_update_endpoint_addrs()).
+ * _sc    - the interface softc.
+ *
+ * Handshake-class messages of exactly the right size go onto the handshake
+ * queue; data messages for a known session index go into the decryption
+ * pipeline. Everything else is freed. The mbuf is consumed on every path.
+ */
+static void
+wg_input(struct mbuf *m0, int offset, struct inpcb *inpcb,
+ const struct sockaddr *srcsa, void *_sc)
+{
+ struct wg_pkt_data *pkt_data;
+ struct wg_endpoint *e;
+ struct wg_softc *sc = _sc;
+ struct mbuf *m;
+ int pktlen, pkttype;
+ struct noise_remote *remote;
+ struct wg_tag *t;
+ void *data;
+
+ /* Caller provided us with srcsa, no need for this header. */
+ m_adj(m0, offset + sizeof(struct udphdr));
+
+ /*
+ * Ensure mbuf has at least enough contiguous data to peel off our
+ * headers at the beginning.
+ */
+ if ((m = m_defrag(m0, M_NOWAIT)) == NULL) {
+ m_freem(m0);
+ return;
+ }
+ data = mtod(m, void *);
+ /*
+  * NOTE(review): the type word is read before any length check. It is
+  * only acted on below together with a length test, but confirm that
+  * reading it from a shorter packet is harmless.
+  */
+ pkttype = *(uint32_t*)data;
+ t = wg_tag_get(m);
+ if (t == NULL) {
+ goto free;
+ }
+ e = wg_mbuf_endpoint_get(m);
+
+ /* Record where the packet came from in the mbuf's tag. */
+ if (wg_update_endpoint_addrs(e, srcsa, m->m_pkthdr.rcvif)) {
+ goto free;
+ }
+
+ pktlen = m->m_pkthdr.len;
+
+ /* Handshake-class messages must have exactly the expected size. */
+ if ((pktlen == sizeof(struct wg_pkt_initiation) &&
+ pkttype == WG_PKT_INITIATION) ||
+ (pktlen == sizeof(struct wg_pkt_response) &&
+ pkttype == WG_PKT_RESPONSE) ||
+ (pktlen == sizeof(struct wg_pkt_cookie) &&
+ pkttype == WG_PKT_COOKIE)) {
+ if (mbufq_enqueue(&sc->sc_handshake_queue, m) == 0) {
+ GROUPTASK_ENQUEUE(&sc->sc_handshake);
+ } else {
+ DPRINTF(sc, "Dropping handshake packet\n");
+ m_freem(m);
+ }
+ } else if (pktlen >= sizeof(struct wg_pkt_data) + NOISE_AUTHTAG_LEN
+ && pkttype == WG_PKT_DATA) {
+
+ pkt_data = data;
+ remote = wg_index_get(sc, pkt_data->r_idx);
+ if (remote == NULL) {
+ /* No session with this receiver index. */
+ if_inc_counter(sc->sc_ifp, IFCOUNTER_IERRORS, 1);
+ m_freem(m);
+ } else if (buf_ring_count(sc->sc_decap_ring) > MAX_QUEUED_PKT) {
+ /* Decryption backlog too long: shed load. */
+ if_inc_counter(sc->sc_ifp, IFCOUNTER_IQDROPS, 1);
+ m_freem(m);
+ } else {
+ t->t_peer = __containerof(remote, struct wg_peer,
+ p_remote);
+ t->t_mbuf = NULL;
+ t->t_done = 0;
+
+ /* wg_queue_in() frees m itself if its queue is full. */
+ wg_queue_in(t->t_peer, m);
+ wg_decrypt_dispatch(sc);
+ }
+ } else {
+ /* Also the target of the early error paths above. */
+free:
+ m_freem(m);
+ }
+}
+
+/*
+ * if_transmit method: pick the peer for an outgoing packet and stage it for
+ * encryption.
+ *
+ * The address family stored in m_pkthdr.ph_family is passed to BPF together
+ * with the packet.  The peer is chosen by wg_aip_lookup() inside a network
+ * epoch section.
+ *
+ * The mbuf chain is consumed on every path.  Returns 0 when the packet was
+ * queued, otherwise:
+ *   ENXIO        - the interface is marked IFF_DYING,
+ *   ENOBUFS      - no wg_tag could be obtained for the mbuf,
+ *   ENOKEY       - wg_aip_lookup() found no peer for the packet,
+ *   EHOSTUNREACH - the peer has no AF_INET/AF_INET6 endpoint.
+ */
+static int
+wg_transmit(struct ifnet *ifp, struct mbuf *m)
+{
+	struct wg_softc *sc;
+	sa_family_t family;
+	struct epoch_tracker et;
+	struct wg_peer *peer;
+	struct wg_tag *t;
+	uint32_t af;
+	int rc;
+
+	/*
+	 * Work around lifetime issue in the ipv6 mld code.
+	 */
+	if (__predict_false(ifp->if_flags & IFF_DYING)) {
+		/* We still own the packet; do not leak it. */
+		m_freem(m);
+		return (ENXIO);
+	}
+
+	rc = 0;
+	sc = ifp->if_softc;
+	if ((t = wg_tag_get(m)) == NULL) {
+		rc = ENOBUFS;
+		goto early_out;
+	}
+	af = m->m_pkthdr.ph_family;
+	BPF_MTAP2(ifp, &af, sizeof(af), m);
+
+	NET_EPOCH_ENTER(et);
+	peer = wg_aip_lookup(&sc->sc_aips, m, OUT);
+	if (__predict_false(peer == NULL)) {
+		rc = ENOKEY;
+		goto err;
+	}
+
+	family = peer->p_endpoint.e_remote.r_sa.sa_family;
+	if (__predict_false(family != AF_INET && family != AF_INET6)) {
+		DPRINTF(sc, "No valid endpoint has been configured or "
+		    "discovered for peer %llu\n", (unsigned long long)peer->p_id);
+
+		rc = EHOSTUNREACH;
+		goto err;
+	}
+	t->t_peer = peer;
+	t->t_mbuf = NULL;
+	t->t_done = 0;
+	t->t_mtu = ifp->if_mtu;
+
+	wg_queue_stage(peer, m);
+	wg_queue_out(peer);
+	NET_EPOCH_EXIT(et);
+	return (rc);
+err:
+	NET_EPOCH_EXIT(et);
+early_out:
+	if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
+	/* TODO: send ICMP unreachable */
+	/* m_freem(), not m_free(): release the whole chain, not just its head. */
+	m_freem(m);
+	return (rc);
+}
+
+/*
+ * if_output method: remember the destination's address family in the packet
+ * header and pass the packet on to wg_transmit().  rt is not used.
+ */
+static int
+wg_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa, struct route *rt)
+{
+	int error;
+
+	m->m_pkthdr.ph_family = sa->sa_family;
+	error = wg_transmit(ifp, m);
+
+	return (error);
+}
+
+/*
+ * Create, update or remove one peer as described by the nvlist nvl.
+ *
+ * Must be called with sc_lock held exclusively.  Keys read from nvl:
+ *   "public-key" (binary, WG_KEY_SIZE, required), "remove" (bool),
+ *   "replace-allowedips" (bool), "endpoint" (binary sockaddr),
+ *   "preshared-key" (binary, WG_KEY_SIZE),
+ *   "persistent-keepalive-interval" (number, at most UINT16_MAX) and
+ *   "allowed-ips" (array of nvlists holding "cidr" plus "ipv4" or "ipv6").
+ *
+ * Returns 0 on success, EINVAL for malformed input, E2BIG when the interface
+ * already has MAX_PEERS_PER_IFACE peers, or the error from wg_aip_add().  A
+ * peer that was newly allocated by this call is destroyed again on failure;
+ * an existing peer keeps whatever changes were applied before the failure.
+ */
+static int
+wg_peer_add(struct wg_softc *sc, const nvlist_t *nvl)
+{
+ uint8_t public[WG_KEY_SIZE];
+ const void *pub_key;
+ const struct sockaddr *endpoint;
+ int err;
+ size_t size;
+ struct wg_peer *peer = NULL;
+ bool need_insert = false;
+
+ sx_assert(&sc->sc_lock, SX_XLOCKED);
+
+ if (!nvlist_exists_binary(nvl, "public-key")) {
+ return (EINVAL);
+ }
+ pub_key = nvlist_get_binary(nvl, "public-key", &size);
+ if (size != WG_KEY_SIZE) {
+ return (EINVAL);
+ }
+ /* A peer carrying our own public key is skipped. */
+ if (noise_local_keys(&sc->sc_local, public, NULL) == 0 &&
+ bcmp(public, pub_key, WG_KEY_SIZE) == 0) {
+ return (0); // Silently ignored; not actually a failure.
+ }
+ peer = wg_peer_lookup(sc, pub_key);
+ /* Removal request: succeeds whether or not the peer exists. */
+ if (nvlist_exists_bool(nvl, "remove") &&
+ nvlist_get_bool(nvl, "remove")) {
+ if (peer != NULL) {
+ wg_hashtable_peer_remove(&sc->sc_hashtable, peer);
+ wg_peer_destroy(peer);
+ }
+ return (0);
+ }
+ /* Drop the existing allowed-ips first when asked to replace them. */
+ if (nvlist_exists_bool(nvl, "replace-allowedips") &&
+ nvlist_get_bool(nvl, "replace-allowedips") &&
+ peer != NULL) {
+
+ wg_aip_delete(&peer->p_sc->sc_aips, peer);
+ }
+ /* Unknown public key: allocate a fresh peer, inserted only on success. */
+ if (peer == NULL) {
+ if (sc->sc_peer_count >= MAX_PEERS_PER_IFACE)
+ return (E2BIG);
+ sc->sc_peer_count++;
+
+ need_insert = true;
+ peer = wg_peer_alloc(sc);
+ MPASS(peer != NULL);
+ noise_remote_init(&peer->p_remote, pub_key, &sc->sc_local);
+ cookie_maker_init(&peer->p_cookie, pub_key);
+ }
+ if (nvlist_exists_binary(nvl, "endpoint")) {
+ endpoint = nvlist_get_binary(nvl, "endpoint", &size);
+ /* Only the size is bounded; the sockaddr contents are copied as-is. */
+ if (size > sizeof(peer->p_endpoint.e_remote)) {
+ err = EINVAL;
+ goto out;
+ }
+ memcpy(&peer->p_endpoint.e_remote, endpoint, size);
+ }
+ if (nvlist_exists_binary(nvl, "preshared-key")) {
+ const void *key;
+
+ key = nvlist_get_binary(nvl, "preshared-key", &size);
+ if (size != WG_KEY_SIZE) {
+ err = EINVAL;
+ goto out;
+ }
+ noise_remote_set_psk(&peer->p_remote, key);
+ }
+ if (nvlist_exists_number(nvl, "persistent-keepalive-interval")) {
+ uint64_t pki = nvlist_get_number(nvl, "persistent-keepalive-interval");
+ if (pki > UINT16_MAX) {
+ err = EINVAL;
+ goto out;
+ }
+ wg_timers_set_persistent_keepalive(&peer->p_timers, pki);
+ }
+ if (nvlist_exists_nvlist_array(nvl, "allowed-ips")) {
+ const void *binary;
+ uint64_t cidr;
+ const nvlist_t * const * aipl;
+ struct wg_allowedip aip;
+ size_t allowedip_count;
+
+ aipl = nvlist_get_nvlist_array(nvl, "allowed-ips",
+ &allowedip_count);
+ for (size_t idx = 0; idx < allowedip_count; idx++) {
+ /* Entries without a prefix length or an address are skipped. */
+ if (!nvlist_exists_number(aipl[idx], "cidr"))
+ continue;
+ cidr = nvlist_get_number(aipl[idx], "cidr");
+ if (nvlist_exists_binary(aipl[idx], "ipv4")) {
+ binary = nvlist_get_binary(aipl[idx], "ipv4", &size);
+ if (binary == NULL || cidr > 32 || size != sizeof(aip.ip4)) {
+ err = EINVAL;
+ goto out;
+ }
+ aip.family = AF_INET;
+ memcpy(&aip.ip4, binary, sizeof(aip.ip4));
+ } else if (nvlist_exists_binary(aipl[idx], "ipv6")) {
+ binary = nvlist_get_binary(aipl[idx], "ipv6", &size);
+ if (binary == NULL || cidr > 128 || size != sizeof(aip.ip6)) {
+ err = EINVAL;
+ goto out;
+ }
+ aip.family = AF_INET6;
+ memcpy(&aip.ip6, binary, sizeof(aip.ip6));
+ } else {
+ continue;
+ }
+ aip.cidr = cidr;
+
+ if ((err = wg_aip_add(&sc->sc_aips, peer, &aip)) != 0) {
+ goto out;
+ }
+ }
+ }
+ /* Publish a new peer; start its timers if the link is already up. */
+ if (need_insert) {
+ wg_hashtable_peer_insert(&sc->sc_hashtable, peer);
+ if (sc->sc_ifp->if_link_state == LINK_STATE_UP)
+ wg_timers_enable(&peer->p_timers);
+ }
+ return (0);
+
+out:
+ if (need_insert) /* If we fail, only destroy if it was new. */
+ wg_peer_destroy(peer);
+ return (err);
+}
+
+/*
+ * SIOCSWG backend: apply a configuration supplied by userland.
+ *
+ * wgd->wgd_data / wgd->wgd_size describe a packed nvlist in user memory.  It
+ * is copied in, unpacked and applied with sc_lock held exclusively.  Keys
+ * handled here: "replace-peers", "listen-port", "private-key",
+ * "user-cookie" and "peers" (each element is passed to wg_peer_add()).
+ *
+ * Returns 0 on success, EFAULT for an empty request, EBADMSG when the
+ * buffer does not unpack, EINVAL for out-of-range values, or the error from
+ * copyin(), wg_socket_init() or wg_peer_add().  Settings applied before a
+ * failure are not rolled back.
+ */
+static int
+wgc_set(struct wg_softc *sc, struct wg_data_io *wgd)
+{
+	uint8_t public[WG_KEY_SIZE], private[WG_KEY_SIZE];
+	struct ifnet *ifp;
+	void *nvlpacked;
+	nvlist_t *nvl;
+	size_t size;
+	int err;
+
+	ifp = sc->sc_ifp;
+	nvl = NULL;
+	if (wgd->wgd_size == 0 || wgd->wgd_data == NULL)
+		return (EFAULT);
+
+	sx_xlock(&sc->sc_lock);
+
+	nvlpacked = malloc(wgd->wgd_size, M_TEMP, M_WAITOK);
+	err = copyin(wgd->wgd_data, nvlpacked, wgd->wgd_size);
+	if (err)
+		goto out;
+	nvl = nvlist_unpack(nvlpacked, wgd->wgd_size, 0);
+	if (nvl == NULL) {
+		err = EBADMSG;
+		goto out;
+	}
+	if (nvlist_exists_bool(nvl, "replace-peers") &&
+	    nvlist_get_bool(nvl, "replace-peers"))
+		wg_peer_remove_all(sc);
+	if (nvlist_exists_number(nvl, "listen-port")) {
+		uint64_t new_port = nvlist_get_number(nvl, "listen-port");
+		if (new_port > UINT16_MAX) {
+			err = EINVAL;
+			goto out;
+		}
+		if (new_port != sc->sc_socket.so_port) {
+			/*
+			 * A running interface gets its socket re-initialised
+			 * right away; otherwise only the port is recorded.
+			 */
+			if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
+				if ((err = wg_socket_init(sc, new_port)) != 0)
+					goto out;
+			} else
+				sc->sc_socket.so_port = new_port;
+		}
+	}
+	if (nvlist_exists_binary(nvl, "private-key")) {
+		const void *key = nvlist_get_binary(nvl, "private-key", &size);
+		if (size != WG_KEY_SIZE) {
+			err = EINVAL;
+			goto out;
+		}
+
+		/* Only act when the key actually changes. */
+		if (noise_local_keys(&sc->sc_local, NULL, private) != 0 ||
+		    timingsafe_bcmp(private, key, WG_KEY_SIZE) != 0) {
+			struct noise_local *local;
+			struct wg_peer *peer;
+			struct wg_hashtable *ht = &sc->sc_hashtable;
+			bool has_identity;
+
+			if (curve25519_generate_public(public, key)) {
+				/* Peer conflict: remove conflicting peer. */
+				if ((peer = wg_peer_lookup(sc, public)) !=
+				    NULL) {
+					wg_hashtable_peer_remove(ht, peer);
+					wg_peer_destroy(peer);
+				}
+			}
+
+			/*
+			 * Set the private key and invalidate all existing
+			 * handshakes.
+			 */
+			local = &sc->sc_local;
+			noise_local_lock_identity(local);
+			/* Note: we might be removing the private key. */
+			has_identity = noise_local_set_private(local, key) == 0;
+			mtx_lock(&ht->h_mtx);
+			CK_LIST_FOREACH(peer, &ht->h_peers_list, p_entry) {
+				noise_remote_precompute(&peer->p_remote);
+				wg_timers_event_reset_handshake_last_sent(
+				    &peer->p_timers);
+				noise_remote_expire_current(&peer->p_remote);
+			}
+			mtx_unlock(&ht->h_mtx);
+			cookie_checker_update(&sc->sc_cookie,
+			    has_identity ? public : NULL);
+			noise_local_unlock_identity(local);
+		}
+	}
+	if (nvlist_exists_number(nvl, "user-cookie")) {
+		uint64_t user_cookie = nvlist_get_number(nvl, "user-cookie");
+		if (user_cookie > UINT32_MAX) {
+			err = EINVAL;
+			goto out;
+		}
+		wg_socket_set_cookie(sc, user_cookie);
+	}
+	if (nvlist_exists_nvlist_array(nvl, "peers")) {
+		size_t peercount;
+		const nvlist_t * const*nvl_peers;
+
+		nvl_peers = nvlist_get_nvlist_array(nvl, "peers", &peercount);
+		for (size_t i = 0; i < peercount; i++) {
+			err = wg_peer_add(sc, nvl_peers[i]);
+			if (err != 0)
+				goto out;
+		}
+	}
+
+out:
+	/* Release the unpacked list on success and on every error path. */
+	if (nvl != NULL)
+		nvlist_destroy(nvl);
+	/* Both buffers may hold private key material; scrub before release. */
+	explicit_bzero(private, sizeof(private));
+	explicit_bzero(nvlpacked, wgd->wgd_size);
+	free(nvlpacked, M_TEMP);
+	sx_xunlock(&sc->sc_lock);
+	return (err);
+}
+
+/*
+ * Convert an IPv4 netmask into a prefix length.
+ *
+ * Counts the leading 0xff bytes of *mask, then the leading one bits of the
+ * first byte that is not 0xff.  Bits after the first zero bit are ignored.
+ */
+static unsigned int
+in_mask2len(struct in_addr *mask)
+{
+	const uint8_t *bytes;
+	unsigned int full, extra;
+
+	bytes = (uint8_t *)mask;
+
+	/* Whole bytes that are completely set. */
+	full = 0;
+	while (full < sizeof(*mask) && bytes[full] == 0xff)
+		full++;
+
+	/* Leading one bits of the first partially set byte, if there is one. */
+	extra = 0;
+	if (full < sizeof(*mask)) {
+		while (extra < NBBY && (bytes[full] & (0x80 >> extra)) != 0)
+			extra++;
+	}
+
+	return full * NBBY + extra;
+}
+
+/*
+ * Snapshot the exportable state of peer into *exp.
+ *
+ * Must be called inside a network epoch section, so nothing here may sleep;
+ * the allowed-ips array is therefore allocated with M_NOWAIT.  Copies the
+ * endpoint (if one is set), the public and preshared keys, the persistent
+ * keepalive interval, the last handshake time, the byte counters and one
+ * wg_allowedip entry per element of the peer's p_aips list.
+ *
+ * On success exp->aip is either left untouched (no allowed-ips) or points to
+ * an M_TEMP allocation that the caller must free.  Returns 0 on success or
+ * ENOMEM when the array cannot be allocated.
+ */
+static int
+wg_peer_to_export(struct wg_peer *peer, struct wg_peer_export *exp)
+{
+	struct wg_endpoint *ep;
+	struct wg_aip *rt;
+	struct noise_remote *remote;
+	int i;
+
+	/* Non-sleepable context. */
+	NET_EPOCH_ASSERT();
+
+	bzero(&exp->endpoint, sizeof(exp->endpoint));
+	remote = &peer->p_remote;
+	ep = &peer->p_endpoint;
+	if (ep->e_remote.r_sa.sa_family != 0) {
+		exp->endpoint_sz = (ep->e_remote.r_sa.sa_family == AF_INET) ?
+		    sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+
+		memcpy(&exp->endpoint, &ep->e_remote, exp->endpoint_sz);
+	}
+
+	/* We always export it. */
+	(void)noise_remote_keys(remote, exp->public_key, exp->preshared_key);
+	exp->persistent_keepalive =
+	    peer->p_timers.t_persistent_keepalive_interval;
+	wg_timers_get_last_handshake(&peer->p_timers, &exp->last_handshake);
+	exp->rx_bytes = counter_u64_fetch(peer->p_rx_bytes);
+	exp->tx_bytes = counter_u64_fetch(peer->p_tx_bytes);
+
+	/* First pass: count the entries so the array can be sized. */
+	exp->aip_count = 0;
+	CK_LIST_FOREACH(rt, &peer->p_aips, r_entry) {
+		exp->aip_count++;
+	}
+
+	/* Early success; no allowed-ips to copy out. */
+	if (exp->aip_count == 0)
+		return (0);
+
+	/*
+	 * Zero the array: an entry whose family is neither AF_INET nor
+	 * AF_INET6 only gets its family filled in below, and the remaining
+	 * fields are exported later.
+	 */
+	exp->aip = malloc(exp->aip_count * sizeof(*exp->aip), M_TEMP,
+	    M_NOWAIT | M_ZERO);
+	if (exp->aip == NULL)
+		return (ENOMEM);
+
+	/* Second pass: copy at most aip_count entries. */
+	i = 0;
+	CK_LIST_FOREACH(rt, &peer->p_aips, r_entry) {
+		exp->aip[i].family = rt->r_addr.ss_family;
+		if (exp->aip[i].family == AF_INET) {
+			struct sockaddr_in *sin =
+			    (struct sockaddr_in *)&rt->r_addr;
+
+			exp->aip[i].ip4 = sin->sin_addr;
+
+			sin = (struct sockaddr_in *)&rt->r_mask;
+			exp->aip[i].cidr = in_mask2len(&sin->sin_addr);
+		} else if (exp->aip[i].family == AF_INET6) {
+			struct sockaddr_in6 *sin6 =
+			    (struct sockaddr_in6 *)&rt->r_addr;
+
+			exp->aip[i].ip6 = sin6->sin6_addr;
+
+			sin6 = (struct sockaddr_in6 *)&rt->r_mask;
+			exp->aip[i].cidr = in6_mask2len(&sin6->sin6_addr, NULL);
+		}
+		i++;
+		if (i == exp->aip_count)
+			break;
+	}
+
+	/* Again, AllowedIPs might have shrunk; update it. */
+	exp->aip_count = i;
+
+	return (0);
+}
+
+/*
+ * Build an nvlist describing one exported peer.
+ *
+ * Always adds "public-key" and "last-handshake-time".  "preshared-key" is
+ * added only when wgc_privileged() says so.  "endpoint", "allowed-ips",
+ * "persistent-keepalive-interval", "rx-bytes" and "tx-bytes" are added only
+ * when the corresponding value in *exp is non-zero / non-empty.
+ *
+ * Returns the new nvlist (owned by the caller) or NULL when an nvlist could
+ * not be created.
+ */
+static nvlist_t *
+wg_peer_export_to_nvl(struct wg_softc *sc, struct wg_peer_export *exp)
+{
+ struct wg_timespec64 ts64;
+ nvlist_t *nvl, **nvl_aips;
+ size_t i;
+ uint16_t family;
+
+ nvl_aips = NULL;
+ if ((nvl = nvlist_create(0)) == NULL)
+ return (NULL);
+
+ nvlist_add_binary(nvl, "public-key", exp->public_key,
+ sizeof(exp->public_key));
+ if (wgc_privileged(sc))
+ nvlist_add_binary(nvl, "preshared-key", exp->preshared_key,
+ sizeof(exp->preshared_key));
+ if (exp->endpoint_sz != 0)
+ nvlist_add_binary(nvl, "endpoint", &exp->endpoint,
+ exp->endpoint_sz);
+
+ /* Zeroed so the error path can stop at the first unset slot. */
+ if (exp->aip_count != 0) {
+ nvl_aips = mallocarray(exp->aip_count, sizeof(*nvl_aips),
+ M_WG, M_WAITOK | M_ZERO);
+ }
+
+ /* One small nvlist per allowed-ip: "cidr" plus "ipv4" or "ipv6". */
+ for (i = 0; i < exp->aip_count; i++) {
+ nvl_aips[i] = nvlist_create(0);
+ if (nvl_aips[i] == NULL)
+ goto err;
+ family = exp->aip[i].family;
+ nvlist_add_number(nvl_aips[i], "cidr", exp->aip[i].cidr);
+ if (family == AF_INET)
+ nvlist_add_binary(nvl_aips[i], "ipv4",
+ &exp->aip[i].ip4, sizeof(exp->aip[i].ip4));
+ else if (family == AF_INET6)
+ nvlist_add_binary(nvl_aips[i], "ipv6",
+ &exp->aip[i].ip6, sizeof(exp->aip[i].ip6));
+ }
+
+ if (i != 0) {
+ nvlist_add_nvlist_array(nvl, "allowed-ips",
+ (const nvlist_t *const *)nvl_aips, i);
+ }
+
+ /* The temporaries are destroyed here once they have been added to nvl. */
+ for (i = 0; i < exp->aip_count; ++i)
+ nvlist_destroy(nvl_aips[i]);
+
+ free(nvl_aips, M_WG);
+ nvl_aips = NULL;
+
+ ts64.tv_sec = exp->last_handshake.tv_sec;
+ ts64.tv_nsec = exp->last_handshake.tv_nsec;
+ nvlist_add_binary(nvl, "last-handshake-time", &ts64, sizeof(ts64));
+
+ if (exp->persistent_keepalive != 0)
+ nvlist_add_number(nvl, "persistent-keepalive-interval",
+ exp->persistent_keepalive);
+
+ if (exp->rx_bytes != 0)
+ nvlist_add_number(nvl, "rx-bytes", exp->rx_bytes);
+ if (exp->tx_bytes != 0)
+ nvlist_add_number(nvl, "tx-bytes", exp->tx_bytes);
+
+ return (nvl);
+err:
+ /* Only reached from the loop above, so nvl_aips is non-NULL here. */
+ for (i = 0; i < exp->aip_count && nvl_aips[i] != NULL; i++) {
+ nvlist_destroy(nvl_aips[i]);
+ }
+
+ free(nvl_aips, M_WG);
+ nvlist_destroy(nvl);
+ return (NULL);
+}
+
+/*
+ * Export every peer of sc as an array of nvlists.
+ *
+ * All three output pointers are optional:
+ *   nvlp        - if non-NULL, receives a new nvlist holding the array under
+ *                 the key "peers";
+ *   nvl_arrayp  - if non-NULL, receives the M_TEMP-allocated array of
+ *                 per-peer nvlists; the caller must destroy each element and
+ *                 free the array.  If NULL, the array is released here;
+ *   peer_countp - if non-NULL, receives the number of array elements.
+ *
+ * Peer state is first snapshotted inside a network epoch section (which may
+ * not sleep) and turned into nvlists afterwards.  On any error the outputs
+ * keep the NULL / 0 values stored at entry and nothing is left for the
+ * caller to free.
+ *
+ * Returns 0 on success, ENOENT when there are no peers, ENOMEM on
+ * allocation failure, or the error from wg_peer_to_export() / nvlist_error().
+ */
+static int
+wg_marshal_peers(struct wg_softc *sc, nvlist_t **nvlp, nvlist_t ***nvl_arrayp, int *peer_countp)
+{
+	struct wg_peer *peer;
+	int err, i, peer_count;
+	nvlist_t *nvl, **nvl_array;
+	struct epoch_tracker et;
+	struct wg_peer_export *wpe;
+
+	nvl = NULL;
+	nvl_array = NULL;
+	if (nvl_arrayp)
+		*nvl_arrayp = NULL;
+	if (nvlp)
+		*nvlp = NULL;
+	if (peer_countp)
+		*peer_countp = 0;
+	peer_count = sc->sc_hashtable.h_num_peers;
+	if (peer_count == 0) {
+		return (ENOENT);
+	}
+
+	if (nvlp && (nvl = nvlist_create(0)) == NULL)
+		return (ENOMEM);
+
+	err = i = 0;
+	nvl_array = malloc(peer_count*sizeof(void*), M_TEMP, M_WAITOK | M_ZERO);
+	wpe = malloc(peer_count*sizeof(*wpe), M_TEMP, M_WAITOK | M_ZERO);
+
+	NET_EPOCH_ENTER(et);
+	CK_LIST_FOREACH(peer, &sc->sc_hashtable.h_peers_list, p_entry) {
+		if ((err = wg_peer_to_export(peer, &wpe[i])) != 0) {
+			break;
+		}
+
+		i++;
+		if (i == peer_count)
+			break;
+	}
+	NET_EPOCH_EXIT(et);
+
+	if (err != 0)
+		goto out;
+
+	/* Update the peer count, in case we found fewer entries. */
+	peer_count = i;
+	if (peer_count == 0) {
+		err = ENOENT;
+		goto out;
+	}
+
+	for (i = 0; i < peer_count; i++) {
+		int idx;
+
+		/*
+		 * Peers are added to the list in reverse order, effectively,
+		 * because it's simpler/quicker to add at the head every time.
+		 *
+		 * Export them in reverse order. No worries if we fail mid-way
+		 * through, the cleanup below will DTRT.
+		 */
+		idx = peer_count - i - 1;
+		nvl_array[idx] = wg_peer_export_to_nvl(sc, &wpe[i]);
+		if (nvl_array[idx] == NULL) {
+			break;
+		}
+	}
+
+	if (i < peer_count) {
+		/* Error! */
+		err = ENOMEM;
+		goto out;
+	}
+
+	if (nvl != NULL) {
+		nvlist_add_nvlist_array(nvl, "peers",
+		    (const nvlist_t * const *)nvl_array, peer_count);
+		if ((err = nvlist_error(nvl))) {
+			goto out;
+		}
+		*nvlp = nvl;
+	}
+
+	/* Publish results only after everything has succeeded. */
+	if (peer_countp != NULL)
+		*peer_countp = peer_count;
+	if (nvl_arrayp != NULL) {
+		*nvl_arrayp = nvl_array;
+	} else {
+		/* Nobody wants the raw array; do not leak it. */
+		for (i = 0; i < peer_count; i++)
+			nvlist_destroy(nvl_array[i]);
+		free(nvl_array, M_TEMP);
+	}
+out:
+	if (err != 0) {
+		/*
+		 * Note that nvl_array is populated in reverse order, so
+		 * unset (NULL) slots can precede set ones; every slot is
+		 * passed to nvlist_destroy().
+		 */
+		for (i = 0; i < peer_count; i++) {
+			nvlist_destroy(nvl_array[i]);
+		}
+
+		free(nvl_array, M_TEMP);
+		if (nvl != NULL)
+			nvlist_destroy(nvl);
+	}
+
+	for (i = 0; i < peer_count; i++)
+		free(wpe[i].aip, M_TEMP);
+	free(wpe, M_TEMP);
+	return (err);
+}
+
+/*
+ * SIOCGWG backend: export the interface configuration to userland.
+ *
+ * Builds an nvlist with "listen-port", "user-cookie", "public-key",
+ * "private-key" (only when wgc_privileged()) and "peers", packs it and
+ * copies it to wgd->wgd_data.  Runs with sc_lock held shared.
+ *
+ * When wgd->wgd_size is 0 nothing is copied out; the required size is
+ * stored in wgd->wgd_size and 0 is returned, so a caller can size its
+ * buffer first.
+ *
+ * Returns 0 on success, ENOMEM on allocation failure, ENOSPC when the
+ * caller's buffer is too small, EFAULT when wgd_data is NULL, or the error
+ * from wg_marshal_peers() / copyout().
+ */
+static int
+wgc_get(struct wg_softc *sc, struct wg_data_io *wgd)
+{
+	nvlist_t *nvl, **nvl_array;
+	void *packed;
+	size_t size;
+	int peer_count, err, i;
+
+	nvl = nvlist_create(0);
+	if (nvl == NULL)
+		return (ENOMEM);
+
+	sx_slock(&sc->sc_lock);
+
+	err = 0;
+	packed = NULL;
+	nvl_array = NULL;
+	peer_count = 0;
+	if (sc->sc_socket.so_port != 0)
+		nvlist_add_number(nvl, "listen-port", sc->sc_socket.so_port);
+	if (sc->sc_socket.so_user_cookie != 0)
+		nvlist_add_number(nvl, "user-cookie", sc->sc_socket.so_user_cookie);
+	if (sc->sc_local.l_has_identity) {
+		nvlist_add_binary(nvl, "public-key", sc->sc_local.l_public, WG_KEY_SIZE);
+		if (wgc_privileged(sc))
+			nvlist_add_binary(nvl, "private-key", sc->sc_local.l_private, WG_KEY_SIZE);
+	}
+	if (sc->sc_hashtable.h_num_peers > 0) {
+		err = wg_marshal_peers(sc, NULL, &nvl_array, &peer_count);
+		if (err)
+			goto out_nvl;
+		nvlist_add_nvlist_array(nvl, "peers",
+		    (const nvlist_t * const *)nvl_array, peer_count);
+		/*
+		 * nvlist_add_nvlist_array() stores copies, so the array
+		 * handed to us by wg_marshal_peers() is still ours to free.
+		 */
+		for (i = 0; i < peer_count; i++)
+			nvlist_destroy(nvl_array[i]);
+		free(nvl_array, M_TEMP);
+		nvl_array = NULL;
+	}
+	packed = nvlist_pack(nvl, &size);
+	if (packed == NULL) {
+		err = ENOMEM;
+		goto out_nvl;
+	}
+	/* Size query: report how much room is needed, copy nothing. */
+	if (wgd->wgd_size == 0) {
+		wgd->wgd_size = size;
+		goto out_packed;
+	}
+	if (wgd->wgd_size < size) {
+		err = ENOSPC;
+		goto out_packed;
+	}
+	if (wgd->wgd_data == NULL) {
+		err = EFAULT;
+		goto out_packed;
+	}
+	err = copyout(packed, wgd->wgd_data, size);
+	wgd->wgd_size = size;
+
+out_packed:
+	/* The packed image may contain the private key; scrub it first. */
+	explicit_bzero(packed, size);
+	free(packed, M_NVLIST);
+out_nvl:
+	nvlist_destroy(nvl);
+	sx_sunlock(&sc->sc_lock);
+	return (err);
+}
+
+/*
+ * if_ioctl method.
+ *
+ * SIOCSWG requires PRIV_NET_WG and applies a configuration through
+ * wgc_set(); SIOCGWG exports one through wgc_get().  Of the generic
+ * interface requests, SIOCSIFFLAGS brings the tunnel up or down according
+ * to IFF_UP, SIOCSIFMTU accepts values in 1..MAX_MTU, and SIOCSIFADDR,
+ * SIOCADDMULTI and SIOCDELMULTI succeed without doing anything.  Every other
+ * request yields ENOTTY.
+ */
+static int
+wg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
+{
+	struct wg_softc *sc = ifp->if_softc;
+	struct wg_data_io *wgd = (struct wg_data_io *)data;
+	struct ifreq *ifr = (struct ifreq *)data;
+	int error;
+
+	switch (cmd) {
+	case SIOCSWG:
+		error = priv_check(curthread, PRIV_NET_WG);
+		if (error != 0)
+			return error;
+		return wgc_set(sc, wgd);
+	case SIOCGWG:
+		return wgc_get(sc, wgd);
+	/* Interface IOCTLs */
+	case SIOCSIFADDR:
+		/*
+		 * This differs from *BSD norms, but is more uniform with how
+		 * WireGuard behaves elsewhere.
+		 */
+		return 0;
+	case SIOCSIFFLAGS:
+		if ((ifp->if_flags & IFF_UP) == 0) {
+			wg_down(sc);
+			return 0;
+		}
+		return wg_up(sc);
+	case SIOCSIFMTU:
+		if (ifr->ifr_mtu <= 0 || ifr->ifr_mtu > MAX_MTU)
+			return EINVAL;
+		ifp->if_mtu = ifr->ifr_mtu;
+		return 0;
+	case SIOCADDMULTI:
+	case SIOCDELMULTI:
+		return 0;
+	default:
+		return ENOTTY;
+	}
+}
+
+/*
+ * Bring the interface into the running state.
+ *
+ * With sc_lock held exclusively: sets IFF_DRV_RUNNING, calls
+ * wg_socket_init() with the configured port and, if that succeeds, enables
+ * the timers of every peer, kicks each peer's output queue and reports
+ * LINK_STATE_UP.  If wg_socket_init() fails, IFF_DRV_RUNNING is cleared
+ * again.
+ *
+ * Returns 0 on success or when already running, EBUSY when the softc is
+ * marked WGF_DYING, or the error from wg_socket_init().
+ */
+static int
+wg_up(struct wg_softc *sc)
+{
+ struct wg_hashtable *ht = &sc->sc_hashtable;
+ struct ifnet *ifp = sc->sc_ifp;
+ struct wg_peer *peer;
+ /* Default result, returned if the softc is dying. */
+ int rc = EBUSY;
+
+ sx_xlock(&sc->sc_lock);
+ /* Jail's being removed, no more wg_up(). */
+ if ((sc->sc_flags & WGF_DYING) != 0)
+ goto out;
+
+ /* Silent success if we're already running. */
+ rc = 0;
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING)
+ goto out;
+ /* Set before wg_socket_init(); undone below if that call fails. */
+ ifp->if_drv_flags |= IFF_DRV_RUNNING;
+
+ rc = wg_socket_init(sc, sc->sc_socket.so_port);
+ if (rc == 0) {
+ mtx_lock(&ht->h_mtx);
+ CK_LIST_FOREACH(peer, &ht->h_peers_list, p_entry) {
+ wg_timers_enable(&peer->p_timers);
+ wg_queue_out(peer);
+ }
+ mtx_unlock(&ht->h_mtx);
+
+ if_link_state_change(sc->sc_ifp, LINK_STATE_UP);
+ } else {
+ ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+ }
+out:
+ sx_xunlock(&sc->sc_lock);
+ return (rc);
+}
+
+/*
+ * Take the interface out of the running state.
+ *
+ * With sc_lock held exclusively: clears IFF_DRV_RUNNING, purges each peer's
+ * staged packets and disables its timers, drains the handshake queue,
+ * clears each peer's noise state, reports LINK_STATE_DOWN and calls
+ * wg_socket_uninit().  Does nothing when the interface is not running.
+ */
+static void
+wg_down(struct wg_softc *sc)
+{
+ struct wg_hashtable *ht = &sc->sc_hashtable;
+ struct ifnet *ifp = sc->sc_ifp;
+ struct wg_peer *peer;
+
+ sx_xlock(&sc->sc_lock);
+ if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
+ sx_xunlock(&sc->sc_lock);
+ return;
+ }
+ ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+
+ /* First pass over the peers: drop staged packets, stop timers. */
+ mtx_lock(&ht->h_mtx);
+ CK_LIST_FOREACH(peer, &ht->h_peers_list, p_entry) {
+ wg_queue_purge(&peer->p_stage_queue);
+ wg_timers_disable(&peer->p_timers);
+ }
+ mtx_unlock(&ht->h_mtx);
+
+ mbufq_drain(&sc->sc_handshake_queue);
+
+ /* Second pass, after the handshake queue has been drained. */
+ mtx_lock(&ht->h_mtx);
+ CK_LIST_FOREACH(peer, &ht->h_peers_list, p_entry) {
+ noise_remote_clear(&peer->p_remote);
+ wg_timers_event_reset_handshake_last_sent(&peer->p_timers);
+ }
+ mtx_unlock(&ht->h_mtx);
+
+ if_link_state_change(sc->sc_ifp, LINK_STATE_DOWN);
+ wg_socket_uninit(sc);
+
+ sx_xunlock(&sc->sc_lock);
+}
+
+/*
+ * Allocate one encryption and one decryption grouptask per CPU (mp_ncpus of
+ * each) and attach every one of them to its CPU in qgroup_if_io_tqg.
+ */
+static void
+crypto_taskq_setup(struct wg_softc *sc)
+{
+	struct grouptask *enc, *dec;
+	int cpu;
+
+	sc->sc_encrypt = malloc(sizeof(struct grouptask)*mp_ncpus, M_WG, M_WAITOK);
+	sc->sc_decrypt = malloc(sizeof(struct grouptask)*mp_ncpus, M_WG, M_WAITOK);
+
+	for (cpu = 0; cpu < mp_ncpus; cpu++) {
+		enc = &sc->sc_encrypt[cpu];
+		dec = &sc->sc_decrypt[cpu];
+
+		GROUPTASK_INIT(enc, 0, (gtask_fn_t *)wg_softc_encrypt, sc);
+		taskqgroup_attach_cpu(qgroup_if_io_tqg, enc, sc, cpu, NULL,
+		    NULL, "wg encrypt");
+
+		GROUPTASK_INIT(dec, 0, (gtask_fn_t *)wg_softc_decrypt, sc);
+		taskqgroup_attach_cpu(qgroup_if_io_tqg, dec, sc, cpu, NULL,
+		    NULL, "wg decrypt");
+	}
+}
+
+/*
+ * Detach the per-CPU encryption and decryption grouptasks from
+ * qgroup_if_io_tqg and free both task arrays.
+ */
+static void
+crypto_taskq_destroy(struct wg_softc *sc)
+{
+	int cpu;
+
+	for (cpu = 0; cpu < mp_ncpus; cpu++) {
+		taskqgroup_detach(qgroup_if_io_tqg, sc->sc_encrypt + cpu);
+		taskqgroup_detach(qgroup_if_io_tqg, sc->sc_decrypt + cpu);
+	}
+
+	free(sc->sc_encrypt, M_WG);
+	free(sc->sc_decrypt, M_WG);
+}
+
+/*
+ * Cloner create callback: allocate and attach a new wg interface.
+ *
+ * Allocates the softc and ifnet, takes a reference on the creating thread's
+ * credentials, initialises the noise and cookie state, the queues, locks,
+ * task groups and lookup tables, attaches the interface and BPF, and links
+ * the softc into the global wg_list.  params is not used.  Always returns 0.
+ */
+static int
+wg_clone_create(struct if_clone *ifc, int unit, caddr_t params)
+{
+ struct wg_softc *sc;
+ struct ifnet *ifp;
+ struct noise_upcall noise_upcall;
+
+ sc = malloc(sizeof(*sc), M_WG, M_WAITOK | M_ZERO);
+ /* Reference released in wg_clone_destroy() or wg_prison_remove(). */
+ sc->sc_ucred = crhold(curthread->td_ucred);
+ ifp = sc->sc_ifp = if_alloc(IFT_WIREGUARD);
+ ifp->if_softc = sc;
+ if_initname(ifp, wgname, unit);
+
+ /* Callbacks handed to the noise layer; sc is their first argument. */
+ noise_upcall.u_arg = sc;
+ noise_upcall.u_remote_get =
+ (struct noise_remote *(*)(void *, uint8_t *))wg_remote_get;
+ noise_upcall.u_index_set =
+ (uint32_t (*)(void *, struct noise_remote *))wg_index_set;
+ noise_upcall.u_index_drop =
+ (void (*)(void *, uint32_t))wg_index_drop;
+ noise_local_init(&sc->sc_local, &noise_upcall);
+ cookie_checker_init(&sc->sc_cookie, ratelimit_zone);
+
+ sc->sc_socket.so_port = 0;
+
+ /* clone_count keeps the module from unloading while interfaces exist. */
+ atomic_add_int(&clone_count, 1);
+ ifp->if_capabilities = ifp->if_capenable = WG_CAPS;
+
+ mbufq_init(&sc->sc_handshake_queue, MAX_QUEUED_HANDSHAKES);
+ sx_init(&sc->sc_lock, "wg softc lock");
+ rw_init(&sc->sc_index_lock, "wg index lock");
+ sc->sc_peer_count = 0;
+ sc->sc_encap_ring = buf_ring_alloc(MAX_QUEUED_PKT, M_WG, M_WAITOK, NULL);
+ sc->sc_decap_ring = buf_ring_alloc(MAX_QUEUED_PKT, M_WG, M_WAITOK, NULL);
+ GROUPTASK_INIT(&sc->sc_handshake, 0,
+ (gtask_fn_t *)wg_softc_handshake_receive, sc);
+ taskqgroup_attach(qgroup_if_io_tqg, &sc->sc_handshake, sc, NULL, NULL, "wg tx initiation");
+ crypto_taskq_setup(sc);
+
+ wg_hashtable_init(&sc->sc_hashtable);
+ /* Index hash table used by wg_index_get() / wg_index_drop(). */
+ sc->sc_index = hashinit(HASHTABLE_INDEX_SIZE, M_DEVBUF, &sc->sc_index_mask);
+ wg_aip_init(&sc->sc_aips);
+
+ if_setmtu(ifp, ETHERMTU - 80);
+ ifp->if_flags = IFF_BROADCAST | IFF_MULTICAST | IFF_NOARP;
+ ifp->if_init = wg_init;
+ ifp->if_reassign = wg_reassign;
+ ifp->if_qflush = wg_qflush;
+ ifp->if_transmit = wg_transmit;
+ ifp->if_output = wg_output;
+ ifp->if_ioctl = wg_ioctl;
+
+ if_attach(ifp);
+ bpfattach(ifp, DLT_NULL, sizeof(uint32_t));
+
+ sx_xlock(&wg_sx);
+ LIST_INSERT_HEAD(&wg_list, sc, sc_entry);
+ sx_xunlock(&wg_sx);
+
+ return 0;
+}
+
+/*
+ * Cloner destroy callback: tear down a wg interface and free its softc.
+ *
+ * Marks the softc WGF_DYING and unlinks it from wg_list, closes the socket,
+ * waits for the network epoch and drains the task queues, removes all
+ * peers, then releases locks, tasks, rings and tables, detaches and frees
+ * the ifnet, and finally scrubs and frees the softc.
+ */
+static void
+wg_clone_destroy(struct ifnet *ifp)
+{
+ struct wg_softc *sc = ifp->if_softc;
+ struct ucred *cred;
+
+ sx_xlock(&wg_sx);
+ sx_xlock(&sc->sc_lock);
+ sc->sc_flags |= WGF_DYING;
+ /* Take over the credential reference; it is dropped further down. */
+ cred = sc->sc_ucred;
+ sc->sc_ucred = NULL;
+ sx_xunlock(&sc->sc_lock);
+ LIST_REMOVE(sc, sc_entry);
+ sx_xunlock(&wg_sx);
+
+ if_link_state_change(sc->sc_ifp, LINK_STATE_DOWN);
+
+ sx_xlock(&sc->sc_lock);
+ wg_socket_uninit(sc);
+ sx_xunlock(&sc->sc_lock);
+
+ /*
+ * No guarantees that all traffic have passed until the epoch has
+ * elapsed with the socket closed.
+ */
+ NET_EPOCH_WAIT();
+
+ taskqgroup_drain_all(qgroup_if_io_tqg);
+ sx_xlock(&sc->sc_lock);
+ wg_peer_remove_all(sc);
+ epoch_drain_callbacks(net_epoch_preempt);
+ sx_xunlock(&sc->sc_lock);
+ sx_destroy(&sc->sc_lock);
+ rw_destroy(&sc->sc_index_lock);
+ taskqgroup_detach(qgroup_if_io_tqg, &sc->sc_handshake);
+ crypto_taskq_destroy(sc);
+ buf_ring_free(sc->sc_encap_ring, M_WG);
+ buf_ring_free(sc->sc_decap_ring, M_WG);
+
+ /*
+  * NOTE(review): sc_index, allocated with hashinit() in
+  * wg_clone_create(), is not released in this function -- confirm
+  * whether it is freed elsewhere or leaked.
+  */
+ wg_aip_destroy(&sc->sc_aips);
+ wg_hashtable_destroy(&sc->sc_hashtable);
+
+ if (cred != NULL)
+ crfree(cred);
+ if_detach(sc->sc_ifp);
+ if_free(sc->sc_ifp);
+ /* Ensure any local/private keys are cleaned up */
+ explicit_bzero(sc, sizeof(*sc));
+ free(sc, M_WG);
+
+ atomic_add_int(&clone_count, -1);
+}
+
+/*
+ * if_qflush method; intentionally empty, nothing is flushed here.
+ */
+static void
+wg_qflush(struct ifnet *ifp __unused)
+{
+}
+
+/*
+ * Decide whether the calling thread may see privileged information
+ * (private-key, preshared-key).  By default these are only exported for
+ * root and jailed root.
+ *
+ * Returns true when priv_check() grants PRIV_NET_WG to curthread.  sc is not
+ * consulted.
+ */
+static bool
+wgc_privileged(struct wg_softc *sc)
+{
+	int error;
+
+	error = priv_check(curthread, PRIV_NET_WG);
+
+	return (error == 0);
+}
+
+/*
+ * if_reassign method: take the interface down when it is being moved.  The
+ * destination vnet and the extra argument are not used.
+ */
+static void
+wg_reassign(struct ifnet *ifp, struct vnet *new_vnet __unused,
+    char *unused __unused)
+{
+	struct wg_softc *sc = ifp->if_softc;
+
+	wg_down(sc);
+}
+
+/*
+ * if_init method: bring the interface up.  xsc is the wg_softc; the result
+ * of wg_up() is discarded because this hook returns nothing.
+ */
+static void
+wg_init(void *xsc)
+{
+	struct wg_softc *sc = xsc;
+
+	(void)wg_up(sc);
+}
+
+/*
+ * Per-vnet initialisation: register the "wg" interface cloner for this vnet
+ * and remember it in V_wg_cloner.
+ */
+static void
+vnet_wg_init(const void *unused __unused)
+{
+
+ V_wg_cloner = if_clone_simple(wgname, wg_clone_create, wg_clone_destroy,
+ 0);
+}
+VNET_SYSINIT(vnet_wg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
+ vnet_wg_init, NULL);
+
+/*
+ * Per-vnet teardown: detach the cloner registered by vnet_wg_init().
+ */
+static void
+vnet_wg_uninit(const void *unused __unused)
+{
+
+ if_clone_detach(V_wg_cloner);
+}
+VNET_SYSUNINIT(vnet_wg_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
+ vnet_wg_uninit, NULL);
+
+/*
+ * Jail removal method (registered as PR_METHOD_REMOVE in wg_module_init()).
+ *
+ * obj is the prison being removed; data is unused.  Walks wg_list with wg_sx
+ * held shared and, for each interface that is not already dying, releases
+ * the credentials if they belong to that prison and/or shuts the link and
+ * socket down if the interface lives in the prison's vnet.  Always returns 0.
+ */
+static int
+wg_prison_remove(void *obj, void *data __unused)
+{
+ const struct prison *pr = obj;
+ struct wg_softc *sc;
+ struct ucred *cred;
+ bool dying;
+
+ /*
+ * Do a pass through all if_wg interfaces and release creds on any from
+ * the jail that are supposed to be going away. This will, in turn, let
+ * the jail die so that we don't end up with Schrödinger's jail.
+ */
+ sx_slock(&wg_sx);
+ LIST_FOREACH(sc, &wg_list, sc_entry) {
+ /* Stays NULL unless this interface's home jail is pr. */
+ cred = NULL;
+
+ sx_xlock(&sc->sc_lock);
+ /* Sampled before this function may set WGF_DYING itself. */
+ dying = (sc->sc_flags & WGF_DYING) != 0;
+ if (!dying && sc->sc_ucred != NULL &&
+ sc->sc_ucred->cr_prison == pr) {
+ /* Home jail is going away. */
+ cred = sc->sc_ucred;
+ sc->sc_ucred = NULL;
+
+ sc->sc_flags |= WGF_DYING;
+ }
+
+ /*
+ * If this is our foreign vnet going away, we'll also down the
+ * link and kill the socket because traffic needs to stop. Any
+ * address will be revoked in the rehoming process.
+ */
+ if (cred != NULL || (!dying &&
+ sc->sc_ifp->if_vnet == pr->pr_vnet)) {
+ if_link_state_change(sc->sc_ifp, LINK_STATE_DOWN);
+ /* Have to kill the sockets, as they also hold refs. */
+ wg_socket_uninit(sc);
+ }
+
+ sx_xunlock(&sc->sc_lock);
+
+ /* Done after sc_lock is dropped: purge addresses, drop the cred ref. */
+ if (cred != NULL) {
+ CURVNET_SET(sc->sc_ifp->if_vnet);
+ if_purgeaddrs(sc->sc_ifp);
+ CURVNET_RESTORE();
+ crfree(cred);
+ }
+ }
+ sx_sunlock(&wg_sx);
+
+ return (0);
+}
+
+/*
+ * Module load: create the UMA zone for struct ratelimit and register the
+ * jail OSD slot whose PR_METHOD_REMOVE hook is wg_prison_remove().
+ */
+static void
+wg_module_init(void)
+{
+ /* Only the remove method is set; the remaining slots stay NULL. */
+ osd_method_t methods[PR_MAXMETHOD] = {
+ [PR_METHOD_REMOVE] = wg_prison_remove,
+ };
+
+ ratelimit_zone = uma_zcreate("wg ratelimit", sizeof(struct ratelimit),
+ NULL, NULL, NULL, NULL, 0, 0);
+ wg_osd_jail_slot = osd_jail_register(NULL, methods);
+}
+
+/*
+ * Module unload: undo wg_module_init().  The caller only invokes this when
+ * clone_count is zero; the assertion checks that wg_list is empty as well.
+ */
+static void
+wg_module_deinit(void)
+{
+
+ uma_zdestroy(ratelimit_zone);
+ osd_jail_deregister(wg_osd_jail_slot);
+
+ MPASS(LIST_EMPTY(&wg_list));
+}
+
+/*
+ * Module event handler.
+ *
+ * MOD_LOAD initialises the module.  MOD_UNLOAD tears it down, but is refused
+ * with EBUSY while any cloned interface still exists.  Every other event
+ * yields EOPNOTSUPP.  mod and arg are not used.
+ */
+static int
+wg_module_event_handler(module_t mod, int what, void *arg)
+{
+	int error;
+
+	error = 0;
+	if (what == MOD_LOAD) {
+		wg_module_init();
+	} else if (what == MOD_UNLOAD) {
+		if (atomic_load_int(&clone_count) != 0)
+			error = EBUSY;
+		else
+			wg_module_deinit();
+	} else {
+		error = EOPNOTSUPP;
+	}
+
+	return (error);
+}
+
+/*
+ * Module glue: the module is named "wg", its events are handled by
+ * wg_module_event_handler(), and it declares a dependency on the "crypto"
+ * module.
+ */
+static moduledata_t wg_moduledata = {
+ "wg",
+ wg_module_event_handler,
+ NULL
+};
+
+DECLARE_MODULE(wg, wg_moduledata, SI_SUB_PSEUDO, SI_ORDER_ANY);
+MODULE_VERSION(wg, 1);
+MODULE_DEPEND(wg, crypto, 1, 1, 1);
diff --git a/sys/dev/if_wg/if_wg.h b/sys/dev/if_wg/if_wg.h
new file mode 100644
index 000000000000..2a100456d406
--- /dev/null
+++ b/sys/dev/if_wg/if_wg.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2019 Matt Dunwoodie <ncon@noconroy.net>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __IF_WG_H__
+#define __IF_WG_H__
+
+#include <net/if.h>
+#include <netinet/in.h>
+
+/*
+ * Argument structure of the SIOCSWG and SIOCGWG ioctls.
+ */
+struct wg_data_io {
+ char wgd_name[IFNAMSIZ]; /* presumably the interface name -- confirm */
+ void *wgd_data; /* user buffer holding a packed nvlist */
+ size_t wgd_size; /* size of wgd_data in bytes; SIOCGWG writes back the packed size */
+};
+
+#define WG_KEY_SIZE 32
+
+#define SIOCSWG _IOWR('i', 210, struct wg_data_io)
+#define SIOCGWG _IOWR('i', 211, struct wg_data_io)
+
+#endif /* __IF_WG_H__ */
diff --git a/sys/dev/if_wg/include/crypto/blake2s.h b/sys/dev/if_wg/include/crypto/blake2s.h
deleted file mode 100644
index 17e6447ebcd8..000000000000
--- a/sys/dev/if_wg/include/crypto/blake2s.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <sys/types.h>
-
-#ifndef _BLAKE2S_H_
-#define _BLAKE2S_H_
-
-
-enum blake2s_lengths {
- BLAKE2S_BLOCK_SIZE = 64,
- BLAKE2S_HASH_SIZE = 32,
- BLAKE2S_KEY_SIZE = 32
-};
-
-struct blake2s_state {
- uint32_t h[8];
- uint32_t t[2];
- uint32_t f[2];
- uint8_t buf[BLAKE2S_BLOCK_SIZE];
- size_t buflen;
- uint8_t last_node;
-};
-
-void blake2s_init(struct blake2s_state *state, const size_t outlen);
-void blake2s_init_key(struct blake2s_state *state, const size_t outlen,
- const void *key, const size_t keylen);
-void blake2s_update(struct blake2s_state *state, const uint8_t *in, size_t inlen);
-void blake2s_final(struct blake2s_state *state, uint8_t *out, const size_t outlen);
-
-static inline void blake2s(uint8_t *out, const uint8_t *in, const uint8_t *key,
- const size_t outlen, const size_t inlen,
- const size_t keylen)
-{
- struct blake2s_state state;
-#ifdef __linux___
- WARN_ON(IS_ENABLED(DEBUG) && ((!in && inlen > 0) || !out || !outlen ||
- outlen > BLAKE2S_HASH_SIZE || keylen > BLAKE2S_KEY_SIZE ||
- (!key && keylen)));
-#endif
-
- if (keylen)
- blake2s_init_key(&state, outlen, key, keylen);
- else
- blake2s_init(&state, outlen);
-
- blake2s_update(&state, in, inlen);
- blake2s_final(&state, out, outlen);
-}
-
-void blake2s_hmac(uint8_t *out, const uint8_t *in, const uint8_t *key,
- const size_t outlen, const size_t inlen, const size_t keylen);
-
-#endif /* _BLAKE2S_H_ */
diff --git a/sys/dev/if_wg/include/crypto/curve25519.h b/sys/dev/if_wg/include/crypto/curve25519.h
deleted file mode 100644
index 3e90d1b270fe..000000000000
--- a/sys/dev/if_wg/include/crypto/curve25519.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2019-2020 Rubicon Communications, LLC (Netgate)
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _CURVE25519_H_
-#define _CURVE25519_H_
-
-#include <sys/systm.h>
-
-#define CURVE25519_KEY_SIZE 32
-
-void curve25519_generic(u8 [CURVE25519_KEY_SIZE],
- const u8 [CURVE25519_KEY_SIZE],
- const u8 [CURVE25519_KEY_SIZE]);
-
-static inline void curve25519_clamp_secret(u8 secret[CURVE25519_KEY_SIZE])
-{
- secret[0] &= 248;
- secret[31] = (secret[31] & 127) | 64;
-}
-
-static const u8 null_point[CURVE25519_KEY_SIZE] = { 0 };
-
-static inline int curve25519(u8 mypublic[CURVE25519_KEY_SIZE],
- const u8 secret[CURVE25519_KEY_SIZE],
- const u8 basepoint[CURVE25519_KEY_SIZE])
-{
- curve25519_generic(mypublic, secret, basepoint);
- return timingsafe_bcmp(mypublic, null_point, CURVE25519_KEY_SIZE);
-}
-
-static inline int curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE],
- const u8 secret[CURVE25519_KEY_SIZE])
-{
- static const u8 basepoint[CURVE25519_KEY_SIZE] __aligned(32) = { 9 };
-
- if (timingsafe_bcmp(secret, null_point, CURVE25519_KEY_SIZE) == 0)
- return 0;
-
- return curve25519(pub, secret, basepoint);
-}
-
-static inline void curve25519_generate_secret(u8 secret[CURVE25519_KEY_SIZE])
-{
- arc4random_buf(secret, CURVE25519_KEY_SIZE);
- curve25519_clamp_secret(secret);
-}
-
-#endif /* _CURVE25519_H_ */
diff --git a/sys/dev/if_wg/include/crypto/zinc.h b/sys/dev/if_wg/include/crypto/zinc.h
deleted file mode 100644
index 9aa1e8d59bf5..000000000000
--- a/sys/dev/if_wg/include/crypto/zinc.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#ifndef _WG_ZINC_H
-#define _WG_ZINC_H
-
-int chacha20_mod_init(void);
-int poly1305_mod_init(void);
-int chacha20poly1305_mod_init(void);
-int blake2s_mod_init(void);
-int curve25519_mod_init(void);
-
-#endif
diff --git a/sys/dev/if_wg/include/sys/if_wg_session.h b/sys/dev/if_wg/include/sys/if_wg_session.h
deleted file mode 100644
index 45399e534364..000000000000
--- a/sys/dev/if_wg/include/sys/if_wg_session.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2019 Matt Dunwoodie <ncon@noconroy.net>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- * $FreeBSD$
- */
-
-#ifndef __IF_WG_H__
-#define __IF_WG_H__
-
-#include <net/if.h>
-#include <netinet/in.h>
-
-/*
- * This is the public interface to the WireGuard network interface.
- *
- * It is designed to be used by tools such as ifconfig(8) and wg(4).
- */
-
-#define WG_KEY_SIZE 32
-
-#define WG_DEVICE_HAS_PUBKEY (1 << 0)
-#define WG_DEVICE_HAS_PRIVKEY (1 << 1)
-#define WG_DEVICE_HAS_MASKED_PRIVKEY (1 << 2)
-#define WG_DEVICE_HAS_PORT (1 << 3)
-#define WG_DEVICE_HAS_RDOMAIN (1 << 4)
-#define WG_DEVICE_REPLACE_PEERS (1 << 5)
-
-#define WG_PEER_HAS_PUBKEY (1 << 0)
-#define WG_PEER_HAS_SHAREDKEY (1 << 1)
-#define WG_PEER_HAS_MASKED_SHAREDKEY (1 << 2)
-#define WG_PEER_HAS_ENDPOINT (1 << 3)
-#define WG_PEER_HAS_PERSISTENTKEEPALIVE (1 << 4)
-#define WG_PEER_REPLACE_CIDRS (1 << 5)
-#define WG_PEER_REMOVE (1 << 6)
-
-#define SIOCSWG _IOWR('i', 200, struct wg_device_io)
-#define SIOCGWG _IOWR('i', 201, struct wg_device_io)
-
-#define WG_PEERS_FOREACH(p, d) \
- for (p = (d)->d_peers; p < (d)->d_peers + (d)->d_num_peers; p++)
-#define WG_CIDRS_FOREACH(c, p) \
- for (c = (p)->p_cidrs; c < (p)->p_cidrs + (p)->p_num_cidrs; c++)
-
-struct wg_allowedip {
- struct sockaddr_storage a_addr;
- struct sockaddr_storage a_mask;
-};
-
-enum {
- WG_PEER_CTR_TX_BYTES,
- WG_PEER_CTR_RX_BYTES,
- WG_PEER_CTR_NUM,
-};
-
-struct wg_device_io {
- char d_name[IFNAMSIZ];
- uint8_t d_flags;
- in_port_t d_port;
- int d_rdomain;
- uint8_t d_pubkey[WG_KEY_SIZE];
- uint8_t d_privkey[WG_KEY_SIZE];
- size_t d_num_peers;
- size_t d_num_cidrs;
- struct wg_peer_io *d_peers;
-};
-
-
-#ifndef ENOKEY
-#define ENOKEY ENOTCAPABLE
-#endif
-
-typedef enum {
- WGC_GET = 0x5,
- WGC_SET = 0x6,
-} wg_cmd_t;
-
-#endif /* __IF_WG_H__ */
diff --git a/sys/dev/if_wg/include/sys/if_wg_session_vars.h b/sys/dev/if_wg/include/sys/if_wg_session_vars.h
deleted file mode 100644
index 5fd85d3b7162..000000000000
--- a/sys/dev/if_wg/include/sys/if_wg_session_vars.h
+++ /dev/null
@@ -1,319 +0,0 @@
-/*
- * Copyright (c) 2019 Matt Dunwoodie <ncon@noconroy.net>
- * Copyright (c) 2019-2020 Rubicon Communications, LLC (Netgate)
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- * $FreeBSD$
- */
-
-#ifndef _IF_WG_VARS_H_
-#define _IF_WG_VARS_H_
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/time.h>
-
-#include <sys/lock.h>
-#include <sys/mutex.h>
-#include <crypto/siphash/siphash.h>
-
-
-#include <net/if.h>
-#include <net/if_var.h>
-#include <net/if_types.h>
-#include <net/ethernet.h>
-#include <net/pfvar.h>
-#include <net/iflib.h>
-
-#include <sys/wg_noise.h>
-#include <sys/wg_cookie.h>
-/* This is only needed for wg_keypair. */
-#include <sys/if_wg_session.h>
-
-#define UNIMPLEMENTED() panic("%s not implemented\n", __func__)
-
-#define WG_KEY_SIZE 32
-#define WG_MSG_PADDING_SIZE 16
-
-
-/* Constant for session */
-#define REKEY_TIMEOUT 5
-#define REKEY_TIMEOUT_JITTER 500 /* TODO ok? jason */
-#define REJECT_AFTER_TIME 180
-#define KEEPALIVE_TIMEOUT 10
-#define MAX_TIMER_HANDSHAKES (90 / REKEY_TIMEOUT)
-#define NEW_HANDSHAKE_TIMEOUT (REKEY_TIMEOUT + KEEPALIVE_TIMEOUT)
-
-#define MAX_QUEUED_INCOMING_HANDSHAKES 4096 /* TODO: replace this with DQL */
-#define MAX_QUEUED_PACKETS 1024 /* TODO: replace this with DQL */
-
-#define HASHTABLE_PEER_SIZE (1 << 6) //1 << 11
-#define HASHTABLE_INDEX_SIZE (HASHTABLE_PEER_SIZE * 3) //1 << 13
-
-#define PEER_MAGIC1 0xCAFEBABEB00FDADDULL
-#define PEER_MAGIC2 0xCAAFD0D0D00DBABEULL
-#define PEER_MAGIC3 0xD00DBABEF00DFADEULL
-
-
-enum message_type {
- MESSAGE_INVALID = 0,
- MESSAGE_HANDSHAKE_INITIATION = 1,
- MESSAGE_HANDSHAKE_RESPONSE = 2,
- MESSAGE_HANDSHAKE_COOKIE = 3,
- MESSAGE_DATA = 4
-};
-
-struct wg_softc;
-
-#if __FreeBSD_version > 1300000
-typedef void timeout_t (void *);
-#endif
-
-/* Socket */
-struct wg_endpoint {
- union wg_remote {
- struct sockaddr r_sa;
- struct sockaddr_in r_sin;
- struct sockaddr_in6 r_sin6;
- } e_remote;
- union wg_source {
- struct in_addr l_in;
- struct in6_pktinfo l_pktinfo6;
-#define l_in6 l_pktinfo6.ipi6_addr
- } e_local;
-};
-
-struct wg_socket {
- struct mtx so_mtx;
- in_port_t so_port;
- struct socket *so_so4;
- struct socket *so_so6;
-};
-
-struct wg_queue {
- struct mtx q_mtx;
- struct mbufq q;
-};
-
-struct wg_index {
- LIST_ENTRY(wg_index) i_entry;
- SLIST_ENTRY(wg_index) i_unused_entry;
- uint32_t i_key;
- struct noise_remote *i_value;
-};
-
-struct wg_timers {
- /* t_lock is for blocking wg_timers_event_* when setting t_disabled. */
- struct rwlock t_lock;
-
- int t_disabled;
- int t_need_another_keepalive;
- uint16_t t_persistent_keepalive_interval;
- struct callout t_new_handshake;
- struct callout t_send_keepalive;
- struct callout t_retry_handshake;
- struct callout t_zero_key_material;
- struct callout t_persistent_keepalive;
-
- struct mtx t_handshake_mtx;
- struct timespec t_handshake_last_sent;
- struct timespec t_handshake_complete;
- volatile int t_handshake_retries;
-
-};
-
-struct wg_peer {
- uint64_t p_magic_1;
- CK_LIST_ENTRY(wg_peer) p_hash_entry;
- CK_LIST_ENTRY(wg_peer) p_entry;
- uint64_t p_id;
- struct wg_softc *p_sc;
-
- struct noise_remote p_remote;
- struct cookie_maker p_cookie;
- struct wg_timers p_timers;
-
- struct rwlock p_endpoint_lock;
- struct wg_endpoint p_endpoint;
-
- uint64_t p_magic_2;
-
- SLIST_HEAD(,wg_index) p_unused_index;
- struct wg_index p_index[3];
-
- struct wg_queue p_encap_queue;
- struct wg_queue p_decap_queue;
-
- struct grouptask p_clear_secrets;
- struct grouptask p_send_initiation;
- struct grouptask p_send_keepalive;
- struct grouptask p_send;
- struct grouptask p_recv;
-
- counter_u64_t p_tx_bytes;
- counter_u64_t p_rx_bytes;
-
- CK_LIST_HEAD(, wg_route) p_routes;
- uint64_t p_magic_3;
- struct mtx p_lock;
- struct epoch_context p_ctx;
-};
-
-
-
-/* Packet */
-
-void wg_softc_decrypt(struct wg_softc *);
-void wg_softc_encrypt(struct wg_softc *);
-
-/* Queue */
-void wg_queue_init(struct wg_queue *, const char *);
-void wg_queue_deinit(struct wg_queue *);
-
-/* Counter */
-
-/* Timers */
-
-/* Route */
-enum route_direction {
- IN,
- OUT,
-};
-
-struct wg_route_table {
- size_t t_count;
- struct radix_node_head *t_ip;
- struct radix_node_head *t_ip6;
-};
-struct wg_peer;
-
-struct wg_route {
- struct radix_node r_nodes[2];
- struct wg_allowedip r_cidr;
- CK_LIST_ENTRY(wg_route) r_entry;
- struct wg_peer *r_peer;
-};
-
-
-int wg_route_add(struct wg_route_table *, struct wg_peer *,
- const struct wg_allowedip *);
-int wg_route_delete(struct wg_route_table *, struct wg_peer *);
-
-/* Noise */
-
-/*
- * Peer
- *
- *
- *
- */
-
-struct wg_softc;
-
-struct wg_hashtable {
- struct mtx h_mtx;
- SIPHASH_KEY h_secret;
- CK_LIST_HEAD(, wg_peer) h_peers_list;
- CK_LIST_HEAD(, wg_peer) *h_peers;
- u_long h_peers_mask;
- size_t h_num_peers;
- LIST_HEAD(, noise_keypair) *h_keys;
- u_long h_keys_mask;
- size_t h_num_keys;
-};
-
-/* Softc */
-struct wg_softc {
- if_softc_ctx_t shared;
- if_ctx_t wg_ctx;
- struct ifnet *sc_ifp;
- uint16_t sc_incoming_port;
- uint32_t sc_user_cookie;
-
- struct wg_socket sc_socket;
- struct wg_hashtable sc_hashtable;
- struct wg_route_table sc_routes;
-
- struct mbufq sc_handshake_queue;
- struct grouptask sc_handshake;
-
- struct noise_local sc_local;
- struct cookie_checker sc_cookie;
-
- struct buf_ring *sc_encap_ring;
- struct buf_ring *sc_decap_ring;
-
- struct grouptask *sc_encrypt;
- struct grouptask *sc_decrypt;
-
- struct rwlock sc_index_lock;
- LIST_HEAD(,wg_index) *sc_index;
- u_long sc_index_mask;
-
- struct mtx sc_mtx;
-};
-
-struct wg_tag {
- struct m_tag wt_tag;
- struct wg_endpoint t_endpoint;
- struct wg_peer *t_peer;
- struct mbuf *t_mbuf;
- sa_family_t t_family;
- int t_done;
- int t_mtu;
-};
-
-struct wg_peer *wg_route_lookup(struct wg_route_table *, struct mbuf *,
- enum route_direction);
-
-void wg_peer_remove_all(struct wg_softc *);
-struct wg_peer *wg_peer_alloc(struct wg_softc *);
-void wg_peer_destroy(struct wg_peer *);
-
-void wg_hashtable_init(struct wg_hashtable *);
-void wg_hashtable_destroy(struct wg_hashtable *);
-void wg_hashtable_peer_insert(struct wg_hashtable *, struct wg_peer *);
-struct wg_peer *wg_peer_lookup(struct wg_softc *,
- const uint8_t [WG_KEY_SIZE]);
-void wg_hashtable_peer_remove(struct wg_hashtable *, struct wg_peer *);
-
-
-int wg_queue_out(struct wg_peer *peer, struct mbuf *m);
-
-int wg_route_init(struct wg_route_table *);
-void wg_route_destroy(struct wg_route_table *);
-
-int wg_socket_init(struct wg_softc *sc);
-void wg_socket_reinit(struct wg_softc *, struct socket *so4,
- struct socket *so6);
-int wg_socket_close(struct wg_socket *so);
-
-void wg_softc_handshake_receive(struct wg_softc *sc);
-
-int wg_timers_get_persistent_keepalive(struct wg_timers *, uint16_t *);
-void wg_timers_set_persistent_keepalive(struct wg_timers *t, uint16_t);
-void wg_timers_get_last_handshake(struct wg_timers *, struct timespec *);
-
-
-struct noise_remote *wg_remote_get(struct wg_softc *, uint8_t [NOISE_KEY_SIZE]);
-uint32_t wg_index_set(struct wg_softc *, struct noise_remote *);
-struct noise_remote *wg_index_get(struct wg_softc *, uint32_t);
-void wg_index_drop(struct wg_softc *, uint32_t);
-void wg_encrypt_dispatch(struct wg_softc *);
-void wg_decrypt_dispatch(struct wg_softc *);
-
-struct wg_tag *wg_tag_get(struct mbuf *m);
-
-
-#endif /* _IF_WG_VARS_H_ */
diff --git a/sys/dev/if_wg/include/sys/simd-x86_64.h b/sys/dev/if_wg/include/sys/simd-x86_64.h
deleted file mode 100644
index 1453083aa273..000000000000
--- a/sys/dev/if_wg/include/sys/simd-x86_64.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2019-2020 Rubicon Communications, LLC (Netgate)
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef _SIMD_X86_64_H_
-#define _SIMD_X86_64_H_
-
-
-#include <x86/x86_var.h>
-#include <x86/specialreg.h>
-
-static inline uint64_t
-xgetbv(uint32_t index)
-{
- uint32_t eax, edx;
- /* xgetbv - instruction byte code */
- __asm__ __volatile__(".byte 0x0f; .byte 0x01; .byte 0xd0"
- : "=a" (eax), "=d" (edx)
- : "c" (index));
-
- return ((((uint64_t)edx)<<32) | (uint64_t)eax);
-}
-
-
-/*
- * Detect register set support
- */
-static inline boolean_t
-__simd_state_enabled(const uint64_t state)
-{
- boolean_t has_osxsave;
- uint64_t xcr0;
-
- has_osxsave = !!(cpu_feature2 & CPUID2_OSXSAVE);
-
- if (!has_osxsave)
- return (0);
-
- xcr0 = xgetbv(0);
- return ((xcr0 & state) == state);
-}
-
-#define _XSTATE_SSE_AVX (0x2 | 0x4)
-#define _XSTATE_AVX512 (0xE0 | _XSTATE_SSE_AVX)
-
-#define __ymm_enabled() __simd_state_enabled(_XSTATE_SSE_AVX)
-#define __zmm_enabled() __simd_state_enabled(_XSTATE_AVX512)
-#endif
-
diff --git a/sys/dev/if_wg/include/sys/support.h b/sys/dev/if_wg/include/sys/support.h
deleted file mode 100644
index 7874fd9b1524..000000000000
--- a/sys/dev/if_wg/include/sys/support.h
+++ /dev/null
@@ -1,342 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2019-2020 Rubicon Communications, LLC (Netgate)
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef SYS_SUPPORT_H_
-#define SYS_SUPPORT_H_
-#ifdef __LOCORE
-#include <machine/asm.h>
-#define SYM_FUNC_START ENTRY
-#define SYM_FUNC_END END
-
-#else
-#include <sys/types.h>
-#include <sys/limits.h>
-#include <sys/endian.h>
-#include <sys/libkern.h>
-#include <sys/malloc.h>
-#include <sys/proc.h>
-#include <sys/lock.h>
-#include <vm/uma.h>
-
-#if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
-#include <machine/fpu.h>
-#endif
-#include <crypto/siphash/siphash.h>
-
-
-#define COMPAT_ZINC_IS_A_MODULE
-MALLOC_DECLARE(M_WG);
-
-#define BUILD_BUG_ON(x) CTASSERT(!(x))
-
-#define BIT(nr) (1UL << (nr))
-#define BIT_ULL(nr) (1ULL << (nr))
-#ifdef __LP64__
-#define BITS_PER_LONG 64
-#else
-#define BITS_PER_LONG 32
-#endif
-
-#define rw_enter_write rw_wlock
-#define rw_exit_write rw_wunlock
-#define rw_enter_read rw_rlock
-#define rw_exit_read rw_runlock
-#define rw_exit rw_unlock
-
-#define ASSERT(x) MPASS(x)
-
-#define ___PASTE(a,b) a##b
-#define __PASTE(a,b) ___PASTE(a,b)
-#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
-
-#define typeof(x) __typeof__(x)
-
-
-#define min_t(t, a, b) ({ t __a = (a); t __b = (b); __a > __b ? __b : __a; })
-
-typedef uint8_t u8;
-typedef uint16_t u16;
-typedef uint32_t u32;
-typedef uint32_t __le32;
-typedef uint64_t u64;
-typedef uint64_t __le64;
-
-#define __must_check __attribute__((__warn_unused_result__))
-#define asmlinkage
-#define __ro_after_init __read_mostly
-
-#define get_unaligned_le32(x) le32dec(x)
-#define get_unaligned_le64(x) le64dec(x)
-
-#define cpu_to_le64(x) htole64(x)
-#define cpu_to_le32(x) htole32(x)
-#define letoh64(x) le64toh(x)
-
-#define need_resched() \
- ((curthread->td_flags & (TDF_NEEDRESCHED|TDF_ASTPENDING)) || \
- curthread->td_owepreempt)
-
-
-#define CONTAINER_OF(a, b, c) __containerof((a), b, c)
-
-typedef struct {
- uint64_t k0;
- uint64_t k1;
-} SIPHASH_KEY;
-
-static inline uint64_t
-siphash24(const SIPHASH_KEY *key, const void *src, size_t len)
-{
- SIPHASH_CTX ctx;
-
- return (SipHashX(&ctx, 2, 4, (const uint8_t *)key, src, len));
-}
-
-static inline void
-put_unaligned_le32(u32 val, void *p)
-{
- *((__le32 *)p) = cpu_to_le32(val);
-}
-
-
-#define rol32(i32, n) ((i32) << (n) | (i32) >> (32 - (n)))
-
-#define memzero_explicit(p, s) explicit_bzero(p, s)
-
-#define EXPORT_SYMBOL(x)
-
-#define U32_MAX ((u32)~0U)
-#if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
-#define kfpu_begin(ctx) { \
- if (ctx->sc_fpu_ctx == NULL) { \
- ctx->sc_fpu_ctx = fpu_kern_alloc_ctx(0); \
- } \
- critical_enter(); \
- fpu_kern_enter(curthread, ctx->sc_fpu_ctx, FPU_KERN_NORMAL); \
-}
-
-#define kfpu_end(ctx) { \
- MPASS(ctx->sc_fpu_ctx != NULL); \
- fpu_kern_leave(curthread, ctx->sc_fpu_ctx); \
- critical_exit(); \
-}
-#else
-#define kfpu_begin(ctx)
-#define kfpu_end(ctx)
-#define fpu_kern_free_ctx(p)
-#endif
-
-typedef enum {
- HAVE_NO_SIMD = 1 << 0,
- HAVE_FULL_SIMD = 1 << 1,
- HAVE_SIMD_IN_USE = 1 << 31
-} simd_context_state_t;
-
-typedef struct {
- simd_context_state_t sc_state;
- struct fpu_kern_ctx *sc_fpu_ctx;
-} simd_context_t;
-
-
-#define DONT_USE_SIMD NULL
-
-static __must_check inline bool
-may_use_simd(void)
-{
-#if defined(__amd64__)
- return true;
-#else
- return false;
-#endif
-}
-
-static inline void
-simd_get(simd_context_t *ctx)
-{
- ctx->sc_state = may_use_simd() ? HAVE_FULL_SIMD : HAVE_NO_SIMD;
-}
-
-static inline void
-simd_put(simd_context_t *ctx)
-{
-#if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
- if (is_fpu_kern_thread(0))
- return;
-#endif
- if (ctx->sc_state & HAVE_SIMD_IN_USE)
- kfpu_end(ctx);
- ctx->sc_state = HAVE_NO_SIMD;
-}
-
-static __must_check inline bool
-simd_use(simd_context_t *ctx)
-{
-#if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
- if (is_fpu_kern_thread(0))
- return true;
-#else
- return false;
-#endif
- if (ctx == NULL)
- return false;
- if (!(ctx->sc_state & HAVE_FULL_SIMD))
- return false;
- if (ctx->sc_state & HAVE_SIMD_IN_USE)
- return true;
- kfpu_begin(ctx);
- ctx->sc_state |= HAVE_SIMD_IN_USE;
- return true;
-}
-
-static inline bool
-simd_relax(simd_context_t *ctx)
-{
- if ((ctx->sc_state & HAVE_SIMD_IN_USE) && need_resched()) {
- simd_put(ctx);
- simd_get(ctx);
- return simd_use(ctx);
- }
- return false;
-}
-
-#define unlikely(x) __predict_false(x)
-#define likely(x) __predict_true(x)
-/* Generic path for arbitrary size */
-
-
-static inline unsigned long
-__crypto_memneq_generic(const void *a, const void *b, size_t size)
-{
- unsigned long neq = 0;
-
- while (size >= sizeof(unsigned long)) {
- neq |= *(const unsigned long *)a ^ *(const unsigned long *)b;
- __compiler_membar();
- a = ((const char *)a + sizeof(unsigned long));
- b = ((const char *)b + sizeof(unsigned long));
- size -= sizeof(unsigned long);
- }
- while (size > 0) {
- neq |= *(const unsigned char *)a ^ *(const unsigned char *)b;
- __compiler_membar();
- a = (const char *)a + 1;
- b = (const char *)b + 1;
- size -= 1;
- }
- return neq;
-}
-
-#define crypto_memneq(a, b, c) __crypto_memneq_generic((a), (b), (c))
-
-static inline void
-__cpu_to_le32s(uint32_t *buf)
-{
- *buf = htole32(*buf);
-}
-
-static inline void cpu_to_le32_array(u32 *buf, unsigned int words)
-{
- while (words--) {
- __cpu_to_le32s(buf);
- buf++;
- }
-}
-
-#define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 1
-void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int len);
-
-static inline void crypto_xor_cpy(u8 *dst, const u8 *src1, const u8 *src2,
- unsigned int size)
-{
- if (CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS &&
- __builtin_constant_p(size) &&
- (size % sizeof(unsigned long)) == 0) {
- unsigned long *d = (unsigned long *)dst;
- const unsigned long *s1 = (const unsigned long *)src1;
- const unsigned long *s2 = (const unsigned long *)src2;
-
- while (size > 0) {
- *d++ = *s1++ ^ *s2++;
- size -= sizeof(unsigned long);
- }
- } else {
- __crypto_xor(dst, src1, src2, size);
- }
-}
-#include <sys/kernel.h>
-#define module_init(fn) \
-static void \
-wrap_ ## fn(void *dummy __unused) \
-{ \
- fn(); \
-} \
-SYSINIT(if_wg_ ## fn, SI_SUB_LAST, SI_ORDER_FIRST, wrap_ ## fn, NULL)
-
-
-#define module_exit(fn) \
-static void \
-wrap_ ## fn(void *dummy __unused) \
-{ \
- fn(); \
-} \
-SYSUNINIT(if_wg_ ## fn, SI_SUB_LAST, SI_ORDER_FIRST, wrap_ ## fn, NULL)
-
-#define module_param(a, b, c)
-#define MODULE_LICENSE(x)
-#define MODULE_DESCRIPTION(x)
-#define MODULE_AUTHOR(x)
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-
-#define __initconst
-#define __initdata
-#define __init
-#define __exit
-#define BUG() panic("%s:%d bug hit!\n", __FILE__, __LINE__)
-
-#define WARN_ON(cond) ({ \
- bool __ret = (cond); \
- if (__ret) { \
- printf("WARNING %s failed at %s:%d\n", \
- __stringify(cond), __FILE__, __LINE__); \
- } \
- unlikely(__ret); \
-})
-
-#define pr_err printf
-#define pr_info printf
-#define IS_ENABLED(x) 0
-#define ___stringify(...) #__VA_ARGS__
-#define __stringify(...) ___stringify(__VA_ARGS__)
-#define kmalloc(size, flag) malloc((size), M_WG, M_WAITOK)
-#define kfree(p) free(p, M_WG)
-#define vzalloc(size) malloc((size), M_WG, M_WAITOK|M_ZERO)
-#define vfree(p) free(p, M_WG)
-#endif
-#endif
diff --git a/sys/dev/if_wg/include/sys/wg_module.h b/sys/dev/if_wg/include/sys/wg_module.h
deleted file mode 100644
index cc662104d640..000000000000
--- a/sys/dev/if_wg/include/sys/wg_module.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*-
- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
- *
- * Copyright (c) 2019-2020 Rubicon Communications, LLC (Netgate)
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-#ifndef MODULE_H_
-#define MODULE_H_
-
-#include <sys/mbuf.h>
-#include <sys/socket.h>
-#include <net/if.h>
-#include <net/if_var.h>
-#include <sys/support.h>
-
-
-#include <sys/types.h>
-#include <sys/epoch.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
-
-
-
-#include <crypto/curve25519.h>
-#include <zinc/chacha20poly1305.h>
-#include <crypto/blake2s.h>
-
-
-enum noise_lengths {
- NOISE_PUBLIC_KEY_LEN = CURVE25519_KEY_SIZE,
- NOISE_SYMMETRIC_KEY_LEN = CHACHA20POLY1305_KEY_SIZE,
- NOISE_TIMESTAMP_LEN = sizeof(uint64_t) + sizeof(uint32_t),
- NOISE_AUTHTAG_LEN = CHACHA20POLY1305_AUTHTAG_SIZE,
- NOISE_HASH_LEN = BLAKE2S_HASH_SIZE
-};
-
-#define noise_encrypted_len(plain_len) ((plain_len) + NOISE_AUTHTAG_LEN)
-
-enum cookie_values {
- COOKIE_SECRET_MAX_AGE = 2 * 60,
- COOKIE_SECRET_LATENCY = 5,
- COOKIE_NONCE_LEN = XCHACHA20POLY1305_NONCE_SIZE,
- COOKIE_LEN = 16
-};
-
-enum limits {
- REKEY_TIMEOUT = 5,
- INITIATIONS_PER_SECOND = 50,
- MAX_PEERS_PER_DEVICE = 1U << 20,
- KEEPALIVE_TIMEOUT = 10,
- MAX_TIMER_HANDSHAKES = 90 / REKEY_TIMEOUT,
- MAX_QUEUED_INCOMING_HANDSHAKES = 4096, /* TODO: replace this with DQL */
- MAX_STAGED_PACKETS = 128,
- MAX_QUEUED_PACKETS = 1024 /* TODO: replace this with DQL */
-};
-
-#define zfree(addr, type) \
- do { \
- explicit_bzero(addr, sizeof(*addr)); \
- free(addr, type); \
- } while (0)
-
-struct crypt_queue {
- union {
- struct {
- int last_cpu;
- };
- };
-};
-
-#define __ATOMIC_LOAD_SIZE \
- ({ \
- switch (size) { \
- case 1: *(uint8_t *)res = *(volatile uint8_t *)p; break; \
- case 2: *(uint16_t *)res = *(volatile uint16_t *)p; break; \
- case 4: *(uint32_t *)res = *(volatile uint32_t *)p; break; \
- case 8: *(uint64_t *)res = *(volatile uint64_t *)p; break; \
- } \
-})
-
-static inline void
-__atomic_load_acq_size(volatile void *p, void *res, int size)
-{
- __ATOMIC_LOAD_SIZE;
-}
-
-#define atomic_load_acq(x) \
- ({ \
- union { __typeof(x) __val; char __c[1]; } __u; \
- __atomic_load_acq_size(&(x), __u.__c, sizeof(x)); \
- __u.__val; \
-})
-
-
-int wg_ctx_init(void);
-void wg_ctx_uninit(void);
-
-
-#endif
diff --git a/sys/dev/if_wg/include/sys/wg_noise.h b/sys/dev/if_wg/include/sys/wg_noise.h
deleted file mode 100644
index 40bdab515bc7..000000000000
--- a/sys/dev/if_wg/include/sys/wg_noise.h
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * Copyright (C) 2015-2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- * Copyright (C) 2019-2020 Matt Dunwoodie <ncon@noconroy.net>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
- * ======== wg_noise.h ========
- *
- * This file provides a thread safe interface to the Noise protocol as used in
- * WireGuard. The three user facing components are:
- *
- * * noise_local
- * Stores the local state for a noise peer.
- * * noise_remote
- * Stores the remote state for a noise peer.
- * * noise_upcall
- * Stores callback routines for index and peers
- *
- * Additionally, a noise_counter, which is invisible to the user, is used to
- * track message nonces to prevent message replay.
- *
- * This module uses Curve25519 for asymmetric crypto, and ChaCha20Poly1305 for
- * symmetric crypto. The handshake uses ephemeral keys, which provide perfect
- * forward secrecy. Keys are NOISE_KEY_SIZE (32) bytes long and can be
- * generated with a CSRNG. While this module will clamp the key to form a valid
- * Curve25519 key, it is recommended that keys are stored in Curve25519 form to
- * preserve interoperability with other systems. Additionally, there is an
- * optional PresharedKey of length NOISE_PSK_SIZE (also 32 bytes), which when
- * used, will provide protection against known quantum attacks. Without it,
- * Curve25519 is broken by Shor's algorithm.
- *
- * -------- noise_local --------
- *
- * void noise_local_init(noise_local *, noise_upcall *)
- * - Initialise noise_local, should only be called once and before use.
- *
- * int noise_local_set_private(noise_local *, uint8_t *private)
- * - Set the local private key. This will also calculate the corresponding
- * public key.
- *
- * int noise_local_keys(noise_local *, uint8_t *public, uint8_t *private)
- * - Get the local keys. It will ensure that a key has been set and if
- * not, will return ENXIO.
- *
- * -------- noise_remote --------
- *
- * void noise_remote_init(noise_remote *, uint8_t *public, noise_local *)
- * - Initialise noise_remote, should only be called once and before use. Key
- * must be provided and it cannot be changed once set.
- *
- * int noise_remote_set_psk(noise_remote *, uint8_t *psk)
- * - Set the preshared key. To remove the preshared key, set a key of all 0x00.
- *
- * int noise_remote_keys(noise_remote *, uint8_t *public, uint8_t *psk)
- * - Get the remote keys.
- *
- * -------- noise_upcall --------
- *
- * The noise_upcall struct is used to look up incoming public keys, as well as
- * to allocate and deallocate indexes for a remote. The allocation and
- * deallocation are serialised per noise_remote and guaranteed to only have 3
- * allocated indexes at once.
- *
- * u_arg - passed to callback functions as void *
- * u_remote_get - look up noise_remote based on public key.
- * u_index_set - allocate index for noise_remote. Any further packets that
- * arrive with this index should be passed to noise_* functions
- * with the corresponding noise_remote.
- * u_index_drop - deallocate the index passed to the callback.
- *
- * -------- crypto --------
- *
- * The following functions are used for the crypto side of things:
- *
- * int noise_create_initiation(noise_remote *, noise_initiation *)
- * int noise_consume_initiation(noise_local *, noise_remote **, noise_initiation *)
- * int noise_create_response(noise_remote *, noise_response *)
- * int noise_consume_response(noise_remote *, noise_response *)
- *
- * int noise_remote_begin_session(noise_remote *)
- * void noise_remote_clear(noise_remote *)
- * void noise_remote_expire_current(noise_remote *)
- * int noise_remote_encrypt(noise_remote *, noise_data *, size_t)
- * int noise_remote_decrypt(noise_remote *, noise_data *, size_t)
- *
- * $FreeBSD$
- */
-
-#ifndef __NOISE_H__
-#define __NOISE_H__
-
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/rwlock.h>
-#include <sys/support.h>
-
-#include <crypto/blake2s.h>
-#include <zinc/chacha20poly1305.h>
-#include <crypto/curve25519.h>
-
-#define NOISE_KEY_SIZE CURVE25519_KEY_SIZE
-#define NOISE_PSK_SIZE 32
-#define NOISE_MAC_SIZE CHACHA20POLY1305_AUTHTAG_SIZE
-#define NOISE_HASH_SIZE BLAKE2S_HASH_SIZE
-#define NOISE_SYMMETRIC_SIZE CHACHA20POLY1305_KEY_SIZE
-#define NOISE_TIMESTAMP_SIZE 12
-
-/* Protocol string constants */
-#define NOISE_HANDSHAKE_NAME "Noise_IKpsk2_25519_ChaChaPoly_BLAKE2s"
-#define NOISE_IDENTIFIER_NAME "WireGuard v1 zx2c4 Jason@zx2c4.com"
-
-/* Constants for the counter */
-#define COUNTER_TYPE size_t
-#define COUNTER_BITS_TOTAL 512
-#define COUNTER_TYPE_BITS (sizeof(COUNTER_TYPE) * 8)
-#define COUNTER_TYPE_NUM (COUNTER_BITS_TOTAL / COUNTER_TYPE_BITS)
-#define COUNTER_WINDOW_SIZE (COUNTER_BITS_TOTAL - COUNTER_TYPE_BITS)
-
-/* Constants for the keypair */
-#define REKEY_AFTER_MESSAGES (1ull << 60)
-#define REJECT_AFTER_MESSAGES (UINT64_MAX - COUNTER_WINDOW_SIZE - 1)
-#define REKEY_AFTER_TIME 120
-#define REKEY_AFTER_TIME_RECV 165
-#define REJECT_AFTER_TIME 180
-#define REJECT_INTERVAL (1000000000 / 50) /* fifty times per sec */
-/* 24 = floor(log2(REJECT_INTERVAL)) */
-#define REJECT_INTERVAL_MASK (~((1ull<<24)-1))
-
-enum noise_state_hs {
- HS_ZEROED = 0,
- CREATED_INITIATION,
- CONSUMED_INITIATION,
- CREATED_RESPONSE,
- CONSUMED_RESPONSE,
-};
-
-struct noise_handshake {
- enum noise_state_hs hs_state;
- uint32_t hs_local_index;
- uint32_t hs_remote_index;
- uint8_t hs_e[NOISE_KEY_SIZE];
- uint8_t hs_hash[NOISE_HASH_SIZE];
- uint8_t hs_ck[NOISE_HASH_SIZE];
-};
-
-struct noise_counter {
- struct rwlock c_lock;
- uint64_t c_send;
- uint64_t c_recv;
- COUNTER_TYPE c_backtrack[COUNTER_TYPE_NUM];
-};
-
-enum noise_state_kp {
- KP_ZEROED = 0,
- INITIATOR,
- RESPONDER,
-};
-
-struct noise_keypair {
- SLIST_ENTRY(noise_keypair) kp_entry;
- int kp_valid;
- int kp_is_initiator;
- uint32_t kp_local_index;
- uint32_t kp_remote_index;
- uint8_t kp_send[NOISE_SYMMETRIC_SIZE];
- uint8_t kp_recv[NOISE_SYMMETRIC_SIZE];
- struct timespec kp_birthdate; /* nanouptime */
- struct noise_counter kp_ctr;
-};
-
-struct noise_remote {
- uint8_t r_public[NOISE_KEY_SIZE];
- struct noise_local *r_local;
- uint8_t r_ss[NOISE_KEY_SIZE];
-
- struct rwlock r_handshake_lock;
- struct noise_handshake r_handshake;
- uint8_t r_psk[NOISE_PSK_SIZE];
- uint8_t r_timestamp[NOISE_TIMESTAMP_SIZE];
- struct timespec r_last_init; /* nanouptime */
-
- struct rwlock r_keypair_lock;
- SLIST_HEAD(,noise_keypair) r_unused_keypairs;
- struct noise_keypair *r_next, *r_current, *r_previous;
- struct noise_keypair r_keypair[3]; /* 3: next, current, previous. */
-
-};
-
-struct noise_local {
- struct rwlock l_identity_lock;
- int l_has_identity;
- uint8_t l_public[NOISE_KEY_SIZE];
- uint8_t l_private[NOISE_KEY_SIZE];
-
- struct noise_upcall {
- void *u_arg;
- struct noise_remote *
- (*u_remote_get)(void *, uint8_t[NOISE_KEY_SIZE]);
- uint32_t
- (*u_index_set)(void *, struct noise_remote *);
- void (*u_index_drop)(void *, uint32_t);
- } l_upcall;
-};
-
-struct noise_initiation {
- uint32_t s_idx;
- uint8_t ue[NOISE_KEY_SIZE];
- uint8_t es[NOISE_KEY_SIZE + NOISE_MAC_SIZE];
- uint8_t ets[NOISE_TIMESTAMP_SIZE + NOISE_MAC_SIZE];
-} __packed;
-
-struct noise_response {
- uint32_t s_idx;
- uint32_t r_idx;
- uint8_t ue[NOISE_KEY_SIZE];
- uint8_t en[0 + NOISE_MAC_SIZE];
-} __packed;
-
-struct noise_data {
- uint32_t r_idx;
- uint64_t nonce;
- uint8_t buf[];
-} __packed;
-
-
-/* Set/Get noise parameters */
-void noise_local_init(struct noise_local *, struct noise_upcall *);
-void noise_local_lock_identity(struct noise_local *);
-void noise_local_unlock_identity(struct noise_local *);
-int noise_local_set_private(struct noise_local *, uint8_t[NOISE_KEY_SIZE]);
-int noise_local_keys(struct noise_local *, uint8_t[NOISE_KEY_SIZE],
- uint8_t[NOISE_KEY_SIZE]);
-
-void noise_remote_init(struct noise_remote *, const uint8_t[NOISE_KEY_SIZE],
- struct noise_local *);
-int noise_remote_set_psk(struct noise_remote *, const uint8_t[NOISE_PSK_SIZE]);
-int noise_remote_keys(struct noise_remote *, uint8_t[NOISE_KEY_SIZE],
- uint8_t[NOISE_PSK_SIZE]);
-
-/* Should be called anytime noise_local_set_private is called */
-void noise_remote_precompute(struct noise_remote *);
-
-/* Cryptographic functions */
-int noise_create_initiation(
- struct noise_remote *,
- struct noise_initiation *);
-
-int noise_consume_initiation(
- struct noise_local *,
- struct noise_remote **,
- struct noise_initiation *);
-
-int noise_create_response(
- struct noise_remote *,
- struct noise_response *);
-
-int noise_consume_response(
- struct noise_remote *,
- struct noise_response *);
-
- int noise_remote_begin_session(struct noise_remote *);
-void noise_remote_clear(struct noise_remote *);
-void noise_remote_expire_current(struct noise_remote *);
-
-int noise_remote_ready(struct noise_remote *);
-
-int noise_remote_encrypt(
- struct noise_remote *,
- struct noise_data *,
- size_t);
-int noise_remote_decrypt(
- struct noise_remote *,
- struct noise_data *,
- size_t);
-
-#endif /* __NOISE_H__ */
diff --git a/sys/dev/if_wg/include/zinc/blake2s.h b/sys/dev/if_wg/include/zinc/blake2s.h
deleted file mode 100644
index e87bfdbc9f6d..000000000000
--- a/sys/dev/if_wg/include/zinc/blake2s.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#ifndef _ZINC_BLAKE2S_H
-#define _ZINC_BLAKE2S_H
-
-#include <sys/types.h>
-
-enum blake2s_lengths {
- BLAKE2S_BLOCK_SIZE = 64,
- BLAKE2S_HASH_SIZE = 32,
- BLAKE2S_KEY_SIZE = 32
-};
-
-struct blake2s_state {
- uint32_t h[8];
- uint32_t t[2];
- uint32_t f[2];
- uint8_t buf[BLAKE2S_BLOCK_SIZE];
- unsigned int buflen;
- unsigned int outlen;
-};
-
-void blake2s_init(struct blake2s_state *state, const size_t outlen);
-void blake2s_init_key(struct blake2s_state *state, const size_t outlen,
- const void *key, const size_t keylen);
-void blake2s_update(struct blake2s_state *state, const uint8_t *in, size_t inlen);
-//void blake2s_final(struct blake2s_state *state, uint8_t *out);
-
-static inline void blake2s(uint8_t *out, const uint8_t *in, const uint8_t *key,
- const size_t outlen, const size_t inlen,
- const size_t keylen)
-{
- struct blake2s_state state;
-
- if (keylen)
- blake2s_init_key(&state, outlen, key, keylen);
- else
- blake2s_init(&state, outlen);
-
- blake2s_update(&state, in, inlen);
- blake2s_final(&state, out);
-}
-
-void blake2s_hmac(uint8_t *out, const uint8_t *in, const uint8_t *key, const size_t outlen,
- const size_t inlen, const size_t keylen);
-
-#endif /* _ZINC_BLAKE2S_H */
diff --git a/sys/dev/if_wg/include/zinc/chacha20.h b/sys/dev/if_wg/include/zinc/chacha20.h
deleted file mode 100644
index 1a9524bdfe85..000000000000
--- a/sys/dev/if_wg/include/zinc/chacha20.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#ifndef _ZINC_CHACHA20_H
-#define _ZINC_CHACHA20_H
-
-#include <sys/param.h>
-#include <sys/support.h>
-
-enum chacha20_lengths {
- CHACHA20_NONCE_SIZE = 16,
- CHACHA20_KEY_SIZE = 32,
- CHACHA20_KEY_WORDS = CHACHA20_KEY_SIZE / sizeof(u32),
- CHACHA20_BLOCK_SIZE = 64,
- CHACHA20_BLOCK_WORDS = CHACHA20_BLOCK_SIZE / sizeof(u32),
- HCHACHA20_NONCE_SIZE = CHACHA20_NONCE_SIZE,
- HCHACHA20_KEY_SIZE = CHACHA20_KEY_SIZE
-};
-
-enum chacha20_constants { /* expand 32-byte k */
- CHACHA20_CONSTANT_EXPA = 0x61707865U,
- CHACHA20_CONSTANT_ND_3 = 0x3320646eU,
- CHACHA20_CONSTANT_2_BY = 0x79622d32U,
- CHACHA20_CONSTANT_TE_K = 0x6b206574U
-};
-
-struct chacha20_ctx {
- union {
- u32 state[16];
- struct {
- u32 constant[4];
- u32 key[8];
- u32 counter[4];
- };
- };
-};
-
-static inline void chacha20_init(struct chacha20_ctx *ctx,
- const u8 key[CHACHA20_KEY_SIZE],
- const u64 nonce)
-{
- ctx->constant[0] = CHACHA20_CONSTANT_EXPA;
- ctx->constant[1] = CHACHA20_CONSTANT_ND_3;
- ctx->constant[2] = CHACHA20_CONSTANT_2_BY;
- ctx->constant[3] = CHACHA20_CONSTANT_TE_K;
- ctx->key[0] = get_unaligned_le32(key + 0);
- ctx->key[1] = get_unaligned_le32(key + 4);
- ctx->key[2] = get_unaligned_le32(key + 8);
- ctx->key[3] = get_unaligned_le32(key + 12);
- ctx->key[4] = get_unaligned_le32(key + 16);
- ctx->key[5] = get_unaligned_le32(key + 20);
- ctx->key[6] = get_unaligned_le32(key + 24);
- ctx->key[7] = get_unaligned_le32(key + 28);
- ctx->counter[0] = 0;
- ctx->counter[1] = 0;
- ctx->counter[2] = nonce & U32_MAX;
- ctx->counter[3] = nonce >> 32;
-}
-void chacha20(struct chacha20_ctx *ctx, u8 *dst, const u8 *src, u32 len,
- simd_context_t *simd_context);
-
-void hchacha20(u32 derived_key[CHACHA20_KEY_WORDS],
- const u8 nonce[HCHACHA20_NONCE_SIZE],
- const u8 key[HCHACHA20_KEY_SIZE], simd_context_t *simd_context);
-
-#endif /* _ZINC_CHACHA20_H */
diff --git a/sys/dev/if_wg/include/zinc/chacha20poly1305.h b/sys/dev/if_wg/include/zinc/chacha20poly1305.h
deleted file mode 100644
index 2d18b0fc3e82..000000000000
--- a/sys/dev/if_wg/include/zinc/chacha20poly1305.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#ifndef _ZINC_CHACHA20POLY1305_H
-#define _ZINC_CHACHA20POLY1305_H
-
-#include <sys/types.h>
-
-struct scatterlist;
-
-enum chacha20poly1305_lengths {
- XCHACHA20POLY1305_NONCE_SIZE = 24,
- CHACHA20POLY1305_KEY_SIZE = 32,
- CHACHA20POLY1305_AUTHTAG_SIZE = 16
-};
-
-void chacha20poly1305_encrypt(uint8_t *dst, const uint8_t *src, const size_t src_len,
- const uint8_t *ad, const size_t ad_len,
- const uint64_t nonce,
- const uint8_t key[CHACHA20POLY1305_KEY_SIZE]);
-
-bool chacha20poly1305_encrypt_sg_inplace(
- struct scatterlist *src, const size_t src_len, const uint8_t *ad,
- const size_t ad_len, const uint64_t nonce,
- const uint8_t key[CHACHA20POLY1305_KEY_SIZE], simd_context_t *simd_context);
-
-bool chacha20poly1305_decrypt(uint8_t *dst, const uint8_t *src, const size_t src_len,
- const uint8_t *ad, const size_t ad_len, const uint64_t nonce,
- const uint8_t key[CHACHA20POLY1305_KEY_SIZE]);
-
-bool chacha20poly1305_decrypt_sg_inplace(
- struct scatterlist *src, size_t src_len, const uint8_t *ad,
- const size_t ad_len, const uint64_t nonce,
- const uint8_t key[CHACHA20POLY1305_KEY_SIZE], simd_context_t *simd_context);
-
-void xchacha20poly1305_encrypt(uint8_t *dst, const uint8_t *src, const size_t src_len,
- const uint8_t *ad, const size_t ad_len,
- const uint8_t nonce[XCHACHA20POLY1305_NONCE_SIZE],
- const uint8_t key[CHACHA20POLY1305_KEY_SIZE]);
-
-bool xchacha20poly1305_decrypt(
- uint8_t *dst, const uint8_t *src, const size_t src_len, const uint8_t *ad,
- const size_t ad_len, const uint8_t nonce[XCHACHA20POLY1305_NONCE_SIZE],
- const uint8_t key[CHACHA20POLY1305_KEY_SIZE]);
-
-#endif /* _ZINC_CHACHA20POLY1305_H */
diff --git a/sys/dev/if_wg/include/zinc/curve25519.h b/sys/dev/if_wg/include/zinc/curve25519.h
deleted file mode 100644
index aa32359462da..000000000000
--- a/sys/dev/if_wg/include/zinc/curve25519.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#ifndef _ZINC_CURVE25519_H
-#define _ZINC_CURVE25519_H
-
-#include <sys/types.h>
-
-enum curve25519_lengths {
- CURVE25519_KEY_SIZE = 32
-};
-
-bool curve25519(uint8_t mypublic[CURVE25519_KEY_SIZE],
- const uint8_t secret[CURVE25519_KEY_SIZE],
- const uint8_t basepoint[CURVE25519_KEY_SIZE]);
-void curve25519_generate_secret(uint8_t secret[CURVE25519_KEY_SIZE]);
-bool curve25519_generate_public(
- uint8_t pub[CURVE25519_KEY_SIZE], const uint8_t secret[CURVE25519_KEY_SIZE]);
-
-static inline void curve25519_clamp_secret(uint8_t secret[CURVE25519_KEY_SIZE])
-{
- secret[0] &= 248;
- secret[31] = (secret[31] & 127) | 64;
-}
-
-#endif /* _ZINC_CURVE25519_H */
diff --git a/sys/dev/if_wg/include/zinc/poly1305.h b/sys/dev/if_wg/include/zinc/poly1305.h
deleted file mode 100644
index ca4cc60b41b3..000000000000
--- a/sys/dev/if_wg/include/zinc/poly1305.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#ifndef _ZINC_POLY1305_H
-#define _ZINC_POLY1305_H
-
-
-enum poly1305_lengths {
- POLY1305_BLOCK_SIZE = 16,
- POLY1305_KEY_SIZE = 32,
- POLY1305_MAC_SIZE = 16
-};
-
-struct poly1305_ctx {
- u8 opaque[24 * sizeof(u64)];
- u32 nonce[4];
- u8 data[POLY1305_BLOCK_SIZE];
- size_t num;
-} __aligned(8);
-
-void poly1305_init(struct poly1305_ctx *ctx, const u8 key[POLY1305_KEY_SIZE]);
-void poly1305_update(struct poly1305_ctx *ctx, const u8 *input, size_t len,
- simd_context_t *simd_context);
-void poly1305_final(struct poly1305_ctx *ctx, u8 mac[POLY1305_MAC_SIZE],
- simd_context_t *simd_context);
-
-#endif /* _ZINC_POLY1305_H */
diff --git a/sys/dev/if_wg/module/blake2s.c b/sys/dev/if_wg/module/blake2s.c
deleted file mode 100644
index a362a6b350f1..000000000000
--- a/sys/dev/if_wg/module/blake2s.c
+++ /dev/null
@@ -1,256 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2012 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * This is an implementation of the BLAKE2s hash and PRF functions.
- *
- * Information: https://blake2.net/
- *
- */
-
-#include <sys/types.h>
-#include <sys/systm.h>
-#include <sys/endian.h>
-
-#include <crypto/blake2s.h>
-
-static inline uint32_t
-ror32(uint32_t word, unsigned int shift)
-{
- return (word >> shift) | (word << (32 - shift));
-}
-
-typedef union {
- struct {
- uint8_t digest_length;
- uint8_t key_length;
- uint8_t fanout;
- uint8_t depth;
- uint32_t leaf_length;
- uint32_t node_offset;
- uint16_t xof_length;
- uint8_t node_depth;
- uint8_t inner_length;
- uint8_t salt[8];
- uint8_t personal[8];
- };
- uint32_t words[8];
-} __packed blake2s_param;
-
-static const uint32_t blake2s_iv[8] = {
- 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
- 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
-};
-
-static const uint8_t blake2s_sigma[10][16] = {
- { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
- { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
- { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
- { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
- { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
- { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
- { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
- { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
- { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
- { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
-};
-
-static inline void blake2s_set_lastblock(struct blake2s_state *state)
-{
- if (state->last_node)
- state->f[1] = -1;
- state->f[0] = -1;
-}
-
-static inline void blake2s_increment_counter(struct blake2s_state *state,
- const uint32_t inc)
-{
- state->t[0] += inc;
- state->t[1] += (state->t[0] < inc);
-}
-
-static inline void blake2s_init_param(struct blake2s_state *state,
- const blake2s_param *param)
-{
- int i;
-
- memset(state, 0, sizeof(*state));
- for (i = 0; i < 8; ++i)
- state->h[i] = blake2s_iv[i] ^ le32toh(param->words[i]);
-}
-
-void blake2s_init(struct blake2s_state *state, const size_t outlen)
-{
- blake2s_param param __aligned(__alignof__(uint32_t)) = {
- .digest_length = outlen,
- .fanout = 1,
- .depth = 1
- };
-
- /*WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2S_HASH_SIZE));*/
- blake2s_init_param(state, &param);
-}
-
-void blake2s_init_key(struct blake2s_state *state, const size_t outlen,
- const void *key, const size_t keylen)
-{
- blake2s_param param = { .digest_length = outlen,
- .key_length = keylen,
- .fanout = 1,
- .depth = 1 };
- uint8_t block[BLAKE2S_BLOCK_SIZE] = { 0 };
-
- /*WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2S_HASH_SIZE ||
- !key || !keylen || keylen > BLAKE2S_KEY_SIZE));*/
- blake2s_init_param(state, &param);
- memcpy(block, key, keylen);
- blake2s_update(state, block, BLAKE2S_BLOCK_SIZE);
- explicit_bzero(block, BLAKE2S_BLOCK_SIZE);
-}
-
-static inline void blake2s_compress(struct blake2s_state *state,
- const uint8_t *block, size_t nblocks,
- const uint32_t inc)
-{
- uint32_t m[16];
- uint32_t v[16];
- int i;
-
- /*WARN_ON(IS_ENABLED(DEBUG) &&
- (nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE));*/
-
- while (nblocks > 0) {
- blake2s_increment_counter(state, inc);
- memcpy(m, block, BLAKE2S_BLOCK_SIZE);
- for(i = 0; i < (sizeof(m)/sizeof(m[0])); i++)
- (m[i]) = le32toh((m[i]));
- memcpy(v, state->h, 32);
- v[ 8] = blake2s_iv[0];
- v[ 9] = blake2s_iv[1];
- v[10] = blake2s_iv[2];
- v[11] = blake2s_iv[3];
- v[12] = blake2s_iv[4] ^ state->t[0];
- v[13] = blake2s_iv[5] ^ state->t[1];
- v[14] = blake2s_iv[6] ^ state->f[0];
- v[15] = blake2s_iv[7] ^ state->f[1];
-
-#define G(r, i, a, b, c, d) do { \
- a += b + m[blake2s_sigma[r][2 * i + 0]]; \
- d = ror32(d ^ a, 16); \
- c += d; \
- b = ror32(b ^ c, 12); \
- a += b + m[blake2s_sigma[r][2 * i + 1]]; \
- d = ror32(d ^ a, 8); \
- c += d; \
- b = ror32(b ^ c, 7); \
-} while (0)
-
-#define ROUND(r) do { \
- G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
- G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
- G(r, 2, v[2], v[ 6], v[10], v[14]); \
- G(r, 3, v[3], v[ 7], v[11], v[15]); \
- G(r, 4, v[0], v[ 5], v[10], v[15]); \
- G(r, 5, v[1], v[ 6], v[11], v[12]); \
- G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
- G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
-} while (0)
- ROUND(0);
- ROUND(1);
- ROUND(2);
- ROUND(3);
- ROUND(4);
- ROUND(5);
- ROUND(6);
- ROUND(7);
- ROUND(8);
- ROUND(9);
-
-#undef G
-#undef ROUND
-
- for (i = 0; i < 8; ++i)
- state->h[i] ^= v[i] ^ v[i + 8];
-
- block += BLAKE2S_BLOCK_SIZE;
- --nblocks;
- }
-}
-
-void blake2s_update(struct blake2s_state *state, const uint8_t *in, size_t inlen)
-{
- const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
-
- if (!inlen)
- return;
- if (inlen > fill) {
- memcpy(state->buf + state->buflen, in, fill);
- blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
- state->buflen = 0;
- in += fill;
- inlen -= fill;
- }
- if (inlen > BLAKE2S_BLOCK_SIZE) {
- const size_t nblocks =
- (inlen + BLAKE2S_BLOCK_SIZE - 1) / BLAKE2S_BLOCK_SIZE;
- /* Hash one less (full) block than strictly possible */
- blake2s_compress(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
- in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
- inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
- }
- memcpy(state->buf + state->buflen, in, inlen);
- state->buflen += inlen;
-}
-
-void blake2s_final(struct blake2s_state *state, uint8_t *out, const size_t outlen)
-{
- int i;
- /*WARN_ON(IS_ENABLED(DEBUG) &&
- (!out || !outlen || outlen > BLAKE2S_HASH_SIZE));*/
- blake2s_set_lastblock(state);
- memset(state->buf + state->buflen, 0,
- BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
- blake2s_compress(state, state->buf, 1, state->buflen);
- for(i = 0; i < (sizeof(state->h)/sizeof(state->h[0])); i++)
- (state->h[i]) = htole32((state->h[i]));
-
- memcpy(out, state->h, outlen);
- explicit_bzero(state, sizeof(*state));
-}
-
-void blake2s_hmac(uint8_t *out, const uint8_t *in, const uint8_t *key, const size_t outlen,
- const size_t inlen, const size_t keylen)
-{
- struct blake2s_state state;
- uint8_t x_key[BLAKE2S_BLOCK_SIZE] __aligned(__alignof__(uint32_t)) = { 0 };
- uint8_t i_hash[BLAKE2S_HASH_SIZE] __aligned(__alignof__(uint32_t));
- int i;
-
- if (keylen > BLAKE2S_BLOCK_SIZE) {
- blake2s_init(&state, BLAKE2S_HASH_SIZE);
- blake2s_update(&state, key, keylen);
- blake2s_final(&state, x_key, BLAKE2S_HASH_SIZE);
- } else
- memcpy(x_key, key, keylen);
-
- for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
- x_key[i] ^= 0x36;
-
- blake2s_init(&state, BLAKE2S_HASH_SIZE);
- blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
- blake2s_update(&state, in, inlen);
- blake2s_final(&state, i_hash, BLAKE2S_HASH_SIZE);
-
- for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
- x_key[i] ^= 0x5c ^ 0x36;
-
- blake2s_init(&state, BLAKE2S_HASH_SIZE);
- blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
- blake2s_update(&state, i_hash, BLAKE2S_HASH_SIZE);
- blake2s_final(&state, i_hash, BLAKE2S_HASH_SIZE);
-
- memcpy(out, i_hash, outlen);
- explicit_bzero(x_key, BLAKE2S_BLOCK_SIZE);
- explicit_bzero(i_hash, BLAKE2S_HASH_SIZE);
-}
diff --git a/sys/dev/if_wg/module/blake2s.h b/sys/dev/if_wg/module/blake2s.h
deleted file mode 100644
index 865de953fb25..000000000000
--- a/sys/dev/if_wg/module/blake2s.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <sys/types.h>
-
-#ifndef _BLAKE2S_H_
-#define _BLAKE2S_H_
-
-/*#define WARN_ON(a) if(a) printf("%s failed at %s:%d\n", #a, __FILE__, __LINE__)
-#define IS_ENABLED(...) true*/
-
-
-enum blake2s_lengths {
- BLAKE2S_BLOCK_SIZE = 64,
- BLAKE2S_HASH_SIZE = 32,
- BLAKE2S_KEY_SIZE = 32
-};
-
-struct blake2s_state {
- uint32_t h[8];
- uint32_t t[2];
- uint32_t f[2];
- uint8_t buf[BLAKE2S_BLOCK_SIZE];
- size_t buflen;
- uint8_t last_node;
-};
-
-void blake2s_init(struct blake2s_state *state, const size_t outlen);
-void blake2s_init_key(struct blake2s_state *state, const size_t outlen,
- const void *key, const size_t keylen);
-void blake2s_update(struct blake2s_state *state, const uint8_t *in, size_t inlen);
-void blake2s_final(struct blake2s_state *state, uint8_t *out, const size_t outlen);
-
-static inline void blake2s(uint8_t *out, const uint8_t *in, const uint8_t *key,
- const size_t outlen, const size_t inlen,
- const size_t keylen)
-{
- struct blake2s_state state;
-
- /*WARN_ON(IS_ENABLED(DEBUG) && ((!in && inlen > 0) || !out || !outlen ||
- outlen > BLAKE2S_HASH_SIZE || keylen > BLAKE2S_KEY_SIZE ||
- (!key && keylen)));*/
-
- if (keylen)
- blake2s_init_key(&state, outlen, key, keylen);
- else
- blake2s_init(&state, outlen);
-
- blake2s_update(&state, in, inlen);
- blake2s_final(&state, out, outlen);
-}
-
-void blake2s_hmac(uint8_t *out, const uint8_t *in, const uint8_t *key,
- const size_t outlen, const size_t inlen, const size_t keylen);
-
-#endif /* _BLAKE2S_H_ */
diff --git a/sys/dev/if_wg/module/chacha20-x86_64.S b/sys/dev/if_wg/module/chacha20-x86_64.S
deleted file mode 100644
index 0edb79483758..000000000000
--- a/sys/dev/if_wg/module/chacha20-x86_64.S
+++ /dev/null
@@ -1,2834 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-//
-// Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
-// Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
-// Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
-//
-// This code is taken from the OpenSSL project but the author, Andy Polyakov,
-// has relicensed it under the licenses specified in the SPDX header above.
-// The original headers, including the original license headers, are
-// included below for completeness.
-//
-// ====================================================================
-// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-// project. The module is, however, dual licensed under OpenSSL and
-// CRYPTOGAMS licenses depending on where you obtain it. For further
-// details see http://www.openssl.org/~appro/cryptogams/.
-// ====================================================================
-//
-// November 2014
-//
-// ChaCha20 for x86_64.
-//
-// December 2016
-//
-// Add AVX512F code path.
-//
-// December 2017
-//
-// Add AVX512VL code path.
-//
-// Performance in cycles per byte out of large buffer.
-//
-// IALU/gcc 4.8(i) 1x/2xSSSE3(ii) 4xSSSE3 NxAVX(v)
-//
-// P4 9.48/+99% - -
-// Core2 7.83/+55% 7.90/5.76 4.35
-// Westmere 7.19/+50% 5.60/4.50 3.00
-// Sandy Bridge 8.31/+42% 5.45/4.00 2.72
-// Ivy Bridge 6.71/+46% 5.40/? 2.41
-// Haswell 5.92/+43% 5.20/3.45 2.42 1.23
-// Skylake[-X] 5.87/+39% 4.70/3.22 2.31 1.19[0.80(vi)]
-// Silvermont 12.0/+33% 7.75/6.90 7.03(iii)
-// Knights L 11.7/- ? 9.60(iii) 0.80
-// Goldmont 10.6/+17% 5.10/3.52 3.28
-// Sledgehammer 7.28/+52% - -
-// Bulldozer 9.66/+28% 9.85/5.35(iv) 3.06(iv)
-// Ryzen 5.96/+50% 5.19/3.00 2.40 2.09
-// VIA Nano 10.5/+46% 6.72/6.88 6.05
-//
-// (i) compared to older gcc 3.x one can observe >2x improvement on
-// most platforms;
-// (ii) 2xSSSE3 is code path optimized specifically for 128 bytes used
-// by chacha20_poly1305_tls_cipher, results are EVP-free;
-// (iii) this is not optimal result for Atom because of MSROM
-// limitations, SSE2 can do better, but gain is considered too
-// low to justify the [maintenance] effort;
-// (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20
-// and 4.85 for 128-byte inputs;
-// (v) 8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable;
-// (vi) even though Skylake-X can execute AVX512F code and deliver 0.57
-// cpb in single thread, the corresponding capability is suppressed;
-
-//#include <linux/linkage.h>
-.section .rodata.cst16.Lzero, "aM", @progbits, 16
-.align 16
-.Lzero: # four zero dwords
-.long 0,0,0,0
-.section .rodata.cst16.Lone, "aM", @progbits, 16
-.align 16
-.Lone: # +1 on the first dword of the last state row (used by chacha20_ssse3 and chacha20_128)
-.long 1,0,0,0
-.section .rodata.cst16.Linc, "aM", @progbits, 16
-.align 16
-.Linc: # per-lane offsets 0..3 added to the broadcast first dword of the last row in chacha20_4x
-.long 0,1,2,3
-.section .rodata.cst16.Lfour, "aM", @progbits, 16
-.align 16
-.Lfour: # added to the four lane counters on every outer iteration of chacha20_4x
-.long 4,4,4,4
-.section .rodata.cst32.Lincy, "aM", @progbits, 32
-.align 32
-.Lincy: # per-lane offsets added to the broadcast counter in chacha20_avx2
-.long 0,2,4,6,1,3,5,7
-.section .rodata.cst32.Leight, "aM", @progbits, 32
-.align 32
-.Leight: # added to the eight lane counters on every outer iteration of chacha20_avx2
-.long 8,8,8,8,8,8,8,8
-.section .rodata.cst16.Lrot16, "aM", @progbits, 16
-.align 16
-.Lrot16: # pshufb mask: swaps the 16-bit halves of each dword, i.e. rotates it by 16 bits
-.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
-.section .rodata.cst16.Lrot24, "aM", @progbits, 16
-.align 16
-.Lrot24: # pshufb mask: rotates each dword left by 8 bits (equivalently right by 24)
-.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
-.section .rodata.cst32.Ltwoy, "aM", @progbits, 32
-.align 32
-.Ltwoy: # (2,0,0,0) in each 128-bit half
-.long 2,0,0,0, 2,0,0,0
-.section .rodata.cst64.Lzeroz, "aM", @progbits, 64
-.align 64
-.Lzeroz: # (0,0,0,0), (1,0,0,0), (2,0,0,0), (3,0,0,0) in the four 128-bit quarters
-.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
-.section .rodata.cst64.Lfourz, "aM", @progbits, 64
-.align 64
-.Lfourz: # (4,0,0,0) in each 128-bit quarter
-.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
-.section .rodata.cst64.Lincz, "aM", @progbits, 64
-.align 64
-.Lincz: # per-lane offsets 0..15 for a 16-lane vector
-.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
-.section .rodata.cst64.Lsixteen, "aM", @progbits, 64
-.align 64
-.Lsixteen: # 16 in every one of 16 lanes
-.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
-.section .rodata.cst16.Lsigma, "aM", @progbits, 16
-.align 16
-.Lsigma: # 16-byte constant loaded as the first state row by every routine in this file section
-.ascii "expand 32-byte k"
-.text
-#ifdef CONFIG_AS_SSSE3
-.align 32
-SYM_FUNC_START(hchacha20_ssse3) # runs 10 double rounds on a 4x4 dword state and stores rows 0 and 3 (32 bytes) at (%rdi)
-.Lhchacha20_ssse3:
- movdqa .Lsigma(%rip),%xmm0 # row 0 (a): constant
- movdqu (%rdx),%xmm1 # row 1 (b): first 16 bytes at %rdx (presumably the key -- confirm against the C glue)
- movdqu 16(%rdx),%xmm2 # row 2 (c): next 16 bytes at %rdx
- movdqu (%rsi),%xmm3 # row 3 (d): 16 bytes at %rsi (presumably the nonce -- confirm against the C glue)
- # This code is only used when targeting kernel.
- # If targeting win64, xmm{6,7} preserving needs to be added.
- movdqa .Lrot16(%rip),%xmm6 # pshufb mask for the 16-bit rotate
- movdqa .Lrot24(%rip),%xmm7 # pshufb mask for the 8-bit left rotate
- mov $10,%r8 # reuse %r8
- jmp 1f
-.align 32
-1:
- paddd %xmm1,%xmm0 # first half: a += b
- pxor %xmm0,%xmm3 # d ^= a
- pshufb %xmm6,%xmm3 # d <<<= 16
- paddd %xmm3,%xmm2 # c += d
- pxor %xmm2,%xmm1 # b ^= c
- movdqa %xmm1,%xmm4
- psrld $20,%xmm1
- pslld $12,%xmm4
- por %xmm4,%xmm1 # b <<<= 12 (shift pair + or)
- paddd %xmm1,%xmm0 # a += b
- pxor %xmm0,%xmm3 # d ^= a
- pshufb %xmm7,%xmm3 # d <<<= 8
- paddd %xmm3,%xmm2 # c += d
- pxor %xmm2,%xmm1 # b ^= c
- movdqa %xmm1,%xmm4
- psrld $25,%xmm1
- pslld $7,%xmm4
- por %xmm4,%xmm1 # b <<<= 7
- pshufd $147,%xmm0,%xmm0 # rotate the lanes of rows a, d, c (b stays put) so the
- pshufd $78,%xmm3,%xmm3 # same column-wise code below mixes a different set of
- pshufd $57,%xmm2,%xmm2 # four words (presumably the diagonals of the state)
- nop
- paddd %xmm1,%xmm0 # second half: same sequence as above on the re-aligned rows
- pxor %xmm0,%xmm3
- pshufb %xmm6,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $20,%xmm1
- pslld $12,%xmm4
- por %xmm4,%xmm1
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm7,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $25,%xmm1
- pslld $7,%xmm4
- por %xmm4,%xmm1
- pshufd $57,%xmm0,%xmm0 # inverse lane rotations: rows return to
- pshufd $78,%xmm3,%xmm3 # their original alignment
- pshufd $147,%xmm2,%xmm2
- dec %r8
- jnz 1b # 10 iterations in total
- movdqu %xmm0, (%rdi) # output bytes 0-15: row 0; the initial state is not added back here
- movdqu %xmm3, 16(%rdi) # output bytes 16-31: row 3
- ret
-SYM_FUNC_END(hchacha20_ssse3)
-.align 32
-SYM_FUNC_START(chacha20_ssse3) # XORs %rdx bytes from (%rsi) with keystream into (%rdi); 32 bytes at %rcx and 16 bytes at %r8 seed the state
-.Lchacha20_ssse3:
- lea 8(%rsp),%r10 # frame pointer
- cmp $128,%rdx # we might throw away some data,
- je .Lchacha20_128 # exactly 128 bytes: dedicated two-block routine
- ja .Lchacha20_4x # but overall it won't be slower
-
-.Ldo_ssse3_after_all: # fewer than 128 bytes: one 64-byte block per outer iteration
- sub $64+8,%rsp
- and $-16,%rsp # 16-byte alignment required by the movdqa stores below
- movdqa .Lsigma(%rip),%xmm0 # row 0: constant
- movdqu (%rcx),%xmm1 # row 1: bytes 0-15 at %rcx
- movdqu 16(%rcx),%xmm2 # row 2: bytes 16-31 at %rcx
- movdqu (%r8),%xmm3 # row 3: 16 bytes at %r8 (first dword is bumped per block, see .Loop_outer_ssse3)
- movdqa .Lrot16(%rip),%xmm6
- movdqa .Lrot24(%rip),%xmm7
-
- movdqa %xmm0,0x00(%rsp) # keep the input state on the stack for the final add
- movdqa %xmm1,0x10(%rsp)
- movdqa %xmm2,0x20(%rsp)
- movdqa %xmm3,0x30(%rsp)
- mov $10,%r8 # reuse %r8
- jmp .Loop_ssse3
-
-.align 32
-.Loop_outer_ssse3: # set up the next block
- movdqa .Lone(%rip),%xmm3
- movdqa 0x00(%rsp),%xmm0
- movdqa 0x10(%rsp),%xmm1
- movdqa 0x20(%rsp),%xmm2
- paddd 0x30(%rsp),%xmm3 # row 3 with its first dword incremented by one
- mov $10,%r8
- movdqa %xmm3,0x30(%rsp) # saved state now carries the incremented value
- jmp .Loop_ssse3
-
-.align 32
-.Loop_ssse3: # one iteration = two round halves, same sequence as in hchacha20_ssse3
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm6,%xmm3 # rotate by 16
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $20,%xmm1
- pslld $12,%xmm4
- por %xmm4,%xmm1 # rotate left by 12
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm7,%xmm3 # rotate left by 8
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $25,%xmm1
- pslld $7,%xmm4
- por %xmm4,%xmm1 # rotate left by 7
- pshufd $147,%xmm0,%xmm0 # lane rotations of rows 0, 3, 2 before the second half
- pshufd $78,%xmm3,%xmm3
- pshufd $57,%xmm2,%xmm2
- nop
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm6,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $20,%xmm1
- pslld $12,%xmm4
- por %xmm4,%xmm1
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm7,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $25,%xmm1
- pslld $7,%xmm4
- por %xmm4,%xmm1
- pshufd $57,%xmm0,%xmm0 # inverse lane rotations
- pshufd $78,%xmm3,%xmm3
- pshufd $147,%xmm2,%xmm2
- dec %r8
- jnz .Loop_ssse3
- paddd 0x00(%rsp),%xmm0 # add the saved input state: xmm0-xmm3 now hold 64 keystream bytes
- paddd 0x10(%rsp),%xmm1
- paddd 0x20(%rsp),%xmm2
- paddd 0x30(%rsp),%xmm3
-
- cmp $64,%rdx
- jb .Ltail_ssse3 # NOTE(review): len == 0 would also land here and the tail loop would wrap %rdx; presumably callers never pass 0
-
- movdqu 0x00(%rsi),%xmm4
- movdqu 0x10(%rsi),%xmm5
- pxor %xmm4,%xmm0 # xor with input
- movdqu 0x20(%rsi),%xmm4
- pxor %xmm5,%xmm1
- movdqu 0x30(%rsi),%xmm5
- lea 0x40(%rsi),%rsi # inp+=64
- pxor %xmm4,%xmm2
- pxor %xmm5,%xmm3
-
- movdqu %xmm0,0x00(%rdi) # write output
- movdqu %xmm1,0x10(%rdi)
- movdqu %xmm2,0x20(%rdi)
- movdqu %xmm3,0x30(%rdi)
- lea 0x40(%rdi),%rdi # out+=64
-
- sub $64,%rdx
- jnz .Loop_outer_ssse3 # more input left: generate another block
-
- jmp .Ldone_ssse3
-
-.align 16
-.Ltail_ssse3: # fewer than 64 bytes left: spill the keystream and xor byte by byte
- movdqa %xmm0,0x00(%rsp) # overwrites the saved state; it is not needed again
- movdqa %xmm1,0x10(%rsp)
- movdqa %xmm2,0x20(%rsp)
- movdqa %xmm3,0x30(%rsp)
- xor %r8,%r8 # byte index
-
-.Loop_tail_ssse3:
- movzb (%rsi,%r8),%eax # input byte
- movzb (%rsp,%r8),%ecx # keystream byte
- lea 1(%r8),%r8
- xor %ecx,%eax
- mov %al,-1(%rdi,%r8) # -1 because the index was already advanced
- dec %rdx
- jnz .Loop_tail_ssse3
-
-.Ldone_ssse3:
- lea -8(%r10),%rsp # restore the stack pointer captured at entry
-.Lssse3_epilogue:
- ret
-SYM_FUNC_END(chacha20_ssse3)
-.type chacha20_128,@function
-.align 32
-chacha20_128: # two interleaved 64-byte blocks; processes exactly 128 bytes from (%rsi) into (%rdi)
-.Lchacha20_128: # reached by the je in chacha20_ssse3 when len == 128, with the same register arguments
- lea 8(%rsp),%r10 # frame pointer
- sub $64+8,%rsp
- and $-16,%rsp
- movdqa .Lsigma(%rip),%xmm8 # block 0 rows: xmm8, xmm9, xmm2, xmm3
- movdqu (%rcx),%xmm9
- movdqu 16(%rcx),%xmm2
- movdqu (%r8),%xmm3
- movdqa .Lone(%rip),%xmm1
- movdqa .Lrot16(%rip),%xmm6
- movdqa .Lrot24(%rip),%xmm7
-
- movdqa %xmm8,%xmm10 # block 1 rows: xmm10, xmm11, xmm0, xmm1
- movdqa %xmm8,0x00(%rsp) # input state saved for the final add
- movdqa %xmm9,%xmm11
- movdqa %xmm9,0x10(%rsp)
- movdqa %xmm2,%xmm0
- movdqa %xmm2,0x20(%rsp)
- paddd %xmm3,%xmm1 # block 1 row 3 = block 0 row 3 with first dword + 1
- movdqa %xmm3,0x30(%rsp)
- mov $10,%r8 # reuse %r8
- jmp .Loop_128
-
-.align 32
-.Loop_128: # same round sequence as .Loop_ssse3, with the two blocks interleaved instruction by instruction
- paddd %xmm9,%xmm8
- pxor %xmm8,%xmm3
- paddd %xmm11,%xmm10
- pxor %xmm10,%xmm1
- pshufb %xmm6,%xmm3 # rotate by 16
- pshufb %xmm6,%xmm1
- paddd %xmm3,%xmm2
- paddd %xmm1,%xmm0
- pxor %xmm2,%xmm9
- pxor %xmm0,%xmm11
- movdqa %xmm9,%xmm4 # xmm4/xmm5: scratch for the shift-based rotates
- psrld $20,%xmm9
- movdqa %xmm11,%xmm5
- pslld $12,%xmm4
- psrld $20,%xmm11
- por %xmm4,%xmm9 # rotate left by 12
- pslld $12,%xmm5
- por %xmm5,%xmm11
- paddd %xmm9,%xmm8
- pxor %xmm8,%xmm3
- paddd %xmm11,%xmm10
- pxor %xmm10,%xmm1
- pshufb %xmm7,%xmm3 # rotate left by 8
- pshufb %xmm7,%xmm1
- paddd %xmm3,%xmm2
- paddd %xmm1,%xmm0
- pxor %xmm2,%xmm9
- pxor %xmm0,%xmm11
- movdqa %xmm9,%xmm4
- psrld $25,%xmm9
- movdqa %xmm11,%xmm5
- pslld $7,%xmm4
- psrld $25,%xmm11
- por %xmm4,%xmm9 # rotate left by 7
- pslld $7,%xmm5
- por %xmm5,%xmm11
- pshufd $147,%xmm8,%xmm8 # lane rotations of rows 0, 3, 2 of both blocks before the second half
- pshufd $78,%xmm3,%xmm3
- pshufd $57,%xmm2,%xmm2
- pshufd $147,%xmm10,%xmm10
- pshufd $78,%xmm1,%xmm1
- pshufd $57,%xmm0,%xmm0
- paddd %xmm9,%xmm8
- pxor %xmm8,%xmm3
- paddd %xmm11,%xmm10
- pxor %xmm10,%xmm1
- pshufb %xmm6,%xmm3
- pshufb %xmm6,%xmm1
- paddd %xmm3,%xmm2
- paddd %xmm1,%xmm0
- pxor %xmm2,%xmm9
- pxor %xmm0,%xmm11
- movdqa %xmm9,%xmm4
- psrld $20,%xmm9
- movdqa %xmm11,%xmm5
- pslld $12,%xmm4
- psrld $20,%xmm11
- por %xmm4,%xmm9
- pslld $12,%xmm5
- por %xmm5,%xmm11
- paddd %xmm9,%xmm8
- pxor %xmm8,%xmm3
- paddd %xmm11,%xmm10
- pxor %xmm10,%xmm1
- pshufb %xmm7,%xmm3
- pshufb %xmm7,%xmm1
- paddd %xmm3,%xmm2
- paddd %xmm1,%xmm0
- pxor %xmm2,%xmm9
- pxor %xmm0,%xmm11
- movdqa %xmm9,%xmm4
- psrld $25,%xmm9
- movdqa %xmm11,%xmm5
- pslld $7,%xmm4
- psrld $25,%xmm11
- por %xmm4,%xmm9
- pslld $7,%xmm5
- por %xmm5,%xmm11
- pshufd $57,%xmm8,%xmm8 # inverse lane rotations
- pshufd $78,%xmm3,%xmm3
- pshufd $147,%xmm2,%xmm2
- pshufd $57,%xmm10,%xmm10
- pshufd $78,%xmm1,%xmm1
- pshufd $147,%xmm0,%xmm0
- dec %r8
- jnz .Loop_128
- paddd 0x00(%rsp),%xmm8 # add the saved input state to block 0
- paddd 0x10(%rsp),%xmm9
- paddd 0x20(%rsp),%xmm2
- paddd 0x30(%rsp),%xmm3
- paddd .Lone(%rip),%xmm1 # block 1 row 3 gets the saved row plus one, matching its input
- paddd 0x00(%rsp),%xmm10
- paddd 0x10(%rsp),%xmm11
- paddd 0x20(%rsp),%xmm0
- paddd 0x30(%rsp),%xmm1
-
- movdqu 0x00(%rsi),%xmm4
- movdqu 0x10(%rsi),%xmm5
- pxor %xmm4,%xmm8 # xor with input
- movdqu 0x20(%rsi),%xmm4
- pxor %xmm5,%xmm9
- movdqu 0x30(%rsi),%xmm5
- pxor %xmm4,%xmm2
- movdqu 0x40(%rsi),%xmm4
- pxor %xmm5,%xmm3
- movdqu 0x50(%rsi),%xmm5
- pxor %xmm4,%xmm10
- movdqu 0x60(%rsi),%xmm4
- pxor %xmm5,%xmm11
- movdqu 0x70(%rsi),%xmm5
- pxor %xmm4,%xmm0
- pxor %xmm5,%xmm1
-
- movdqu %xmm8,0x00(%rdi) # write output
- movdqu %xmm9,0x10(%rdi)
- movdqu %xmm2,0x20(%rdi)
- movdqu %xmm3,0x30(%rdi)
- movdqu %xmm10,0x40(%rdi)
- movdqu %xmm11,0x50(%rdi)
- movdqu %xmm0,0x60(%rdi)
- movdqu %xmm1,0x70(%rdi)
- lea -8(%r10),%rsp # restore the stack pointer captured at entry
-.L128_epilogue:
- ret
-.size chacha20_128,.-chacha20_128
-.type chacha20_4x,@function
-.align 32
-chacha20_4x: # four 64-byte blocks per outer iteration, one block per 32-bit lane
-.Lchacha20_4x: # reached by the ja in chacha20_ssse3 when len > 128, with the same register arguments
- lea 8(%rsp),%r10 # frame pointer
- cmp $192,%rdx # NOTE(review): this compare/branch has no effect -- both outcomes reach .Lproceed4x
- ja .Lproceed4x
-.Lproceed4x:
- sub $0x140+8,%rsp # 0x00-0x3f: spill area / tail keystream, 0x40-0x13f: lane-broadcast input state
- and $-16,%rsp
- movdqa .Lsigma(%rip),%xmm11 # key[0]
- movdqu (%rcx),%xmm15 # key[1]
- movdqu 16(%rcx),%xmm7 # key[2]
- movdqu (%r8),%xmm3 # key[3]
- lea 0x100(%rsp),%rcx # size optimization
- lea .Lrot16(%rip),%r9 # mask addresses kept in registers; the masks are reloaded inside the loop
- lea .Lrot24(%rip),%r11
-
- pshufd $0x00,%xmm11,%xmm8 # smash key by lanes...
- pshufd $0x55,%xmm11,%xmm9
- movdqa %xmm8,0x40(%rsp) # ... and offload
- pshufd $0xaa,%xmm11,%xmm10
- movdqa %xmm9,0x50(%rsp)
- pshufd $0xff,%xmm11,%xmm11
- movdqa %xmm10,0x60(%rsp)
- movdqa %xmm11,0x70(%rsp)
-
- pshufd $0x00,%xmm15,%xmm12 # row 1 words broadcast into xmm12-xmm15, saved at 0x80-0xb0
- pshufd $0x55,%xmm15,%xmm13
- movdqa %xmm12,0x80-0x100(%rcx)
- pshufd $0xaa,%xmm15,%xmm14
- movdqa %xmm13,0x90-0x100(%rcx)
- pshufd $0xff,%xmm15,%xmm15
- movdqa %xmm14,0xa0-0x100(%rcx)
- movdqa %xmm15,0xb0-0x100(%rcx)
-
- pshufd $0x00,%xmm7,%xmm4 # ""
- pshufd $0x55,%xmm7,%xmm5 # ""
- movdqa %xmm4,0xc0-0x100(%rcx)
- pshufd $0xaa,%xmm7,%xmm6 # ""
- movdqa %xmm5,0xd0-0x100(%rcx)
- pshufd $0xff,%xmm7,%xmm7 # ""
- movdqa %xmm6,0xe0-0x100(%rcx)
- movdqa %xmm7,0xf0-0x100(%rcx)
-
- pshufd $0x00,%xmm3,%xmm0 # row 3 words broadcast into xmm0-xmm3
- pshufd $0x55,%xmm3,%xmm1
- paddd .Linc(%rip),%xmm0 # don't save counters yet
- pshufd $0xaa,%xmm3,%xmm2
- movdqa %xmm1,0x110-0x100(%rcx)
- pshufd $0xff,%xmm3,%xmm3
- movdqa %xmm2,0x120-0x100(%rcx)
- movdqa %xmm3,0x130-0x100(%rcx)
-
- jmp .Loop_enter4x
-
-.align 32
-.Loop_outer4x:
- movdqa 0x40(%rsp),%xmm8 # re-load smashed key
- movdqa 0x50(%rsp),%xmm9
- movdqa 0x60(%rsp),%xmm10
- movdqa 0x70(%rsp),%xmm11
- movdqa 0x80-0x100(%rcx),%xmm12
- movdqa 0x90-0x100(%rcx),%xmm13
- movdqa 0xa0-0x100(%rcx),%xmm14
- movdqa 0xb0-0x100(%rcx),%xmm15
- movdqa 0xc0-0x100(%rcx),%xmm4 # ""
- movdqa 0xd0-0x100(%rcx),%xmm5 # ""
- movdqa 0xe0-0x100(%rcx),%xmm6 # ""
- movdqa 0xf0-0x100(%rcx),%xmm7 # ""
- movdqa 0x100-0x100(%rcx),%xmm0
- movdqa 0x110-0x100(%rcx),%xmm1
- movdqa 0x120-0x100(%rcx),%xmm2
- movdqa 0x130-0x100(%rcx),%xmm3
- paddd .Lfour(%rip),%xmm0 # next SIMD counters
-
-.Loop_enter4x:
- movdqa %xmm6,0x20(%rsp) # SIMD equivalent of "%nox"
- movdqa %xmm7,0x30(%rsp) # SIMD equivalent of "%nox"
- movdqa (%r9),%xmm7 # .Lrot16(%rip)
- mov $10,%eax
- movdqa %xmm0,0x100-0x100(%rcx) # save SIMD counters
- jmp .Loop4x
-
-.align 32
-.Loop4x: # registers: a0-a3 = xmm8-xmm11, b0-b3 = xmm12-xmm15, d0-d3 = xmm0-xmm3; only two c words fit (xmm4/xmm5), the other two live on the stack
- paddd %xmm12,%xmm8 # first half, words (a0,b0,c0,d0) and (a1,b1,c1,d1)
- paddd %xmm13,%xmm9
- pxor %xmm8,%xmm0
- pxor %xmm9,%xmm1
- pshufb %xmm7,%xmm0 # xmm7 holds the rot16 mask here
- pshufb %xmm7,%xmm1
- paddd %xmm0,%xmm4
- paddd %xmm1,%xmm5
- pxor %xmm4,%xmm12
- pxor %xmm5,%xmm13
- movdqa %xmm12,%xmm6 # xmm6/xmm7 double as scratch for the shift-based rotates
- pslld $12,%xmm12
- psrld $20,%xmm6
- movdqa %xmm13,%xmm7
- pslld $12,%xmm13
- por %xmm6,%xmm12
- psrld $20,%xmm7
- movdqa (%r11),%xmm6 # reload the rot24 mask
- por %xmm7,%xmm13
- paddd %xmm12,%xmm8
- paddd %xmm13,%xmm9
- pxor %xmm8,%xmm0
- pxor %xmm9,%xmm1
- pshufb %xmm6,%xmm0
- pshufb %xmm6,%xmm1
- paddd %xmm0,%xmm4
- paddd %xmm1,%xmm5
- pxor %xmm4,%xmm12
- pxor %xmm5,%xmm13
- movdqa %xmm12,%xmm7
- pslld $7,%xmm12
- psrld $25,%xmm7
- movdqa %xmm13,%xmm6
- pslld $7,%xmm13
- por %xmm7,%xmm12
- psrld $25,%xmm6
- movdqa (%r9),%xmm7 # reload the rot16 mask
- por %xmm6,%xmm13
- movdqa %xmm4,0(%rsp) # swap c words: park c0,c1 ...
- movdqa %xmm5,16(%rsp)
- movdqa 32(%rsp),%xmm4 # ... and bring in c2,c3
- movdqa 48(%rsp),%xmm5
- paddd %xmm14,%xmm10 # first half, words (a2,b2,c2,d2) and (a3,b3,c3,d3)
- paddd %xmm15,%xmm11
- pxor %xmm10,%xmm2
- pxor %xmm11,%xmm3
- pshufb %xmm7,%xmm2
- pshufb %xmm7,%xmm3
- paddd %xmm2,%xmm4
- paddd %xmm3,%xmm5
- pxor %xmm4,%xmm14
- pxor %xmm5,%xmm15
- movdqa %xmm14,%xmm6
- pslld $12,%xmm14
- psrld $20,%xmm6
- movdqa %xmm15,%xmm7
- pslld $12,%xmm15
- por %xmm6,%xmm14
- psrld $20,%xmm7
- movdqa (%r11),%xmm6
- por %xmm7,%xmm15
- paddd %xmm14,%xmm10
- paddd %xmm15,%xmm11
- pxor %xmm10,%xmm2
- pxor %xmm11,%xmm3
- pshufb %xmm6,%xmm2
- pshufb %xmm6,%xmm3
- paddd %xmm2,%xmm4
- paddd %xmm3,%xmm5
- pxor %xmm4,%xmm14
- pxor %xmm5,%xmm15
- movdqa %xmm14,%xmm7
- pslld $7,%xmm14
- psrld $25,%xmm7
- movdqa %xmm15,%xmm6
- pslld $7,%xmm15
- por %xmm7,%xmm14
- psrld $25,%xmm6
- movdqa (%r9),%xmm7
- por %xmm6,%xmm15
- paddd %xmm13,%xmm8 # second half, words (a0,b1,c2,d3) and (a1,b2,c3,d0): operands are re-paired instead of shuffled
- paddd %xmm14,%xmm9
- pxor %xmm8,%xmm3
- pxor %xmm9,%xmm0
- pshufb %xmm7,%xmm3
- pshufb %xmm7,%xmm0
- paddd %xmm3,%xmm4
- paddd %xmm0,%xmm5
- pxor %xmm4,%xmm13
- pxor %xmm5,%xmm14
- movdqa %xmm13,%xmm6
- pslld $12,%xmm13
- psrld $20,%xmm6
- movdqa %xmm14,%xmm7
- pslld $12,%xmm14
- por %xmm6,%xmm13
- psrld $20,%xmm7
- movdqa (%r11),%xmm6
- por %xmm7,%xmm14
- paddd %xmm13,%xmm8
- paddd %xmm14,%xmm9
- pxor %xmm8,%xmm3
- pxor %xmm9,%xmm0
- pshufb %xmm6,%xmm3
- pshufb %xmm6,%xmm0
- paddd %xmm3,%xmm4
- paddd %xmm0,%xmm5
- pxor %xmm4,%xmm13
- pxor %xmm5,%xmm14
- movdqa %xmm13,%xmm7
- pslld $7,%xmm13
- psrld $25,%xmm7
- movdqa %xmm14,%xmm6
- pslld $7,%xmm14
- por %xmm7,%xmm13
- psrld $25,%xmm6
- movdqa (%r9),%xmm7
- por %xmm6,%xmm14
- movdqa %xmm4,32(%rsp) # swap c words back: park c2,c3 ...
- movdqa %xmm5,48(%rsp)
- movdqa 0(%rsp),%xmm4 # ... and bring in c0,c1
- movdqa 16(%rsp),%xmm5
- paddd %xmm15,%xmm10 # second half, words (a2,b3,c0,d1) and (a3,b0,c1,d2)
- paddd %xmm12,%xmm11
- pxor %xmm10,%xmm1
- pxor %xmm11,%xmm2
- pshufb %xmm7,%xmm1
- pshufb %xmm7,%xmm2
- paddd %xmm1,%xmm4
- paddd %xmm2,%xmm5
- pxor %xmm4,%xmm15
- pxor %xmm5,%xmm12
- movdqa %xmm15,%xmm6
- pslld $12,%xmm15
- psrld $20,%xmm6
- movdqa %xmm12,%xmm7
- pslld $12,%xmm12
- por %xmm6,%xmm15
- psrld $20,%xmm7
- movdqa (%r11),%xmm6
- por %xmm7,%xmm12
- paddd %xmm15,%xmm10
- paddd %xmm12,%xmm11
- pxor %xmm10,%xmm1
- pxor %xmm11,%xmm2
- pshufb %xmm6,%xmm1
- pshufb %xmm6,%xmm2
- paddd %xmm1,%xmm4
- paddd %xmm2,%xmm5
- pxor %xmm4,%xmm15
- pxor %xmm5,%xmm12
- movdqa %xmm15,%xmm7
- pslld $7,%xmm15
- psrld $25,%xmm7
- movdqa %xmm12,%xmm6
- pslld $7,%xmm12
- por %xmm7,%xmm15
- psrld $25,%xmm6
- movdqa (%r9),%xmm7 # rot16 mask ready for the next iteration
- por %xmm6,%xmm12
- dec %eax
- jnz .Loop4x # 10 iterations in total
-
- paddd 0x40(%rsp),%xmm8 # accumulate key material
- paddd 0x50(%rsp),%xmm9
- paddd 0x60(%rsp),%xmm10
- paddd 0x70(%rsp),%xmm11
-
- movdqa %xmm8,%xmm6 # "de-interlace" data
- punpckldq %xmm9,%xmm8
- movdqa %xmm10,%xmm7
- punpckldq %xmm11,%xmm10
- punpckhdq %xmm9,%xmm6
- punpckhdq %xmm11,%xmm7
- movdqa %xmm8,%xmm9
- punpcklqdq %xmm10,%xmm8 # "a0"
- movdqa %xmm6,%xmm11
- punpcklqdq %xmm7,%xmm6 # "a2"
- punpckhqdq %xmm10,%xmm9 # "a1"
- punpckhqdq %xmm7,%xmm11 # "a3"
- paddd 0x80-0x100(%rcx),%xmm12
- paddd 0x90-0x100(%rcx),%xmm13
- paddd 0xa0-0x100(%rcx),%xmm14
- paddd 0xb0-0x100(%rcx),%xmm15
-
- movdqa %xmm8,0x00(%rsp) # offload
- movdqa %xmm9,0x10(%rsp)
- movdqa 0x20(%rsp),%xmm8 # "xc2"
- movdqa 0x30(%rsp),%xmm9 # "xc3"
-
- movdqa %xmm12,%xmm10
- punpckldq %xmm13,%xmm12
- movdqa %xmm14,%xmm7
- punpckldq %xmm15,%xmm14
- punpckhdq %xmm13,%xmm10
- punpckhdq %xmm15,%xmm7
- movdqa %xmm12,%xmm13
- punpcklqdq %xmm14,%xmm12 # "b0"
- movdqa %xmm10,%xmm15
- punpcklqdq %xmm7,%xmm10 # "b2"
- punpckhqdq %xmm14,%xmm13 # "b1"
- punpckhqdq %xmm7,%xmm15 # "b3"
- paddd 0xc0-0x100(%rcx),%xmm4
- paddd 0xd0-0x100(%rcx),%xmm5
- paddd 0xe0-0x100(%rcx),%xmm8
- paddd 0xf0-0x100(%rcx),%xmm9
-
- movdqa %xmm6,0x20(%rsp) # keep offloading
- movdqa %xmm11,0x30(%rsp)
-
- movdqa %xmm4,%xmm14
- punpckldq %xmm5,%xmm4
- movdqa %xmm8,%xmm7
- punpckldq %xmm9,%xmm8
- punpckhdq %xmm5,%xmm14
- punpckhdq %xmm9,%xmm7
- movdqa %xmm4,%xmm5
- punpcklqdq %xmm8,%xmm4 # "c0"
- movdqa %xmm14,%xmm9
- punpcklqdq %xmm7,%xmm14 # "c2"
- punpckhqdq %xmm8,%xmm5 # "c1"
- punpckhqdq %xmm7,%xmm9 # "c3"
- paddd 0x100-0x100(%rcx),%xmm0
- paddd 0x110-0x100(%rcx),%xmm1
- paddd 0x120-0x100(%rcx),%xmm2
- paddd 0x130-0x100(%rcx),%xmm3
-
- movdqa %xmm0,%xmm8
- punpckldq %xmm1,%xmm0
- movdqa %xmm2,%xmm7
- punpckldq %xmm3,%xmm2
- punpckhdq %xmm1,%xmm8
- punpckhdq %xmm3,%xmm7
- movdqa %xmm0,%xmm1
- punpcklqdq %xmm2,%xmm0 # "d0"
- movdqa %xmm8,%xmm3
- punpcklqdq %xmm7,%xmm8 # "d2"
- punpckhqdq %xmm2,%xmm1 # "d1"
- punpckhqdq %xmm7,%xmm3 # "d3"
- cmp $64*4,%rdx # keystream layout now: block k = k*0x10(%rsp) plus {xmm12,xmm4,xmm0}, {xmm13,xmm5,xmm1}, {xmm10,xmm14,xmm8}, {xmm15,xmm9,xmm3}
- jb .Ltail4x
-
- movdqu 0x00(%rsi),%xmm6 # xor with input
- movdqu 0x10(%rsi),%xmm11
- movdqu 0x20(%rsi),%xmm2
- movdqu 0x30(%rsi),%xmm7
- pxor 0x00(%rsp),%xmm6 # is offloaded, remember?
- pxor %xmm12,%xmm11
- pxor %xmm4,%xmm2
- pxor %xmm0,%xmm7
-
- movdqu %xmm6,0x00(%rdi)
- movdqu 0x40(%rsi),%xmm6
- movdqu %xmm11,0x10(%rdi)
- movdqu 0x50(%rsi),%xmm11
- movdqu %xmm2,0x20(%rdi)
- movdqu 0x60(%rsi),%xmm2
- movdqu %xmm7,0x30(%rdi)
- movdqu 0x70(%rsi),%xmm7
- lea 0x80(%rsi),%rsi # size optimization
- pxor 0x10(%rsp),%xmm6
- pxor %xmm13,%xmm11
- pxor %xmm5,%xmm2
- pxor %xmm1,%xmm7
-
- movdqu %xmm6,0x40(%rdi)
- movdqu 0x00(%rsi),%xmm6
- movdqu %xmm11,0x50(%rdi)
- movdqu 0x10(%rsi),%xmm11
- movdqu %xmm2,0x60(%rdi)
- movdqu 0x20(%rsi),%xmm2
- movdqu %xmm7,0x70(%rdi)
- lea 0x80(%rdi),%rdi # size optimization
- movdqu 0x30(%rsi),%xmm7
- pxor 0x20(%rsp),%xmm6
- pxor %xmm10,%xmm11
- pxor %xmm14,%xmm2
- pxor %xmm8,%xmm7
-
- movdqu %xmm6,0x00(%rdi)
- movdqu 0x40(%rsi),%xmm6
- movdqu %xmm11,0x10(%rdi)
- movdqu 0x50(%rsi),%xmm11
- movdqu %xmm2,0x20(%rdi)
- movdqu 0x60(%rsi),%xmm2
- movdqu %xmm7,0x30(%rdi)
- movdqu 0x70(%rsi),%xmm7
- lea 0x80(%rsi),%rsi # inp+=64*4
- pxor 0x30(%rsp),%xmm6
- pxor %xmm15,%xmm11
- pxor %xmm9,%xmm2
- pxor %xmm3,%xmm7
- movdqu %xmm6,0x40(%rdi)
- movdqu %xmm11,0x50(%rdi)
- movdqu %xmm2,0x60(%rdi)
- movdqu %xmm7,0x70(%rdi)
- lea 0x80(%rdi),%rdi # out+=64*4
-
- sub $64*4,%rdx
- jnz .Loop_outer4x # more input left: generate four more blocks
-
- jmp .Ldone4x
-
-.Ltail4x: # fewer than 256 bytes left: emit whole 64-byte blocks, then finish byte by byte
- cmp $192,%rdx
- jae .L192_or_more4x
- cmp $128,%rdx
- jae .L128_or_more4x
- cmp $64,%rdx
- jae .L64_or_more4x
-
- #movdqa 0x00(%rsp),%xmm6 # is offloaded, remember?
- xor %r9,%r9 # %r9 becomes the byte index; the rot16 pointer is no longer needed
- #movdqa %xmm6,0x00(%rsp)
- movdqa %xmm12,0x10(%rsp) # complete block 0 at 0x00-0x3f(%rsp) for the byte loop
- movdqa %xmm4,0x20(%rsp)
- movdqa %xmm0,0x30(%rsp)
- jmp .Loop_tail4x
-
-.align 32
-.L64_or_more4x:
- movdqu 0x00(%rsi),%xmm6 # xor with input
- movdqu 0x10(%rsi),%xmm11
- movdqu 0x20(%rsi),%xmm2
- movdqu 0x30(%rsi),%xmm7
- pxor 0x00(%rsp),%xmm6 # is offloaded, remember?
- pxor %xmm12,%xmm11
- pxor %xmm4,%xmm2
- pxor %xmm0,%xmm7
- movdqu %xmm6,0x00(%rdi)
- movdqu %xmm11,0x10(%rdi)
- movdqu %xmm2,0x20(%rdi)
- movdqu %xmm7,0x30(%rdi)
- je .Ldone4x # flags are still those of cmp $64,%rdx in .Ltail4x; the SSE moves and xors do not touch them
-
- movdqa 0x10(%rsp),%xmm6 # is offloaded, remember?
- lea 0x40(%rsi),%rsi # inp+=64*1
- xor %r9,%r9
- movdqa %xmm6,0x00(%rsp) # stage block 1 at 0x00-0x3f(%rsp) for the byte loop
- movdqa %xmm13,0x10(%rsp)
- lea 0x40(%rdi),%rdi # out+=64*1
- movdqa %xmm5,0x20(%rsp)
- sub $64,%rdx # len-=64*1
- movdqa %xmm1,0x30(%rsp)
- jmp .Loop_tail4x
-
-.align 32
-.L128_or_more4x:
- movdqu 0x00(%rsi),%xmm6 # xor with input
- movdqu 0x10(%rsi),%xmm11
- movdqu 0x20(%rsi),%xmm2
- movdqu 0x30(%rsi),%xmm7
- pxor 0x00(%rsp),%xmm6 # is offloaded, remember?
- pxor %xmm12,%xmm11
- pxor %xmm4,%xmm2
- pxor %xmm0,%xmm7
-
- movdqu %xmm6,0x00(%rdi)
- movdqu 0x40(%rsi),%xmm6
- movdqu %xmm11,0x10(%rdi)
- movdqu 0x50(%rsi),%xmm11
- movdqu %xmm2,0x20(%rdi)
- movdqu 0x60(%rsi),%xmm2
- movdqu %xmm7,0x30(%rdi)
- movdqu 0x70(%rsi),%xmm7
- pxor 0x10(%rsp),%xmm6
- pxor %xmm13,%xmm11
- pxor %xmm5,%xmm2
- pxor %xmm1,%xmm7
- movdqu %xmm6,0x40(%rdi)
- movdqu %xmm11,0x50(%rdi)
- movdqu %xmm2,0x60(%rdi)
- movdqu %xmm7,0x70(%rdi)
- je .Ldone4x # flags are still those of cmp $128,%rdx in .Ltail4x
-
- movdqa 0x20(%rsp),%xmm6 # is offloaded, remember?
- lea 0x80(%rsi),%rsi # inp+=64*2
- xor %r9,%r9
- movdqa %xmm6,0x00(%rsp) # stage block 2 at 0x00-0x3f(%rsp) for the byte loop
- movdqa %xmm10,0x10(%rsp)
- lea 0x80(%rdi),%rdi # out+=64*2
- movdqa %xmm14,0x20(%rsp)
- sub $128,%rdx # len-=64*2
- movdqa %xmm8,0x30(%rsp)
- jmp .Loop_tail4x
-
-.align 32
-.L192_or_more4x:
- movdqu 0x00(%rsi),%xmm6 # xor with input
- movdqu 0x10(%rsi),%xmm11
- movdqu 0x20(%rsi),%xmm2
- movdqu 0x30(%rsi),%xmm7
- pxor 0x00(%rsp),%xmm6 # is offloaded, remember?
- pxor %xmm12,%xmm11
- pxor %xmm4,%xmm2
- pxor %xmm0,%xmm7
-
- movdqu %xmm6,0x00(%rdi)
- movdqu 0x40(%rsi),%xmm6
- movdqu %xmm11,0x10(%rdi)
- movdqu 0x50(%rsi),%xmm11
- movdqu %xmm2,0x20(%rdi)
- movdqu 0x60(%rsi),%xmm2
- movdqu %xmm7,0x30(%rdi)
- movdqu 0x70(%rsi),%xmm7
- lea 0x80(%rsi),%rsi # size optimization
- pxor 0x10(%rsp),%xmm6
- pxor %xmm13,%xmm11
- pxor %xmm5,%xmm2
- pxor %xmm1,%xmm7
-
- movdqu %xmm6,0x40(%rdi)
- movdqu 0x00(%rsi),%xmm6
- movdqu %xmm11,0x50(%rdi)
- movdqu 0x10(%rsi),%xmm11
- movdqu %xmm2,0x60(%rdi)
- movdqu 0x20(%rsi),%xmm2
- movdqu %xmm7,0x70(%rdi)
- lea 0x80(%rdi),%rdi # size optimization
- movdqu 0x30(%rsi),%xmm7
- pxor 0x20(%rsp),%xmm6
- pxor %xmm10,%xmm11
- pxor %xmm14,%xmm2
- pxor %xmm8,%xmm7
- movdqu %xmm6,0x00(%rdi)
- movdqu %xmm11,0x10(%rdi)
- movdqu %xmm2,0x20(%rdi)
- movdqu %xmm7,0x30(%rdi)
- je .Ldone4x # flags are still those of cmp $192,%rdx in .Ltail4x; lea does not touch them either
-
- movdqa 0x30(%rsp),%xmm6 # is offloaded, remember?
- lea 0x40(%rsi),%rsi # inp+=64*3
- xor %r9,%r9
- movdqa %xmm6,0x00(%rsp) # stage block 3 at 0x00-0x3f(%rsp) for the byte loop
- movdqa %xmm15,0x10(%rsp)
- lea 0x40(%rdi),%rdi # out+=64*3
- movdqa %xmm9,0x20(%rsp)
- sub $192,%rdx # len-=64*3
- movdqa %xmm3,0x30(%rsp)
-
-.Loop_tail4x: # xor the remaining 1..63 bytes with the staged keystream block
- movzb (%rsi,%r9),%eax # input byte
- movzb (%rsp,%r9),%ecx # keystream byte
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1(%rdi,%r9) # -1 because the index was already advanced
- dec %rdx
- jnz .Loop_tail4x
-
-.Ldone4x:
- lea -8(%r10),%rsp # restore the stack pointer captured at entry
-.L4x_epilogue:
- ret
-.size chacha20_4x,.-chacha20_4x
-#endif
-#ifdef CONFIG_AS_AVX2
-.align 32
-SYM_FUNC_START(chacha20_avx2)
-.Lchacha20_avx2:
-.Lchacha20_8x:
- lea 8(%rsp),%r10 # frame register
- sub $0x280+8,%rsp
- and $-32,%rsp
- vzeroupper
-
- ################ stack layout
- # +0x00 SIMD equivalent of %r12d
- # ...
- # +0x80 constant copy of key[0-2] smashed by lanes
- # ...
- # +0x200 SIMD counters (with nonce smashed by lanes)
- # ...
- # +0x280
-
- vbroadcasti128 .Lsigma(%rip),%ymm11 # key[0]
- vbroadcasti128 (%rcx),%ymm3 # key[1]
- vbroadcasti128 16(%rcx),%ymm15 # key[2]
- vbroadcasti128 (%r8),%ymm7 # key[3]
- lea 0x100(%rsp),%rcx # size optimization
- lea 0x200(%rsp),%rax # size optimization
- lea .Lrot16(%rip),%r9
- lea .Lrot24(%rip),%r11
-
- vpshufd $0x00,%ymm11,%ymm8 # smash key by lanes...
- vpshufd $0x55,%ymm11,%ymm9
- vmovdqa %ymm8,0x80-0x100(%rcx) # ... and offload
- vpshufd $0xaa,%ymm11,%ymm10
- vmovdqa %ymm9,0xa0-0x100(%rcx)
- vpshufd $0xff,%ymm11,%ymm11
- vmovdqa %ymm10,0xc0-0x100(%rcx)
- vmovdqa %ymm11,0xe0-0x100(%rcx)
-
- vpshufd $0x00,%ymm3,%ymm0
- vpshufd $0x55,%ymm3,%ymm1
- vmovdqa %ymm0,0x100-0x100(%rcx)
- vpshufd $0xaa,%ymm3,%ymm2
- vmovdqa %ymm1,0x120-0x100(%rcx)
- vpshufd $0xff,%ymm3,%ymm3
- vmovdqa %ymm2,0x140-0x100(%rcx)
- vmovdqa %ymm3,0x160-0x100(%rcx)
-
- vpshufd $0x00,%ymm15,%ymm12 # "xc0"
- vpshufd $0x55,%ymm15,%ymm13 # "xc1"
- vmovdqa %ymm12,0x180-0x200(%rax)
- vpshufd $0xaa,%ymm15,%ymm14 # "xc2"
- vmovdqa %ymm13,0x1a0-0x200(%rax)
- vpshufd $0xff,%ymm15,%ymm15 # "xc3"
- vmovdqa %ymm14,0x1c0-0x200(%rax)
- vmovdqa %ymm15,0x1e0-0x200(%rax)
-
- vpshufd $0x00,%ymm7,%ymm4
- vpshufd $0x55,%ymm7,%ymm5
- vpaddd .Lincy(%rip),%ymm4,%ymm4 # don't save counters yet
- vpshufd $0xaa,%ymm7,%ymm6
- vmovdqa %ymm5,0x220-0x200(%rax)
- vpshufd $0xff,%ymm7,%ymm7
- vmovdqa %ymm6,0x240-0x200(%rax)
- vmovdqa %ymm7,0x260-0x200(%rax)
-
- jmp .Loop_enter8x
-
-.align 32
-.Loop_outer8x:
- vmovdqa 0x80-0x100(%rcx),%ymm8 # re-load smashed key
- vmovdqa 0xa0-0x100(%rcx),%ymm9
- vmovdqa 0xc0-0x100(%rcx),%ymm10
- vmovdqa 0xe0-0x100(%rcx),%ymm11
- vmovdqa 0x100-0x100(%rcx),%ymm0
- vmovdqa 0x120-0x100(%rcx),%ymm1
- vmovdqa 0x140-0x100(%rcx),%ymm2
- vmovdqa 0x160-0x100(%rcx),%ymm3
- vmovdqa 0x180-0x200(%rax),%ymm12 # "xc0"
- vmovdqa 0x1a0-0x200(%rax),%ymm13 # "xc1"
- vmovdqa 0x1c0-0x200(%rax),%ymm14 # "xc2"
- vmovdqa 0x1e0-0x200(%rax),%ymm15 # "xc3"
- vmovdqa 0x200-0x200(%rax),%ymm4
- vmovdqa 0x220-0x200(%rax),%ymm5
- vmovdqa 0x240-0x200(%rax),%ymm6
- vmovdqa 0x260-0x200(%rax),%ymm7
- vpaddd .Leight(%rip),%ymm4,%ymm4 # next SIMD counters
-
-.Loop_enter8x:
- vmovdqa %ymm14,0x40(%rsp) # SIMD equivalent of "%nox"
- vmovdqa %ymm15,0x60(%rsp) # SIMD equivalent of "%nox"
- vbroadcasti128 (%r9),%ymm15
- vmovdqa %ymm4,0x200-0x200(%rax) # save SIMD counters
- mov $10,%eax
- jmp .Loop8x
-
-.align 32
-.Loop8x:
- vpaddd %ymm0,%ymm8,%ymm8
- vpxor %ymm4,%ymm8,%ymm4
- vpshufb %ymm15,%ymm4,%ymm4
- vpaddd %ymm1,%ymm9,%ymm9
- vpxor %ymm5,%ymm9,%ymm5
- vpshufb %ymm15,%ymm5,%ymm5
- vpaddd %ymm4,%ymm12,%ymm12
- vpxor %ymm0,%ymm12,%ymm0
- vpslld $12,%ymm0,%ymm14
- vpsrld $20,%ymm0,%ymm0
- vpor %ymm0,%ymm14,%ymm0
- vbroadcasti128 (%r11),%ymm14
- vpaddd %ymm5,%ymm13,%ymm13
- vpxor %ymm1,%ymm13,%ymm1
- vpslld $12,%ymm1,%ymm15
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm1,%ymm15,%ymm1
- vpaddd %ymm0,%ymm8,%ymm8
- vpxor %ymm4,%ymm8,%ymm4
- vpshufb %ymm14,%ymm4,%ymm4
- vpaddd %ymm1,%ymm9,%ymm9
- vpxor %ymm5,%ymm9,%ymm5
- vpshufb %ymm14,%ymm5,%ymm5
- vpaddd %ymm4,%ymm12,%ymm12
- vpxor %ymm0,%ymm12,%ymm0
- vpslld $7,%ymm0,%ymm15
- vpsrld $25,%ymm0,%ymm0
- vpor %ymm0,%ymm15,%ymm0
- vbroadcasti128 (%r9),%ymm15
- vpaddd %ymm5,%ymm13,%ymm13
- vpxor %ymm1,%ymm13,%ymm1
- vpslld $7,%ymm1,%ymm14
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm1,%ymm14,%ymm1
- vmovdqa %ymm12,0(%rsp)
- vmovdqa %ymm13,32(%rsp)
- vmovdqa 64(%rsp),%ymm12
- vmovdqa 96(%rsp),%ymm13
- vpaddd %ymm2,%ymm10,%ymm10
- vpxor %ymm6,%ymm10,%ymm6
- vpshufb %ymm15,%ymm6,%ymm6
- vpaddd %ymm3,%ymm11,%ymm11
- vpxor %ymm7,%ymm11,%ymm7
- vpshufb %ymm15,%ymm7,%ymm7
- vpaddd %ymm6,%ymm12,%ymm12
- vpxor %ymm2,%ymm12,%ymm2
- vpslld $12,%ymm2,%ymm14
- vpsrld $20,%ymm2,%ymm2
- vpor %ymm2,%ymm14,%ymm2
- vbroadcasti128 (%r11),%ymm14
- vpaddd %ymm7,%ymm13,%ymm13
- vpxor %ymm3,%ymm13,%ymm3
- vpslld $12,%ymm3,%ymm15
- vpsrld $20,%ymm3,%ymm3
- vpor %ymm3,%ymm15,%ymm3
- vpaddd %ymm2,%ymm10,%ymm10
- vpxor %ymm6,%ymm10,%ymm6
- vpshufb %ymm14,%ymm6,%ymm6
- vpaddd %ymm3,%ymm11,%ymm11
- vpxor %ymm7,%ymm11,%ymm7
- vpshufb %ymm14,%ymm7,%ymm7
- vpaddd %ymm6,%ymm12,%ymm12
- vpxor %ymm2,%ymm12,%ymm2
- vpslld $7,%ymm2,%ymm15
- vpsrld $25,%ymm2,%ymm2
- vpor %ymm2,%ymm15,%ymm2
- vbroadcasti128 (%r9),%ymm15
- vpaddd %ymm7,%ymm13,%ymm13
- vpxor %ymm3,%ymm13,%ymm3
- vpslld $7,%ymm3,%ymm14
- vpsrld $25,%ymm3,%ymm3
- vpor %ymm3,%ymm14,%ymm3
- vpaddd %ymm1,%ymm8,%ymm8
- vpxor %ymm7,%ymm8,%ymm7
- vpshufb %ymm15,%ymm7,%ymm7
- vpaddd %ymm2,%ymm9,%ymm9
- vpxor %ymm4,%ymm9,%ymm4
- vpshufb %ymm15,%ymm4,%ymm4
- vpaddd %ymm7,%ymm12,%ymm12
- vpxor %ymm1,%ymm12,%ymm1
- vpslld $12,%ymm1,%ymm14
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm1,%ymm14,%ymm1
- vbroadcasti128 (%r11),%ymm14
- vpaddd %ymm4,%ymm13,%ymm13
- vpxor %ymm2,%ymm13,%ymm2
- vpslld $12,%ymm2,%ymm15
- vpsrld $20,%ymm2,%ymm2
- vpor %ymm2,%ymm15,%ymm2
- vpaddd %ymm1,%ymm8,%ymm8
- vpxor %ymm7,%ymm8,%ymm7
- vpshufb %ymm14,%ymm7,%ymm7
- vpaddd %ymm2,%ymm9,%ymm9
- vpxor %ymm4,%ymm9,%ymm4
- vpshufb %ymm14,%ymm4,%ymm4
- vpaddd %ymm7,%ymm12,%ymm12
- vpxor %ymm1,%ymm12,%ymm1
- vpslld $7,%ymm1,%ymm15
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm1,%ymm15,%ymm1
- vbroadcasti128 (%r9),%ymm15
- vpaddd %ymm4,%ymm13,%ymm13
- vpxor %ymm2,%ymm13,%ymm2
- vpslld $7,%ymm2,%ymm14
- vpsrld $25,%ymm2,%ymm2
- vpor %ymm2,%ymm14,%ymm2
- vmovdqa %ymm12,64(%rsp)
- vmovdqa %ymm13,96(%rsp)
- vmovdqa 0(%rsp),%ymm12
- vmovdqa 32(%rsp),%ymm13
- vpaddd %ymm3,%ymm10,%ymm10
- vpxor %ymm5,%ymm10,%ymm5
- vpshufb %ymm15,%ymm5,%ymm5
- vpaddd %ymm0,%ymm11,%ymm11
- vpxor %ymm6,%ymm11,%ymm6
- vpshufb %ymm15,%ymm6,%ymm6
- vpaddd %ymm5,%ymm12,%ymm12
- vpxor %ymm3,%ymm12,%ymm3
- vpslld $12,%ymm3,%ymm14
- vpsrld $20,%ymm3,%ymm3
- vpor %ymm3,%ymm14,%ymm3
- vbroadcasti128 (%r11),%ymm14
- vpaddd %ymm6,%ymm13,%ymm13
- vpxor %ymm0,%ymm13,%ymm0
- vpslld $12,%ymm0,%ymm15
- vpsrld $20,%ymm0,%ymm0
- vpor %ymm0,%ymm15,%ymm0
- vpaddd %ymm3,%ymm10,%ymm10
- vpxor %ymm5,%ymm10,%ymm5
- vpshufb %ymm14,%ymm5,%ymm5
- vpaddd %ymm0,%ymm11,%ymm11
- vpxor %ymm6,%ymm11,%ymm6
- vpshufb %ymm14,%ymm6,%ymm6
- vpaddd %ymm5,%ymm12,%ymm12
- vpxor %ymm3,%ymm12,%ymm3
- vpslld $7,%ymm3,%ymm15
- vpsrld $25,%ymm3,%ymm3
- vpor %ymm3,%ymm15,%ymm3
- vbroadcasti128 (%r9),%ymm15
- vpaddd %ymm6,%ymm13,%ymm13
- vpxor %ymm0,%ymm13,%ymm0
- vpslld $7,%ymm0,%ymm14
- vpsrld $25,%ymm0,%ymm0
- vpor %ymm0,%ymm14,%ymm0
- dec %eax
- jnz .Loop8x
-
- lea 0x200(%rsp),%rax # size optimization
- vpaddd 0x80-0x100(%rcx),%ymm8,%ymm8 # accumulate key
- vpaddd 0xa0-0x100(%rcx),%ymm9,%ymm9
- vpaddd 0xc0-0x100(%rcx),%ymm10,%ymm10
- vpaddd 0xe0-0x100(%rcx),%ymm11,%ymm11
-
- vpunpckldq %ymm9,%ymm8,%ymm14 # "de-interlace" data
- vpunpckldq %ymm11,%ymm10,%ymm15
- vpunpckhdq %ymm9,%ymm8,%ymm8
- vpunpckhdq %ymm11,%ymm10,%ymm10
- vpunpcklqdq %ymm15,%ymm14,%ymm9 # "a0"
- vpunpckhqdq %ymm15,%ymm14,%ymm14 # "a1"
- vpunpcklqdq %ymm10,%ymm8,%ymm11 # "a2"
- vpunpckhqdq %ymm10,%ymm8,%ymm8 # "a3"
- vpaddd 0x100-0x100(%rcx),%ymm0,%ymm0
- vpaddd 0x120-0x100(%rcx),%ymm1,%ymm1
- vpaddd 0x140-0x100(%rcx),%ymm2,%ymm2
- vpaddd 0x160-0x100(%rcx),%ymm3,%ymm3
-
- vpunpckldq %ymm1,%ymm0,%ymm10
- vpunpckldq %ymm3,%ymm2,%ymm15
- vpunpckhdq %ymm1,%ymm0,%ymm0
- vpunpckhdq %ymm3,%ymm2,%ymm2
- vpunpcklqdq %ymm15,%ymm10,%ymm1 # "b0"
- vpunpckhqdq %ymm15,%ymm10,%ymm10 # "b1"
- vpunpcklqdq %ymm2,%ymm0,%ymm3 # "b2"
- vpunpckhqdq %ymm2,%ymm0,%ymm0 # "b3"
- vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 # "de-interlace" further
- vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
- vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
- vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
- vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
- vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
- vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
- vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
- vmovdqa %ymm15,0x00(%rsp) # offload
- vmovdqa %ymm9,0x20(%rsp)
- vmovdqa 0x40(%rsp),%ymm15 # %ymm15
- vmovdqa 0x60(%rsp),%ymm9 # %ymm9
-
- vpaddd 0x180-0x200(%rax),%ymm12,%ymm12
- vpaddd 0x1a0-0x200(%rax),%ymm13,%ymm13
- vpaddd 0x1c0-0x200(%rax),%ymm15,%ymm15
- vpaddd 0x1e0-0x200(%rax),%ymm9,%ymm9
-
- vpunpckldq %ymm13,%ymm12,%ymm2
- vpunpckldq %ymm9,%ymm15,%ymm8
- vpunpckhdq %ymm13,%ymm12,%ymm12
- vpunpckhdq %ymm9,%ymm15,%ymm15
- vpunpcklqdq %ymm8,%ymm2,%ymm13 # "c0"
- vpunpckhqdq %ymm8,%ymm2,%ymm2 # "c1"
- vpunpcklqdq %ymm15,%ymm12,%ymm9 # "c2"
- vpunpckhqdq %ymm15,%ymm12,%ymm12 # "c3"
- vpaddd 0x200-0x200(%rax),%ymm4,%ymm4
- vpaddd 0x220-0x200(%rax),%ymm5,%ymm5
- vpaddd 0x240-0x200(%rax),%ymm6,%ymm6
- vpaddd 0x260-0x200(%rax),%ymm7,%ymm7
-
- vpunpckldq %ymm5,%ymm4,%ymm15
- vpunpckldq %ymm7,%ymm6,%ymm8
- vpunpckhdq %ymm5,%ymm4,%ymm4
- vpunpckhdq %ymm7,%ymm6,%ymm6
- vpunpcklqdq %ymm8,%ymm15,%ymm5 # "d0"
- vpunpckhqdq %ymm8,%ymm15,%ymm15 # "d1"
- vpunpcklqdq %ymm6,%ymm4,%ymm7 # "d2"
- vpunpckhqdq %ymm6,%ymm4,%ymm4 # "d3"
- vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 # "de-interlace" further
- vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
- vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
- vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
- vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
- vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
- vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
- vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
- vmovdqa 0x00(%rsp),%ymm6 # was offloaded, remember?
- vmovdqa 0x20(%rsp),%ymm12
-
- cmp $64*8,%rdx
- jb .Ltail8x
-
- vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
- vpxor 0x20(%rsi),%ymm8,%ymm8
- vpxor 0x40(%rsi),%ymm1,%ymm1
- vpxor 0x60(%rsi),%ymm5,%ymm5
- lea 0x80(%rsi),%rsi # size optimization
- vmovdqu %ymm6,0x00(%rdi)
- vmovdqu %ymm8,0x20(%rdi)
- vmovdqu %ymm1,0x40(%rdi)
- vmovdqu %ymm5,0x60(%rdi)
- lea 0x80(%rdi),%rdi # size optimization
-
- vpxor 0x00(%rsi),%ymm12,%ymm12
- vpxor 0x20(%rsi),%ymm13,%ymm13
- vpxor 0x40(%rsi),%ymm10,%ymm10
- vpxor 0x60(%rsi),%ymm15,%ymm15
- lea 0x80(%rsi),%rsi # size optimization
- vmovdqu %ymm12,0x00(%rdi)
- vmovdqu %ymm13,0x20(%rdi)
- vmovdqu %ymm10,0x40(%rdi)
- vmovdqu %ymm15,0x60(%rdi)
- lea 0x80(%rdi),%rdi # size optimization
-
- vpxor 0x00(%rsi),%ymm14,%ymm14
- vpxor 0x20(%rsi),%ymm2,%ymm2
- vpxor 0x40(%rsi),%ymm3,%ymm3
- vpxor 0x60(%rsi),%ymm7,%ymm7
- lea 0x80(%rsi),%rsi # size optimization
- vmovdqu %ymm14,0x00(%rdi)
- vmovdqu %ymm2,0x20(%rdi)
- vmovdqu %ymm3,0x40(%rdi)
- vmovdqu %ymm7,0x60(%rdi)
- lea 0x80(%rdi),%rdi # size optimization
-
- vpxor 0x00(%rsi),%ymm11,%ymm11
- vpxor 0x20(%rsi),%ymm9,%ymm9
- vpxor 0x40(%rsi),%ymm0,%ymm0
- vpxor 0x60(%rsi),%ymm4,%ymm4
- lea 0x80(%rsi),%rsi # size optimization
- vmovdqu %ymm11,0x00(%rdi)
- vmovdqu %ymm9,0x20(%rdi)
- vmovdqu %ymm0,0x40(%rdi)
- vmovdqu %ymm4,0x60(%rdi)
- lea 0x80(%rdi),%rdi # size optimization
-
- sub $64*8,%rdx
- jnz .Loop_outer8x
-
- jmp .Ldone8x
-
-.Ltail8x:
- cmp $448,%rdx
- jae .L448_or_more8x
- cmp $384,%rdx
- jae .L384_or_more8x
- cmp $320,%rdx
- jae .L320_or_more8x
- cmp $256,%rdx
- jae .L256_or_more8x
- cmp $192,%rdx
- jae .L192_or_more8x
- cmp $128,%rdx
- jae .L128_or_more8x
- cmp $64,%rdx
- jae .L64_or_more8x
-
- xor %r9,%r9
- vmovdqa %ymm6,0x00(%rsp)
- vmovdqa %ymm8,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L64_or_more8x:
- vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
- vpxor 0x20(%rsi),%ymm8,%ymm8
- vmovdqu %ymm6,0x00(%rdi)
- vmovdqu %ymm8,0x20(%rdi)
- je .Ldone8x
-
- lea 0x40(%rsi),%rsi # inp+=64*1
- xor %r9,%r9
- vmovdqa %ymm1,0x00(%rsp)
- lea 0x40(%rdi),%rdi # out+=64*1
- sub $64,%rdx # len-=64*1
- vmovdqa %ymm5,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L128_or_more8x:
- vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
- vpxor 0x20(%rsi),%ymm8,%ymm8
- vpxor 0x40(%rsi),%ymm1,%ymm1
- vpxor 0x60(%rsi),%ymm5,%ymm5
- vmovdqu %ymm6,0x00(%rdi)
- vmovdqu %ymm8,0x20(%rdi)
- vmovdqu %ymm1,0x40(%rdi)
- vmovdqu %ymm5,0x60(%rdi)
- je .Ldone8x
-
- lea 0x80(%rsi),%rsi # inp+=64*2
- xor %r9,%r9
- vmovdqa %ymm12,0x00(%rsp)
- lea 0x80(%rdi),%rdi # out+=64*2
- sub $128,%rdx # len-=64*2
- vmovdqa %ymm13,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L192_or_more8x:
- vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
- vpxor 0x20(%rsi),%ymm8,%ymm8
- vpxor 0x40(%rsi),%ymm1,%ymm1
- vpxor 0x60(%rsi),%ymm5,%ymm5
- vpxor 0x80(%rsi),%ymm12,%ymm12
- vpxor 0xa0(%rsi),%ymm13,%ymm13
- vmovdqu %ymm6,0x00(%rdi)
- vmovdqu %ymm8,0x20(%rdi)
- vmovdqu %ymm1,0x40(%rdi)
- vmovdqu %ymm5,0x60(%rdi)
- vmovdqu %ymm12,0x80(%rdi)
- vmovdqu %ymm13,0xa0(%rdi)
- je .Ldone8x
-
- lea 0xc0(%rsi),%rsi # inp+=64*3
- xor %r9,%r9
- vmovdqa %ymm10,0x00(%rsp)
- lea 0xc0(%rdi),%rdi # out+=64*3
- sub $192,%rdx # len-=64*3
- vmovdqa %ymm15,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L256_or_more8x:
- vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
- vpxor 0x20(%rsi),%ymm8,%ymm8
- vpxor 0x40(%rsi),%ymm1,%ymm1
- vpxor 0x60(%rsi),%ymm5,%ymm5
- vpxor 0x80(%rsi),%ymm12,%ymm12
- vpxor 0xa0(%rsi),%ymm13,%ymm13
- vpxor 0xc0(%rsi),%ymm10,%ymm10
- vpxor 0xe0(%rsi),%ymm15,%ymm15
- vmovdqu %ymm6,0x00(%rdi)
- vmovdqu %ymm8,0x20(%rdi)
- vmovdqu %ymm1,0x40(%rdi)
- vmovdqu %ymm5,0x60(%rdi)
- vmovdqu %ymm12,0x80(%rdi)
- vmovdqu %ymm13,0xa0(%rdi)
- vmovdqu %ymm10,0xc0(%rdi)
- vmovdqu %ymm15,0xe0(%rdi)
- je .Ldone8x
-
- lea 0x100(%rsi),%rsi # inp+=64*4
- xor %r9,%r9
- vmovdqa %ymm14,0x00(%rsp)
- lea 0x100(%rdi),%rdi # out+=64*4
- sub $256,%rdx # len-=64*4
- vmovdqa %ymm2,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L320_or_more8x:
- vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
- vpxor 0x20(%rsi),%ymm8,%ymm8
- vpxor 0x40(%rsi),%ymm1,%ymm1
- vpxor 0x60(%rsi),%ymm5,%ymm5
- vpxor 0x80(%rsi),%ymm12,%ymm12
- vpxor 0xa0(%rsi),%ymm13,%ymm13
- vpxor 0xc0(%rsi),%ymm10,%ymm10
- vpxor 0xe0(%rsi),%ymm15,%ymm15
- vpxor 0x100(%rsi),%ymm14,%ymm14
- vpxor 0x120(%rsi),%ymm2,%ymm2
- vmovdqu %ymm6,0x00(%rdi)
- vmovdqu %ymm8,0x20(%rdi)
- vmovdqu %ymm1,0x40(%rdi)
- vmovdqu %ymm5,0x60(%rdi)
- vmovdqu %ymm12,0x80(%rdi)
- vmovdqu %ymm13,0xa0(%rdi)
- vmovdqu %ymm10,0xc0(%rdi)
- vmovdqu %ymm15,0xe0(%rdi)
- vmovdqu %ymm14,0x100(%rdi)
- vmovdqu %ymm2,0x120(%rdi)
- je .Ldone8x
-
- lea 0x140(%rsi),%rsi # inp+=64*5
- xor %r9,%r9
- vmovdqa %ymm3,0x00(%rsp)
- lea 0x140(%rdi),%rdi # out+=64*5
- sub $320,%rdx # len-=64*5
- vmovdqa %ymm7,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L384_or_more8x:
- vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
- vpxor 0x20(%rsi),%ymm8,%ymm8
- vpxor 0x40(%rsi),%ymm1,%ymm1
- vpxor 0x60(%rsi),%ymm5,%ymm5
- vpxor 0x80(%rsi),%ymm12,%ymm12
- vpxor 0xa0(%rsi),%ymm13,%ymm13
- vpxor 0xc0(%rsi),%ymm10,%ymm10
- vpxor 0xe0(%rsi),%ymm15,%ymm15
- vpxor 0x100(%rsi),%ymm14,%ymm14
- vpxor 0x120(%rsi),%ymm2,%ymm2
- vpxor 0x140(%rsi),%ymm3,%ymm3
- vpxor 0x160(%rsi),%ymm7,%ymm7
- vmovdqu %ymm6,0x00(%rdi)
- vmovdqu %ymm8,0x20(%rdi)
- vmovdqu %ymm1,0x40(%rdi)
- vmovdqu %ymm5,0x60(%rdi)
- vmovdqu %ymm12,0x80(%rdi)
- vmovdqu %ymm13,0xa0(%rdi)
- vmovdqu %ymm10,0xc0(%rdi)
- vmovdqu %ymm15,0xe0(%rdi)
- vmovdqu %ymm14,0x100(%rdi)
- vmovdqu %ymm2,0x120(%rdi)
- vmovdqu %ymm3,0x140(%rdi)
- vmovdqu %ymm7,0x160(%rdi)
- je .Ldone8x
-
- lea 0x180(%rsi),%rsi # inp+=64*6
- xor %r9,%r9
- vmovdqa %ymm11,0x00(%rsp)
- lea 0x180(%rdi),%rdi # out+=64*6
- sub $384,%rdx # len-=64*6
- vmovdqa %ymm9,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L448_or_more8x:
- vpxor 0x00(%rsi),%ymm6,%ymm6 # xor with input
- vpxor 0x20(%rsi),%ymm8,%ymm8
- vpxor 0x40(%rsi),%ymm1,%ymm1
- vpxor 0x60(%rsi),%ymm5,%ymm5
- vpxor 0x80(%rsi),%ymm12,%ymm12
- vpxor 0xa0(%rsi),%ymm13,%ymm13
- vpxor 0xc0(%rsi),%ymm10,%ymm10
- vpxor 0xe0(%rsi),%ymm15,%ymm15
- vpxor 0x100(%rsi),%ymm14,%ymm14
- vpxor 0x120(%rsi),%ymm2,%ymm2
- vpxor 0x140(%rsi),%ymm3,%ymm3
- vpxor 0x160(%rsi),%ymm7,%ymm7
- vpxor 0x180(%rsi),%ymm11,%ymm11
- vpxor 0x1a0(%rsi),%ymm9,%ymm9
- vmovdqu %ymm6,0x00(%rdi)
- vmovdqu %ymm8,0x20(%rdi)
- vmovdqu %ymm1,0x40(%rdi)
- vmovdqu %ymm5,0x60(%rdi)
- vmovdqu %ymm12,0x80(%rdi)
- vmovdqu %ymm13,0xa0(%rdi)
- vmovdqu %ymm10,0xc0(%rdi)
- vmovdqu %ymm15,0xe0(%rdi)
- vmovdqu %ymm14,0x100(%rdi)
- vmovdqu %ymm2,0x120(%rdi)
- vmovdqu %ymm3,0x140(%rdi)
- vmovdqu %ymm7,0x160(%rdi)
- vmovdqu %ymm11,0x180(%rdi)
- vmovdqu %ymm9,0x1a0(%rdi)
- je .Ldone8x
-
- lea 0x1c0(%rsi),%rsi # inp+=64*7
- xor %r9,%r9
- vmovdqa %ymm0,0x00(%rsp)
- lea 0x1c0(%rdi),%rdi # out+=64*7
- sub $448,%rdx # len-=64*7
- vmovdqa %ymm4,0x20(%rsp)
-
-.Loop_tail8x:
- movzb (%rsi,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1(%rdi,%r9)
- dec %rdx
- jnz .Loop_tail8x
-
-.Ldone8x:
- vzeroall
- lea -8(%r10),%rsp
-.L8x_epilogue:
- ret
-SYM_FUNC_END(chacha20_avx2)
-#endif
-#ifdef CONFIG_AS_AVX512
-.align 32
-SYM_FUNC_START(chacha20_avx512)
-.Lchacha20_avx512: # XOR %rdx bytes from (%rsi) into (%rdi), four 64-byte blocks per pass; state rows come from .Lsigma, (%rcx) and (%r8)
- lea 8(%rsp),%r10 # frame pointer
- cmp $512,%rdx
- ja .Lchacha20_16x # more than 512 bytes: hand over to the 16-blocks-per-pass routine
-
- sub $64+8,%rsp # reserve stack scratch used by the byte-wise tail
- and $-64,%rsp # align the scratch to 64 bytes
- vbroadcasti32x4 .Lsigma(%rip),%zmm0 # row 0: 16 bytes at .Lsigma copied into all four 128-bit lanes
- vbroadcasti32x4 (%rcx),%zmm1 # row 1: first 16 bytes at (%rcx)
- vbroadcasti32x4 16(%rcx),%zmm2 # row 2: next 16 bytes at (%rcx)
- vbroadcasti32x4 (%r8),%zmm3 # row 3: 16 bytes at (%r8)
-
- vmovdqa32 %zmm0,%zmm16 # zmm16-zmm19 keep the input rows for the add after the rounds
- vmovdqa32 %zmm1,%zmm17
- vmovdqa32 %zmm2,%zmm18
- vpaddd .Lzeroz(%rip),%zmm3,%zmm3 # add per-lane offsets; .Lzeroz is defined outside this view, presumably counter offsets 0..3 - TODO confirm
- vmovdqa32 .Lfourz(%rip),%zmm20 # step added to row 3 on each outer pass; presumably +4 on the counter word - TODO confirm
- mov $10,%r8 # reuse %r8 as loop counter: 10 passes over the loop body below
- vmovdqa32 %zmm3,%zmm19
- jmp .Loop_avx512
-
-.align 16
-.Loop_outer_avx512:
- vmovdqa32 %zmm16,%zmm0 # restart from the saved input rows
- vmovdqa32 %zmm17,%zmm1
- vmovdqa32 %zmm18,%zmm2
- vpaddd %zmm20,%zmm19,%zmm3 # advance row 3 by the step held in zmm20
- mov $10,%r8
- vmovdqa32 %zmm3,%zmm19 # remember the advanced row 3 for the final add
- jmp .Loop_avx512
-
-.align 32
-.Loop_avx512:
- vpaddd %zmm1,%zmm0,%zmm0 # first half: add / xor / rotate by 16, 12, 8, 7 on rows as loaded
- vpxord %zmm0,%zmm3,%zmm3
- vprold $16,%zmm3,%zmm3
- vpaddd %zmm3,%zmm2,%zmm2
- vpxord %zmm2,%zmm1,%zmm1
- vprold $12,%zmm1,%zmm1
- vpaddd %zmm1,%zmm0,%zmm0
- vpxord %zmm0,%zmm3,%zmm3
- vprold $8,%zmm3,%zmm3
- vpaddd %zmm3,%zmm2,%zmm2
- vpxord %zmm2,%zmm1,%zmm1
- vprold $7,%zmm1,%zmm1
- vpshufd $78,%zmm2,%zmm2 # rotate the dwords of rows 1-3 inside each 128-bit lane before the second half
- vpshufd $57,%zmm1,%zmm1
- vpshufd $147,%zmm3,%zmm3
- vpaddd %zmm1,%zmm0,%zmm0 # second half: same add / xor / rotate sequence on the rotated rows
- vpxord %zmm0,%zmm3,%zmm3
- vprold $16,%zmm3,%zmm3
- vpaddd %zmm3,%zmm2,%zmm2
- vpxord %zmm2,%zmm1,%zmm1
- vprold $12,%zmm1,%zmm1
- vpaddd %zmm1,%zmm0,%zmm0
- vpxord %zmm0,%zmm3,%zmm3
- vprold $8,%zmm3,%zmm3
- vpaddd %zmm3,%zmm2,%zmm2
- vpxord %zmm2,%zmm1,%zmm1
- vprold $7,%zmm1,%zmm1
- vpshufd $78,%zmm2,%zmm2 # inverse rotation: rows 1 and 3 swap shuffle constants to undo the step above
- vpshufd $147,%zmm1,%zmm1
- vpshufd $57,%zmm3,%zmm3
- dec %r8
- jnz .Loop_avx512 # %r8 is zero once the loop falls through
- vpaddd %zmm16,%zmm0,%zmm0 # add the saved input rows back in
- vpaddd %zmm17,%zmm1,%zmm1
- vpaddd %zmm18,%zmm2,%zmm2
- vpaddd %zmm19,%zmm3,%zmm3
-
- sub $64,%rdx
- jb .Ltail64_avx512 # fewer than 64 bytes left: byte-wise tail with lane 0
-
- vpxor 0x00(%rsi),%xmm0,%xmm4 # xor with input
- vpxor 0x10(%rsi),%xmm1,%xmm5
- vpxor 0x20(%rsi),%xmm2,%xmm6
- vpxor 0x30(%rsi),%xmm3,%xmm7
- lea 0x40(%rsi),%rsi # inp+=64
-
- vmovdqu %xmm4,0x00(%rdi) # write output
- vmovdqu %xmm5,0x10(%rdi)
- vmovdqu %xmm6,0x20(%rdi)
- vmovdqu %xmm7,0x30(%rdi)
- lea 0x40(%rdi),%rdi # out+=64
-
- jz .Ldone_avx512 # tests the flags of the sub above; the vector ops and lea in between leave flags alone
-
- vextracti32x4 $1,%zmm0,%xmm4 # pull 128-bit lane 1 of each row: the next 64-byte block
- vextracti32x4 $1,%zmm1,%xmm5
- vextracti32x4 $1,%zmm2,%xmm6
- vextracti32x4 $1,%zmm3,%xmm7
-
- sub $64,%rdx
- jb .Ltail_avx512 # partial block: byte-wise tail with xmm4-xmm7
-
- vpxor 0x00(%rsi),%xmm4,%xmm4 # xor with input
- vpxor 0x10(%rsi),%xmm5,%xmm5
- vpxor 0x20(%rsi),%xmm6,%xmm6
- vpxor 0x30(%rsi),%xmm7,%xmm7
- lea 0x40(%rsi),%rsi # inp+=64
-
- vmovdqu %xmm4,0x00(%rdi) # write output
- vmovdqu %xmm5,0x10(%rdi)
- vmovdqu %xmm6,0x20(%rdi)
- vmovdqu %xmm7,0x30(%rdi)
- lea 0x40(%rdi),%rdi # out+=64
-
- jz .Ldone_avx512
-
- vextracti32x4 $2,%zmm0,%xmm4 # lane 2
- vextracti32x4 $2,%zmm1,%xmm5
- vextracti32x4 $2,%zmm2,%xmm6
- vextracti32x4 $2,%zmm3,%xmm7
-
- sub $64,%rdx
- jb .Ltail_avx512
-
- vpxor 0x00(%rsi),%xmm4,%xmm4 # xor with input
- vpxor 0x10(%rsi),%xmm5,%xmm5
- vpxor 0x20(%rsi),%xmm6,%xmm6
- vpxor 0x30(%rsi),%xmm7,%xmm7
- lea 0x40(%rsi),%rsi # inp+=64
-
- vmovdqu %xmm4,0x00(%rdi) # write output
- vmovdqu %xmm5,0x10(%rdi)
- vmovdqu %xmm6,0x20(%rdi)
- vmovdqu %xmm7,0x30(%rdi)
- lea 0x40(%rdi),%rdi # out+=64
-
- jz .Ldone_avx512
-
- vextracti32x4 $3,%zmm0,%xmm4 # lane 3
- vextracti32x4 $3,%zmm1,%xmm5
- vextracti32x4 $3,%zmm2,%xmm6
- vextracti32x4 $3,%zmm3,%xmm7
-
- sub $64,%rdx
- jb .Ltail_avx512
-
- vpxor 0x00(%rsi),%xmm4,%xmm4 # xor with input
- vpxor 0x10(%rsi),%xmm5,%xmm5
- vpxor 0x20(%rsi),%xmm6,%xmm6
- vpxor 0x30(%rsi),%xmm7,%xmm7
- lea 0x40(%rsi),%rsi # inp+=64
-
- vmovdqu %xmm4,0x00(%rdi) # write output
- vmovdqu %xmm5,0x10(%rdi)
- vmovdqu %xmm6,0x20(%rdi)
- vmovdqu %xmm7,0x30(%rdi)
- lea 0x40(%rdi),%rdi # out+=64
-
- jnz .Loop_outer_avx512 # bytes remain after all four lanes: compute four more blocks
-
- jmp .Ldone_avx512
-
-.align 16
-.Ltail64_avx512:
- vmovdqa %xmm0,0x00(%rsp) # spill lane 0 of the block to the stack scratch
- vmovdqa %xmm1,0x10(%rsp)
- vmovdqa %xmm2,0x20(%rsp)
- vmovdqa %xmm3,0x30(%rsp)
- add $64,%rdx # undo the failed sub: %rdx is the remaining byte count again
- jmp .Loop_tail_avx512
-
-.align 16
-.Ltail_avx512:
- vmovdqa %xmm4,0x00(%rsp) # spill the already extracted lane to the stack scratch
- vmovdqa %xmm5,0x10(%rsp)
- vmovdqa %xmm6,0x20(%rsp)
- vmovdqa %xmm7,0x30(%rsp)
- add $64,%rdx # undo the failed sub: %rdx is the remaining byte count again
-
-.Loop_tail_avx512:
- movzb (%rsi,%r8),%eax # %r8 is the byte index; it was counted down to zero by .Loop_avx512
- movzb (%rsp,%r8),%ecx # matching byte of the spilled block
- lea 1(%r8),%r8 # increment without touching flags
- xor %ecx,%eax
- mov %al,-1(%rdi,%r8) # -1 compensates for the increment above
- dec %rdx
- jnz .Loop_tail_avx512
-
- vmovdqu32 %zmm16,0x00(%rsp) # overwrite the spilled block on the stack with zmm16 (the .Lsigma row)
-
-.Ldone_avx512:
- vzeroall # NOTE(review): vzeroall clears zmm0-zmm15 only; zmm16-zmm20 keep their contents - confirm that is intended
- lea -8(%r10),%rsp # %r10 was entry %rsp + 8, so this restores the entry stack pointer
-.Lavx512_epilogue:
- ret
-SYM_FUNC_END(chacha20_avx512)
-.align 32
-SYM_FUNC_START(chacha20_avx512vl)
-.Lchacha20_avx512vl: # XOR %rdx bytes from (%rsi) into (%rdi), two 64-byte blocks per pass in ymm registers; state rows come from .Lsigma, (%rcx) and (%r8)
- lea 8(%rsp),%r10 # frame pointer
- cmp $128,%rdx
- ja .Lchacha20_8xvl # more than 128 bytes: hand over to the 8-blocks-per-pass routine
-
- sub $64+8,%rsp # reserve stack scratch used by the byte-wise tail
- and $-32,%rsp # align the scratch to 32 bytes
- vbroadcasti128 .Lsigma(%rip),%ymm0 # row 0: 16 bytes at .Lsigma copied into both 128-bit lanes
- vbroadcasti128 (%rcx),%ymm1 # row 1: first 16 bytes at (%rcx)
- vbroadcasti128 16(%rcx),%ymm2 # row 2: next 16 bytes at (%rcx)
- vbroadcasti128 (%r8),%ymm3 # row 3: 16 bytes at (%r8)
-
- vmovdqa32 %ymm0,%ymm16 # ymm16-ymm19 keep the input rows for the add after the rounds
- vmovdqa32 %ymm1,%ymm17
- vmovdqa32 %ymm2,%ymm18
- vpaddd .Lzeroz(%rip),%ymm3,%ymm3 # add per-lane offsets; .Lzeroz is defined outside this view, presumably counter offsets 0 and 1 - TODO confirm
- vmovdqa32 .Ltwoy(%rip),%ymm20 # step added to row 3 on each outer pass; presumably +2 on the counter word - TODO confirm
- mov $10,%r8 # reuse %r8 as loop counter: 10 passes over the loop body below
- vmovdqa32 %ymm3,%ymm19
- jmp .Loop_avx512vl
-
-.align 16
-.Loop_outer_avx512vl:
- vmovdqa32 %ymm18,%ymm2 # rows 0 and 1 were already restored just before the jump to this label
- vpaddd %ymm20,%ymm19,%ymm3 # advance row 3 by the step held in ymm20
- mov $10,%r8
- vmovdqa32 %ymm3,%ymm19 # remember the advanced row 3 for the final add
- jmp .Loop_avx512vl
-
-.align 32
-.Loop_avx512vl:
- vpaddd %ymm1,%ymm0,%ymm0 # first half: add / xor / rotate by 16, 12, 8, 7 on rows as loaded
- vpxor %ymm0,%ymm3,%ymm3
- vprold $16,%ymm3,%ymm3
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vprold $12,%ymm1,%ymm1
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vprold $8,%ymm3,%ymm3
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vprold $7,%ymm1,%ymm1
- vpshufd $78,%ymm2,%ymm2 # rotate the dwords of rows 1-3 inside each 128-bit lane before the second half
- vpshufd $57,%ymm1,%ymm1
- vpshufd $147,%ymm3,%ymm3
- vpaddd %ymm1,%ymm0,%ymm0 # second half: same add / xor / rotate sequence on the rotated rows
- vpxor %ymm0,%ymm3,%ymm3
- vprold $16,%ymm3,%ymm3
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vprold $12,%ymm1,%ymm1
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vprold $8,%ymm3,%ymm3
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vprold $7,%ymm1,%ymm1
- vpshufd $78,%ymm2,%ymm2 # inverse rotation: rows 1 and 3 swap shuffle constants to undo the step above
- vpshufd $147,%ymm1,%ymm1
- vpshufd $57,%ymm3,%ymm3
- dec %r8
- jnz .Loop_avx512vl # %r8 is zero once the loop falls through
- vpaddd %ymm16,%ymm0,%ymm0 # add the saved input rows back in
- vpaddd %ymm17,%ymm1,%ymm1
- vpaddd %ymm18,%ymm2,%ymm2
- vpaddd %ymm19,%ymm3,%ymm3
-
- sub $64,%rdx
- jb .Ltail64_avx512vl # fewer than 64 bytes left: byte-wise tail with lane 0
-
- vpxor 0x00(%rsi),%xmm0,%xmm4 # xor with input
- vpxor 0x10(%rsi),%xmm1,%xmm5
- vpxor 0x20(%rsi),%xmm2,%xmm6
- vpxor 0x30(%rsi),%xmm3,%xmm7
- lea 0x40(%rsi),%rsi # inp+=64
-
- vmovdqu %xmm4,0x00(%rdi) # write output
- vmovdqu %xmm5,0x10(%rdi)
- vmovdqu %xmm6,0x20(%rdi)
- vmovdqu %xmm7,0x30(%rdi)
- lea 0x40(%rdi),%rdi # out+=64
-
- jz .Ldone_avx512vl # tests the flags of the sub above; the vector ops and lea in between leave flags alone
-
- vextracti128 $1,%ymm0,%xmm4 # pull the upper 128-bit lane of each row: the second 64-byte block
- vextracti128 $1,%ymm1,%xmm5
- vextracti128 $1,%ymm2,%xmm6
- vextracti128 $1,%ymm3,%xmm7
-
- sub $64,%rdx
- jb .Ltail_avx512vl # partial block: byte-wise tail with xmm4-xmm7
-
- vpxor 0x00(%rsi),%xmm4,%xmm4 # xor with input
- vpxor 0x10(%rsi),%xmm5,%xmm5
- vpxor 0x20(%rsi),%xmm6,%xmm6
- vpxor 0x30(%rsi),%xmm7,%xmm7
- lea 0x40(%rsi),%rsi # inp+=64
-
- vmovdqu %xmm4,0x00(%rdi) # write output
- vmovdqu %xmm5,0x10(%rdi)
- vmovdqu %xmm6,0x20(%rdi)
- vmovdqu %xmm7,0x30(%rdi)
- lea 0x40(%rdi),%rdi # out+=64
-
- vmovdqa32 %ymm16,%ymm0 # restore rows 0 and 1 here; these moves do not disturb the flags tested next
- vmovdqa32 %ymm17,%ymm1
- jnz .Loop_outer_avx512vl # bytes remain after both lanes: compute two more blocks
-
- jmp .Ldone_avx512vl
-
-.align 16
-.Ltail64_avx512vl:
- vmovdqa %xmm0,0x00(%rsp) # spill lane 0 of the block to the stack scratch
- vmovdqa %xmm1,0x10(%rsp)
- vmovdqa %xmm2,0x20(%rsp)
- vmovdqa %xmm3,0x30(%rsp)
- add $64,%rdx # undo the failed sub: %rdx is the remaining byte count again
- jmp .Loop_tail_avx512vl
-
-.align 16
-.Ltail_avx512vl:
- vmovdqa %xmm4,0x00(%rsp) # spill the already extracted lane to the stack scratch
- vmovdqa %xmm5,0x10(%rsp)
- vmovdqa %xmm6,0x20(%rsp)
- vmovdqa %xmm7,0x30(%rsp)
- add $64,%rdx # undo the failed sub: %rdx is the remaining byte count again
-
-.Loop_tail_avx512vl:
- movzb (%rsi,%r8),%eax # %r8 is the byte index; it was counted down to zero by .Loop_avx512vl
- movzb (%rsp,%r8),%ecx # matching byte of the spilled block
- lea 1(%r8),%r8 # increment without touching flags
- xor %ecx,%eax
- mov %al,-1(%rdi,%r8) # -1 compensates for the increment above
- dec %rdx
- jnz .Loop_tail_avx512vl
-
- vmovdqu32 %ymm16,0x00(%rsp) # overwrite all 64 spilled bytes with ymm16 (the .Lsigma row), 32 bytes per store
- vmovdqu32 %ymm16,0x20(%rsp)
-
-.Ldone_avx512vl:
- vzeroall # NOTE(review): vzeroall clears registers 0-15 only; ymm16-ymm20 keep their contents - confirm that is intended
- lea -8(%r10),%rsp # %r10 was entry %rsp + 8, so this restores the entry stack pointer
-.Lavx512vl_epilogue:
- ret
-SYM_FUNC_END(chacha20_avx512vl)
-.type chacha20_16x,@function
-.align 32
-chacha20_16x:
-.Lchacha20_16x: # 16 blocks per pass: each of zmm0-zmm15 holds one 32-bit state word for 16 blocks; reached from chacha20_avx512 when %rdx > 512
- lea 8(%rsp),%r10 # frame register
- sub $64+8,%rsp # reserve stack scratch used by the byte-wise tail
- and $-64,%rsp # align the scratch to 64 bytes
- vzeroupper
-
- lea .Lsigma(%rip),%r9 # %r9 keeps pointing at .Lsigma until the tail reuses it as an index
- vbroadcasti32x4 (%r9),%zmm3 # key[0]
- vbroadcasti32x4 (%rcx),%zmm7 # key[1]
- vbroadcasti32x4 16(%rcx),%zmm11 # key[2]
- vbroadcasti32x4 (%r8),%zmm15 # key[3]
-
- vpshufd $0x00,%zmm3,%zmm0 # smash key by lanes...
- vpshufd $0x55,%zmm3,%zmm1 # 0x00/0x55/0xaa/0xff replicate dword 0/1/2/3 of each 128-bit lane
- vpshufd $0xaa,%zmm3,%zmm2
- vpshufd $0xff,%zmm3,%zmm3
- vmovdqa64 %zmm0,%zmm16 # zmm16-zmm19: saved copies of words 0-3
- vmovdqa64 %zmm1,%zmm17
- vmovdqa64 %zmm2,%zmm18
- vmovdqa64 %zmm3,%zmm19
-
- vpshufd $0x00,%zmm7,%zmm4
- vpshufd $0x55,%zmm7,%zmm5
- vpshufd $0xaa,%zmm7,%zmm6
- vpshufd $0xff,%zmm7,%zmm7
- vmovdqa64 %zmm4,%zmm20 # zmm20-zmm23: saved copies of words 4-7
- vmovdqa64 %zmm5,%zmm21
- vmovdqa64 %zmm6,%zmm22
- vmovdqa64 %zmm7,%zmm23
-
- vpshufd $0x00,%zmm11,%zmm8
- vpshufd $0x55,%zmm11,%zmm9
- vpshufd $0xaa,%zmm11,%zmm10
- vpshufd $0xff,%zmm11,%zmm11
- vmovdqa64 %zmm8,%zmm24 # zmm24-zmm27: saved copies of words 8-11
- vmovdqa64 %zmm9,%zmm25
- vmovdqa64 %zmm10,%zmm26
- vmovdqa64 %zmm11,%zmm27
-
- vpshufd $0x00,%zmm15,%zmm12
- vpshufd $0x55,%zmm15,%zmm13
- vpshufd $0xaa,%zmm15,%zmm14
- vpshufd $0xff,%zmm15,%zmm15
- vpaddd .Lincz(%rip),%zmm12,%zmm12 # don't save counters yet
- vmovdqa64 %zmm12,%zmm28 # zmm28-zmm31: saved copies of words 12-15, word 12 already offset by .Lincz
- vmovdqa64 %zmm13,%zmm29
- vmovdqa64 %zmm14,%zmm30
- vmovdqa64 %zmm15,%zmm31
-
- mov $10,%eax # 10 passes over .Loop16x
- jmp .Loop16x
-
-.align 32
-.Loop_outer16x:
- vpbroadcastd 0(%r9),%zmm0 # reload key
- vpbroadcastd 4(%r9),%zmm1
- vpbroadcastd 8(%r9),%zmm2
- vpbroadcastd 12(%r9),%zmm3
- vpaddd .Lsixteen(%rip),%zmm28,%zmm28 # next SIMD counters
- vmovdqa64 %zmm20,%zmm4
- vmovdqa64 %zmm21,%zmm5
- vmovdqa64 %zmm22,%zmm6
- vmovdqa64 %zmm23,%zmm7
- vmovdqa64 %zmm24,%zmm8
- vmovdqa64 %zmm25,%zmm9
- vmovdqa64 %zmm26,%zmm10
- vmovdqa64 %zmm27,%zmm11
- vmovdqa64 %zmm28,%zmm12
- vmovdqa64 %zmm29,%zmm13
- vmovdqa64 %zmm30,%zmm14
- vmovdqa64 %zmm31,%zmm15
-
- vmovdqa64 %zmm0,%zmm16 # zmm16-zmm19 were overwritten by the de-interlace of the previous pass; save words 0-3 again
- vmovdqa64 %zmm1,%zmm17
- vmovdqa64 %zmm2,%zmm18
- vmovdqa64 %zmm3,%zmm19
-
- mov $10,%eax
- jmp .Loop16x
-
-.align 32
-.Loop16x:
- vpaddd %zmm4,%zmm0,%zmm0 # first half: word groups (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15), four at once
- vpaddd %zmm5,%zmm1,%zmm1
- vpaddd %zmm6,%zmm2,%zmm2
- vpaddd %zmm7,%zmm3,%zmm3
- vpxord %zmm0,%zmm12,%zmm12
- vpxord %zmm1,%zmm13,%zmm13
- vpxord %zmm2,%zmm14,%zmm14
- vpxord %zmm3,%zmm15,%zmm15
- vprold $16,%zmm12,%zmm12
- vprold $16,%zmm13,%zmm13
- vprold $16,%zmm14,%zmm14
- vprold $16,%zmm15,%zmm15
- vpaddd %zmm12,%zmm8,%zmm8
- vpaddd %zmm13,%zmm9,%zmm9
- vpaddd %zmm14,%zmm10,%zmm10
- vpaddd %zmm15,%zmm11,%zmm11
- vpxord %zmm8,%zmm4,%zmm4
- vpxord %zmm9,%zmm5,%zmm5
- vpxord %zmm10,%zmm6,%zmm6
- vpxord %zmm11,%zmm7,%zmm7
- vprold $12,%zmm4,%zmm4
- vprold $12,%zmm5,%zmm5
- vprold $12,%zmm6,%zmm6
- vprold $12,%zmm7,%zmm7
- vpaddd %zmm4,%zmm0,%zmm0
- vpaddd %zmm5,%zmm1,%zmm1
- vpaddd %zmm6,%zmm2,%zmm2
- vpaddd %zmm7,%zmm3,%zmm3
- vpxord %zmm0,%zmm12,%zmm12
- vpxord %zmm1,%zmm13,%zmm13
- vpxord %zmm2,%zmm14,%zmm14
- vpxord %zmm3,%zmm15,%zmm15
- vprold $8,%zmm12,%zmm12
- vprold $8,%zmm13,%zmm13
- vprold $8,%zmm14,%zmm14
- vprold $8,%zmm15,%zmm15
- vpaddd %zmm12,%zmm8,%zmm8
- vpaddd %zmm13,%zmm9,%zmm9
- vpaddd %zmm14,%zmm10,%zmm10
- vpaddd %zmm15,%zmm11,%zmm11
- vpxord %zmm8,%zmm4,%zmm4
- vpxord %zmm9,%zmm5,%zmm5
- vpxord %zmm10,%zmm6,%zmm6
- vpxord %zmm11,%zmm7,%zmm7
- vprold $7,%zmm4,%zmm4
- vprold $7,%zmm5,%zmm5
- vprold $7,%zmm6,%zmm6
- vprold $7,%zmm7,%zmm7
- vpaddd %zmm5,%zmm0,%zmm0 # second half: word groups (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14), chosen by operand order, no shuffles
- vpaddd %zmm6,%zmm1,%zmm1
- vpaddd %zmm7,%zmm2,%zmm2
- vpaddd %zmm4,%zmm3,%zmm3
- vpxord %zmm0,%zmm15,%zmm15
- vpxord %zmm1,%zmm12,%zmm12
- vpxord %zmm2,%zmm13,%zmm13
- vpxord %zmm3,%zmm14,%zmm14
- vprold $16,%zmm15,%zmm15
- vprold $16,%zmm12,%zmm12
- vprold $16,%zmm13,%zmm13
- vprold $16,%zmm14,%zmm14
- vpaddd %zmm15,%zmm10,%zmm10
- vpaddd %zmm12,%zmm11,%zmm11
- vpaddd %zmm13,%zmm8,%zmm8
- vpaddd %zmm14,%zmm9,%zmm9
- vpxord %zmm10,%zmm5,%zmm5
- vpxord %zmm11,%zmm6,%zmm6
- vpxord %zmm8,%zmm7,%zmm7
- vpxord %zmm9,%zmm4,%zmm4
- vprold $12,%zmm5,%zmm5
- vprold $12,%zmm6,%zmm6
- vprold $12,%zmm7,%zmm7
- vprold $12,%zmm4,%zmm4
- vpaddd %zmm5,%zmm0,%zmm0
- vpaddd %zmm6,%zmm1,%zmm1
- vpaddd %zmm7,%zmm2,%zmm2
- vpaddd %zmm4,%zmm3,%zmm3
- vpxord %zmm0,%zmm15,%zmm15
- vpxord %zmm1,%zmm12,%zmm12
- vpxord %zmm2,%zmm13,%zmm13
- vpxord %zmm3,%zmm14,%zmm14
- vprold $8,%zmm15,%zmm15
- vprold $8,%zmm12,%zmm12
- vprold $8,%zmm13,%zmm13
- vprold $8,%zmm14,%zmm14
- vpaddd %zmm15,%zmm10,%zmm10
- vpaddd %zmm12,%zmm11,%zmm11
- vpaddd %zmm13,%zmm8,%zmm8
- vpaddd %zmm14,%zmm9,%zmm9
- vpxord %zmm10,%zmm5,%zmm5
- vpxord %zmm11,%zmm6,%zmm6
- vpxord %zmm8,%zmm7,%zmm7
- vpxord %zmm9,%zmm4,%zmm4
- vprold $7,%zmm5,%zmm5
- vprold $7,%zmm6,%zmm6
- vprold $7,%zmm7,%zmm7
- vprold $7,%zmm4,%zmm4
- dec %eax
- jnz .Loop16x
-
- vpaddd %zmm16,%zmm0,%zmm0 # accumulate key
- vpaddd %zmm17,%zmm1,%zmm1
- vpaddd %zmm18,%zmm2,%zmm2
- vpaddd %zmm19,%zmm3,%zmm3
-
- vpunpckldq %zmm1,%zmm0,%zmm18 # "de-interlace" data
- vpunpckldq %zmm3,%zmm2,%zmm19 # zmm18/zmm19 become temporaries from here on; the saved words 2 and 3 are gone
- vpunpckhdq %zmm1,%zmm0,%zmm0
- vpunpckhdq %zmm3,%zmm2,%zmm2
- vpunpcklqdq %zmm19,%zmm18,%zmm1 # "a0"
- vpunpckhqdq %zmm19,%zmm18,%zmm18 # "a1"
- vpunpcklqdq %zmm2,%zmm0,%zmm3 # "a2"
- vpunpckhqdq %zmm2,%zmm0,%zmm0 # "a3"
- vpaddd %zmm20,%zmm4,%zmm4 # add saved words 4-7
- vpaddd %zmm21,%zmm5,%zmm5
- vpaddd %zmm22,%zmm6,%zmm6
- vpaddd %zmm23,%zmm7,%zmm7
-
- vpunpckldq %zmm5,%zmm4,%zmm2
- vpunpckldq %zmm7,%zmm6,%zmm19
- vpunpckhdq %zmm5,%zmm4,%zmm4
- vpunpckhdq %zmm7,%zmm6,%zmm6
- vpunpcklqdq %zmm19,%zmm2,%zmm5 # "b0"
- vpunpckhqdq %zmm19,%zmm2,%zmm2 # "b1"
- vpunpcklqdq %zmm6,%zmm4,%zmm7 # "b2"
- vpunpckhqdq %zmm6,%zmm4,%zmm4 # "b3"
- vshufi32x4 $0x44,%zmm5,%zmm1,%zmm19 # "de-interlace" further
- vshufi32x4 $0xee,%zmm5,%zmm1,%zmm5
- vshufi32x4 $0x44,%zmm2,%zmm18,%zmm1
- vshufi32x4 $0xee,%zmm2,%zmm18,%zmm2
- vshufi32x4 $0x44,%zmm7,%zmm3,%zmm18
- vshufi32x4 $0xee,%zmm7,%zmm3,%zmm7
- vshufi32x4 $0x44,%zmm4,%zmm0,%zmm3
- vshufi32x4 $0xee,%zmm4,%zmm0,%zmm4
- vpaddd %zmm24,%zmm8,%zmm8 # add saved words 8-11
- vpaddd %zmm25,%zmm9,%zmm9
- vpaddd %zmm26,%zmm10,%zmm10
- vpaddd %zmm27,%zmm11,%zmm11
-
- vpunpckldq %zmm9,%zmm8,%zmm6
- vpunpckldq %zmm11,%zmm10,%zmm0
- vpunpckhdq %zmm9,%zmm8,%zmm8
- vpunpckhdq %zmm11,%zmm10,%zmm10
- vpunpcklqdq %zmm0,%zmm6,%zmm9 # "c0"
- vpunpckhqdq %zmm0,%zmm6,%zmm6 # "c1"
- vpunpcklqdq %zmm10,%zmm8,%zmm11 # "c2"
- vpunpckhqdq %zmm10,%zmm8,%zmm8 # "c3"
- vpaddd %zmm28,%zmm12,%zmm12 # add saved words 12-15
- vpaddd %zmm29,%zmm13,%zmm13
- vpaddd %zmm30,%zmm14,%zmm14
- vpaddd %zmm31,%zmm15,%zmm15
-
- vpunpckldq %zmm13,%zmm12,%zmm10
- vpunpckldq %zmm15,%zmm14,%zmm0
- vpunpckhdq %zmm13,%zmm12,%zmm12
- vpunpckhdq %zmm15,%zmm14,%zmm14
- vpunpcklqdq %zmm0,%zmm10,%zmm13 # "d0"
- vpunpckhqdq %zmm0,%zmm10,%zmm10 # "d1"
- vpunpcklqdq %zmm14,%zmm12,%zmm15 # "d2"
- vpunpckhqdq %zmm14,%zmm12,%zmm12 # "d3"
- vshufi32x4 $0x44,%zmm13,%zmm9,%zmm0 # "de-interlace" further
- vshufi32x4 $0xee,%zmm13,%zmm9,%zmm13
- vshufi32x4 $0x44,%zmm10,%zmm6,%zmm9
- vshufi32x4 $0xee,%zmm10,%zmm6,%zmm10
- vshufi32x4 $0x44,%zmm15,%zmm11,%zmm6
- vshufi32x4 $0xee,%zmm15,%zmm11,%zmm15
- vshufi32x4 $0x44,%zmm12,%zmm8,%zmm11
- vshufi32x4 $0xee,%zmm12,%zmm8,%zmm12
- vshufi32x4 $0x88,%zmm0,%zmm19,%zmm16 # "de-interlace" further
- vshufi32x4 $0xdd,%zmm0,%zmm19,%zmm19
- vshufi32x4 $0x88,%zmm13,%zmm5,%zmm0
- vshufi32x4 $0xdd,%zmm13,%zmm5,%zmm13
- vshufi32x4 $0x88,%zmm9,%zmm1,%zmm17
- vshufi32x4 $0xdd,%zmm9,%zmm1,%zmm1
- vshufi32x4 $0x88,%zmm10,%zmm2,%zmm9
- vshufi32x4 $0xdd,%zmm10,%zmm2,%zmm10
- vshufi32x4 $0x88,%zmm6,%zmm18,%zmm14
- vshufi32x4 $0xdd,%zmm6,%zmm18,%zmm18
- vshufi32x4 $0x88,%zmm15,%zmm7,%zmm6
- vshufi32x4 $0xdd,%zmm15,%zmm7,%zmm15
- vshufi32x4 $0x88,%zmm11,%zmm3,%zmm8
- vshufi32x4 $0xdd,%zmm11,%zmm3,%zmm3
- vshufi32x4 $0x88,%zmm12,%zmm4,%zmm11
- vshufi32x4 $0xdd,%zmm12,%zmm4,%zmm12
- cmp $64*16,%rdx
- jb .Ltail16x # fewer than 16 full blocks left
-
- vpxord 0x00(%rsi),%zmm16,%zmm16 # xor with input
- vpxord 0x40(%rsi),%zmm17,%zmm17 # output order of the 16 blocks: zmm16,17,14,8,19,1,18,3,0,9,6,11,13,10,15,12
- vpxord 0x80(%rsi),%zmm14,%zmm14
- vpxord 0xc0(%rsi),%zmm8,%zmm8
- vmovdqu32 %zmm16,0x00(%rdi)
- vmovdqu32 %zmm17,0x40(%rdi)
- vmovdqu32 %zmm14,0x80(%rdi)
- vmovdqu32 %zmm8,0xc0(%rdi)
-
- vpxord 0x100(%rsi),%zmm19,%zmm19
- vpxord 0x140(%rsi),%zmm1,%zmm1
- vpxord 0x180(%rsi),%zmm18,%zmm18
- vpxord 0x1c0(%rsi),%zmm3,%zmm3
- vmovdqu32 %zmm19,0x100(%rdi)
- vmovdqu32 %zmm1,0x140(%rdi)
- vmovdqu32 %zmm18,0x180(%rdi)
- vmovdqu32 %zmm3,0x1c0(%rdi)
-
- vpxord 0x200(%rsi),%zmm0,%zmm0
- vpxord 0x240(%rsi),%zmm9,%zmm9
- vpxord 0x280(%rsi),%zmm6,%zmm6
- vpxord 0x2c0(%rsi),%zmm11,%zmm11
- vmovdqu32 %zmm0,0x200(%rdi)
- vmovdqu32 %zmm9,0x240(%rdi)
- vmovdqu32 %zmm6,0x280(%rdi)
- vmovdqu32 %zmm11,0x2c0(%rdi)
-
- vpxord 0x300(%rsi),%zmm13,%zmm13
- vpxord 0x340(%rsi),%zmm10,%zmm10
- vpxord 0x380(%rsi),%zmm15,%zmm15
- vpxord 0x3c0(%rsi),%zmm12,%zmm12
- lea 0x400(%rsi),%rsi # inp += 64*16
- vmovdqu32 %zmm13,0x300(%rdi)
- vmovdqu32 %zmm10,0x340(%rdi)
- vmovdqu32 %zmm15,0x380(%rdi)
- vmovdqu32 %zmm12,0x3c0(%rdi)
- lea 0x400(%rdi),%rdi # out += 64*16
-
- sub $64*16,%rdx
- jnz .Loop_outer16x
-
- jmp .Ldone16x
-
-.align 32
-.Ltail16x:
- xor %r9,%r9 # %r9 stops being the .Lsigma pointer and becomes the byte index for .Loop_tail16x
- sub %rsi,%rdi # %rdi = out - inp, so (%rdi,%rsi) addresses the output while only %rsi advances
- cmp $64*1,%rdx # %rdx is not decremented in this ladder; it is compared against growing multiples of 64
- jb .Less_than_64_16x # partial block: its keystream is in zmm16
- vpxord (%rsi),%zmm16,%zmm16 # xor with input
- vmovdqu32 %zmm16,(%rdi,%rsi)
- je .Ldone16x # flags still from the cmp: length was exactly this many blocks
- vmovdqa32 %zmm17,%zmm16 # stage the next keystream block in zmm16 in case the rest is partial
- lea 64(%rsi),%rsi
-
- cmp $64*2,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm17,%zmm17
- vmovdqu32 %zmm17,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm14,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*3,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm14,%zmm14
- vmovdqu32 %zmm14,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm8,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*4,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm8,%zmm8
- vmovdqu32 %zmm8,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm19,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*5,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm19,%zmm19
- vmovdqu32 %zmm19,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm1,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*6,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm1,%zmm1
- vmovdqu32 %zmm1,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm18,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*7,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm18,%zmm18
- vmovdqu32 %zmm18,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm3,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*8,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm3,%zmm3
- vmovdqu32 %zmm3,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm0,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*9,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm0,%zmm0
- vmovdqu32 %zmm0,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm9,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*10,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm9,%zmm9
- vmovdqu32 %zmm9,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm6,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*11,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm6,%zmm6
- vmovdqu32 %zmm6,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm11,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*12,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm11,%zmm11
- vmovdqu32 %zmm11,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm13,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*13,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm13,%zmm13
- vmovdqu32 %zmm13,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm10,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*14,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm10,%zmm10
- vmovdqu32 %zmm10,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm15,%zmm16
- lea 64(%rsi),%rsi
-
- cmp $64*15,%rdx
- jb .Less_than_64_16x
- vpxord (%rsi),%zmm15,%zmm15
- vmovdqu32 %zmm15,(%rdi,%rsi)
- je .Ldone16x
- vmovdqa32 %zmm12,%zmm16 # last block; falls through because more than 15 but fewer than 16 blocks remain
- lea 64(%rsi),%rsi
-
-.Less_than_64_16x:
- vmovdqa32 %zmm16,0x00(%rsp) # spill the staged keystream block to the stack scratch
- lea (%rdi,%rsi),%rdi # turn %rdi back into an absolute output pointer
- and $63,%rdx # bytes left in the partial block
-
-.Loop_tail16x:
- movzb (%rsi,%r9),%eax # input byte
- movzb (%rsp,%r9),%ecx # matching byte of the spilled block
- lea 1(%r9),%r9 # increment without touching flags
- xor %ecx,%eax
- mov %al,-1(%rdi,%r9) # -1 compensates for the increment above
- dec %rdx
- jnz .Loop_tail16x
-
- vpxord %zmm16,%zmm16,%zmm16 # zero zmm16 ...
- vmovdqa32 %zmm16,0(%rsp) # ... and use it to overwrite the spilled block on the stack
-
-.Ldone16x:
- vzeroall # NOTE(review): vzeroall clears zmm0-zmm15 only; zmm17-zmm31 still hold state and key words - confirm that is intended
- lea -8(%r10),%rsp # %r10 was entry %rsp + 8, so this restores the entry stack pointer
-.L16x_epilogue:
- ret
-.size chacha20_16x,.-chacha20_16x
-.type chacha20_8xvl,@function
-.align 32
-chacha20_8xvl:
-.Lchacha20_8xvl:
- lea 8(%rsp),%r10 # frame register
- sub $64+8,%rsp
- and $-64,%rsp
- vzeroupper
-
- lea .Lsigma(%rip),%r9
- vbroadcasti128 (%r9),%ymm3 # key[0]
- vbroadcasti128 (%rcx),%ymm7 # key[1]
- vbroadcasti128 16(%rcx),%ymm11 # key[2]
- vbroadcasti128 (%r8),%ymm15 # key[3]
-
- vpshufd $0x00,%ymm3,%ymm0 # smash key by lanes...
- vpshufd $0x55,%ymm3,%ymm1
- vpshufd $0xaa,%ymm3,%ymm2
- vpshufd $0xff,%ymm3,%ymm3
- vmovdqa64 %ymm0,%ymm16
- vmovdqa64 %ymm1,%ymm17
- vmovdqa64 %ymm2,%ymm18
- vmovdqa64 %ymm3,%ymm19
-
- vpshufd $0x00,%ymm7,%ymm4
- vpshufd $0x55,%ymm7,%ymm5
- vpshufd $0xaa,%ymm7,%ymm6
- vpshufd $0xff,%ymm7,%ymm7
- vmovdqa64 %ymm4,%ymm20
- vmovdqa64 %ymm5,%ymm21
- vmovdqa64 %ymm6,%ymm22
- vmovdqa64 %ymm7,%ymm23
-
- vpshufd $0x00,%ymm11,%ymm8
- vpshufd $0x55,%ymm11,%ymm9
- vpshufd $0xaa,%ymm11,%ymm10
- vpshufd $0xff,%ymm11,%ymm11
- vmovdqa64 %ymm8,%ymm24
- vmovdqa64 %ymm9,%ymm25
- vmovdqa64 %ymm10,%ymm26
- vmovdqa64 %ymm11,%ymm27
-
- vpshufd $0x00,%ymm15,%ymm12
- vpshufd $0x55,%ymm15,%ymm13
- vpshufd $0xaa,%ymm15,%ymm14
- vpshufd $0xff,%ymm15,%ymm15
- vpaddd .Lincy(%rip),%ymm12,%ymm12 # don't save counters yet
- vmovdqa64 %ymm12,%ymm28
- vmovdqa64 %ymm13,%ymm29
- vmovdqa64 %ymm14,%ymm30
- vmovdqa64 %ymm15,%ymm31
-
- mov $10,%eax
- jmp .Loop8xvl
-
-.align 32
-.Loop_outer8xvl:
- #vpbroadcastd 0(%r9),%ymm0 # reload key
- #vpbroadcastd 4(%r9),%ymm1
- vpbroadcastd 8(%r9),%ymm2
- vpbroadcastd 12(%r9),%ymm3
- vpaddd .Leight(%rip),%ymm28,%ymm28 # next SIMD counters
- vmovdqa64 %ymm20,%ymm4
- vmovdqa64 %ymm21,%ymm5
- vmovdqa64 %ymm22,%ymm6
- vmovdqa64 %ymm23,%ymm7
- vmovdqa64 %ymm24,%ymm8
- vmovdqa64 %ymm25,%ymm9
- vmovdqa64 %ymm26,%ymm10
- vmovdqa64 %ymm27,%ymm11
- vmovdqa64 %ymm28,%ymm12
- vmovdqa64 %ymm29,%ymm13
- vmovdqa64 %ymm30,%ymm14
- vmovdqa64 %ymm31,%ymm15
-
- vmovdqa64 %ymm0,%ymm16
- vmovdqa64 %ymm1,%ymm17
- vmovdqa64 %ymm2,%ymm18
- vmovdqa64 %ymm3,%ymm19
-
- mov $10,%eax
- jmp .Loop8xvl
-
-.align 32
-.Loop8xvl:
- vpaddd %ymm4,%ymm0,%ymm0
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm7,%ymm3,%ymm3
- vpxor %ymm0,%ymm12,%ymm12
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm3,%ymm15,%ymm15
- vprold $16,%ymm12,%ymm12
- vprold $16,%ymm13,%ymm13
- vprold $16,%ymm14,%ymm14
- vprold $16,%ymm15,%ymm15
- vpaddd %ymm12,%ymm8,%ymm8
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm15,%ymm11,%ymm11
- vpxor %ymm8,%ymm4,%ymm4
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm11,%ymm7,%ymm7
- vprold $12,%ymm4,%ymm4
- vprold $12,%ymm5,%ymm5
- vprold $12,%ymm6,%ymm6
- vprold $12,%ymm7,%ymm7
- vpaddd %ymm4,%ymm0,%ymm0
- vpaddd %ymm5,%ymm1,%ymm1
- vpaddd %ymm6,%ymm2,%ymm2
- vpaddd %ymm7,%ymm3,%ymm3
- vpxor %ymm0,%ymm12,%ymm12
- vpxor %ymm1,%ymm13,%ymm13
- vpxor %ymm2,%ymm14,%ymm14
- vpxor %ymm3,%ymm15,%ymm15
- vprold $8,%ymm12,%ymm12
- vprold $8,%ymm13,%ymm13
- vprold $8,%ymm14,%ymm14
- vprold $8,%ymm15,%ymm15
- vpaddd %ymm12,%ymm8,%ymm8
- vpaddd %ymm13,%ymm9,%ymm9
- vpaddd %ymm14,%ymm10,%ymm10
- vpaddd %ymm15,%ymm11,%ymm11
- vpxor %ymm8,%ymm4,%ymm4
- vpxor %ymm9,%ymm5,%ymm5
- vpxor %ymm10,%ymm6,%ymm6
- vpxor %ymm11,%ymm7,%ymm7
- vprold $7,%ymm4,%ymm4
- vprold $7,%ymm5,%ymm5
- vprold $7,%ymm6,%ymm6
- vprold $7,%ymm7,%ymm7
- vpaddd %ymm5,%ymm0,%ymm0
- vpaddd %ymm6,%ymm1,%ymm1
- vpaddd %ymm7,%ymm2,%ymm2
- vpaddd %ymm4,%ymm3,%ymm3
- vpxor %ymm0,%ymm15,%ymm15
- vpxor %ymm1,%ymm12,%ymm12
- vpxor %ymm2,%ymm13,%ymm13
- vpxor %ymm3,%ymm14,%ymm14
- vprold $16,%ymm15,%ymm15
- vprold $16,%ymm12,%ymm12
- vprold $16,%ymm13,%ymm13
- vprold $16,%ymm14,%ymm14
- vpaddd %ymm15,%ymm10,%ymm10
- vpaddd %ymm12,%ymm11,%ymm11
- vpaddd %ymm13,%ymm8,%ymm8
- vpaddd %ymm14,%ymm9,%ymm9
- vpxor %ymm10,%ymm5,%ymm5
- vpxor %ymm11,%ymm6,%ymm6
- vpxor %ymm8,%ymm7,%ymm7
- vpxor %ymm9,%ymm4,%ymm4
- vprold $12,%ymm5,%ymm5
- vprold $12,%ymm6,%ymm6
- vprold $12,%ymm7,%ymm7
- vprold $12,%ymm4,%ymm4
- vpaddd %ymm5,%ymm0,%ymm0
- vpaddd %ymm6,%ymm1,%ymm1
- vpaddd %ymm7,%ymm2,%ymm2
- vpaddd %ymm4,%ymm3,%ymm3
- vpxor %ymm0,%ymm15,%ymm15
- vpxor %ymm1,%ymm12,%ymm12
- vpxor %ymm2,%ymm13,%ymm13
- vpxor %ymm3,%ymm14,%ymm14
- vprold $8,%ymm15,%ymm15
- vprold $8,%ymm12,%ymm12
- vprold $8,%ymm13,%ymm13
- vprold $8,%ymm14,%ymm14
- vpaddd %ymm15,%ymm10,%ymm10
- vpaddd %ymm12,%ymm11,%ymm11
- vpaddd %ymm13,%ymm8,%ymm8
- vpaddd %ymm14,%ymm9,%ymm9
- vpxor %ymm10,%ymm5,%ymm5
- vpxor %ymm11,%ymm6,%ymm6
- vpxor %ymm8,%ymm7,%ymm7
- vpxor %ymm9,%ymm4,%ymm4
- vprold $7,%ymm5,%ymm5
- vprold $7,%ymm6,%ymm6
- vprold $7,%ymm7,%ymm7
- vprold $7,%ymm4,%ymm4
- dec %eax
- jnz .Loop8xvl
-
- vpaddd %ymm16,%ymm0,%ymm0 # accumulate key
- vpaddd %ymm17,%ymm1,%ymm1
- vpaddd %ymm18,%ymm2,%ymm2
- vpaddd %ymm19,%ymm3,%ymm3
-
- vpunpckldq %ymm1,%ymm0,%ymm18 # "de-interlace" data
- vpunpckldq %ymm3,%ymm2,%ymm19
- vpunpckhdq %ymm1,%ymm0,%ymm0
- vpunpckhdq %ymm3,%ymm2,%ymm2
- vpunpcklqdq %ymm19,%ymm18,%ymm1 # "a0"
- vpunpckhqdq %ymm19,%ymm18,%ymm18 # "a1"
- vpunpcklqdq %ymm2,%ymm0,%ymm3 # "a2"
- vpunpckhqdq %ymm2,%ymm0,%ymm0 # "a3"
- vpaddd %ymm20,%ymm4,%ymm4
- vpaddd %ymm21,%ymm5,%ymm5
- vpaddd %ymm22,%ymm6,%ymm6
- vpaddd %ymm23,%ymm7,%ymm7
-
- vpunpckldq %ymm5,%ymm4,%ymm2
- vpunpckldq %ymm7,%ymm6,%ymm19
- vpunpckhdq %ymm5,%ymm4,%ymm4
- vpunpckhdq %ymm7,%ymm6,%ymm6
- vpunpcklqdq %ymm19,%ymm2,%ymm5 # "b0"
- vpunpckhqdq %ymm19,%ymm2,%ymm2 # "b1"
- vpunpcklqdq %ymm6,%ymm4,%ymm7 # "b2"
- vpunpckhqdq %ymm6,%ymm4,%ymm4 # "b3"
- vshufi32x4 $0,%ymm5,%ymm1,%ymm19 # "de-interlace" further
- vshufi32x4 $3,%ymm5,%ymm1,%ymm5
- vshufi32x4 $0,%ymm2,%ymm18,%ymm1
- vshufi32x4 $3,%ymm2,%ymm18,%ymm2
- vshufi32x4 $0,%ymm7,%ymm3,%ymm18
- vshufi32x4 $3,%ymm7,%ymm3,%ymm7
- vshufi32x4 $0,%ymm4,%ymm0,%ymm3
- vshufi32x4 $3,%ymm4,%ymm0,%ymm4
- vpaddd %ymm24,%ymm8,%ymm8
- vpaddd %ymm25,%ymm9,%ymm9
- vpaddd %ymm26,%ymm10,%ymm10
- vpaddd %ymm27,%ymm11,%ymm11
-
- vpunpckldq %ymm9,%ymm8,%ymm6
- vpunpckldq %ymm11,%ymm10,%ymm0
- vpunpckhdq %ymm9,%ymm8,%ymm8
- vpunpckhdq %ymm11,%ymm10,%ymm10
- vpunpcklqdq %ymm0,%ymm6,%ymm9 # "c0"
- vpunpckhqdq %ymm0,%ymm6,%ymm6 # "c1"
- vpunpcklqdq %ymm10,%ymm8,%ymm11 # "c2"
- vpunpckhqdq %ymm10,%ymm8,%ymm8 # "c3"
- vpaddd %ymm28,%ymm12,%ymm12
- vpaddd %ymm29,%ymm13,%ymm13
- vpaddd %ymm30,%ymm14,%ymm14
- vpaddd %ymm31,%ymm15,%ymm15
-
- vpunpckldq %ymm13,%ymm12,%ymm10
- vpunpckldq %ymm15,%ymm14,%ymm0
- vpunpckhdq %ymm13,%ymm12,%ymm12
- vpunpckhdq %ymm15,%ymm14,%ymm14
- vpunpcklqdq %ymm0,%ymm10,%ymm13 # "d0"
- vpunpckhqdq %ymm0,%ymm10,%ymm10 # "d1"
- vpunpcklqdq %ymm14,%ymm12,%ymm15 # "d2"
- vpunpckhqdq %ymm14,%ymm12,%ymm12 # "d3"
- vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 # "de-interlace" further
- vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
- vperm2i128 $0x20,%ymm10,%ymm6,%ymm9
- vperm2i128 $0x31,%ymm10,%ymm6,%ymm10
- vperm2i128 $0x20,%ymm15,%ymm11,%ymm6
- vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
- vperm2i128 $0x20,%ymm12,%ymm8,%ymm11
- vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
- cmp $64*8,%rdx
- jb .Ltail8xvl
-
- mov $0x80,%eax # size optimization
- vpxord 0x00(%rsi),%ymm19,%ymm19 # xor with input
- vpxor 0x20(%rsi),%ymm0,%ymm0
- vpxor 0x40(%rsi),%ymm5,%ymm5
- vpxor 0x60(%rsi),%ymm13,%ymm13
- lea (%rsi,%rax),%rsi # size optimization
- vmovdqu32 %ymm19,0x00(%rdi)
- vmovdqu %ymm0,0x20(%rdi)
- vmovdqu %ymm5,0x40(%rdi)
- vmovdqu %ymm13,0x60(%rdi)
- lea (%rdi,%rax),%rdi # size optimization
-
- vpxor 0x00(%rsi),%ymm1,%ymm1
- vpxor 0x20(%rsi),%ymm9,%ymm9
- vpxor 0x40(%rsi),%ymm2,%ymm2
- vpxor 0x60(%rsi),%ymm10,%ymm10
- lea (%rsi,%rax),%rsi # size optimization
- vmovdqu %ymm1,0x00(%rdi)
- vmovdqu %ymm9,0x20(%rdi)
- vmovdqu %ymm2,0x40(%rdi)
- vmovdqu %ymm10,0x60(%rdi)
- lea (%rdi,%rax),%rdi # size optimization
-
- vpxord 0x00(%rsi),%ymm18,%ymm18
- vpxor 0x20(%rsi),%ymm6,%ymm6
- vpxor 0x40(%rsi),%ymm7,%ymm7
- vpxor 0x60(%rsi),%ymm15,%ymm15
- lea (%rsi,%rax),%rsi # size optimization
- vmovdqu32 %ymm18,0x00(%rdi)
- vmovdqu %ymm6,0x20(%rdi)
- vmovdqu %ymm7,0x40(%rdi)
- vmovdqu %ymm15,0x60(%rdi)
- lea (%rdi,%rax),%rdi # size optimization
-
- vpxor 0x00(%rsi),%ymm3,%ymm3
- vpxor 0x20(%rsi),%ymm11,%ymm11
- vpxor 0x40(%rsi),%ymm4,%ymm4
- vpxor 0x60(%rsi),%ymm12,%ymm12
- lea (%rsi,%rax),%rsi # size optimization
- vmovdqu %ymm3,0x00(%rdi)
- vmovdqu %ymm11,0x20(%rdi)
- vmovdqu %ymm4,0x40(%rdi)
- vmovdqu %ymm12,0x60(%rdi)
- lea (%rdi,%rax),%rdi # size optimization
-
- vpbroadcastd 0(%r9),%ymm0 # reload key
- vpbroadcastd 4(%r9),%ymm1
-
- sub $64*8,%rdx
- jnz .Loop_outer8xvl
-
- jmp .Ldone8xvl
-
-.align 32
-.Ltail8xvl:
- vmovdqa64 %ymm19,%ymm8 # size optimization
- xor %r9,%r9
- sub %rsi,%rdi
- cmp $64*1,%rdx
- jb .Less_than_64_8xvl
- vpxor 0x00(%rsi),%ymm8,%ymm8 # xor with input
- vpxor 0x20(%rsi),%ymm0,%ymm0
- vmovdqu %ymm8,0x00(%rdi,%rsi)
- vmovdqu %ymm0,0x20(%rdi,%rsi)
- je .Ldone8xvl
- vmovdqa %ymm5,%ymm8
- vmovdqa %ymm13,%ymm0
- lea 64(%rsi),%rsi
-
- cmp $64*2,%rdx
- jb .Less_than_64_8xvl
- vpxor 0x00(%rsi),%ymm5,%ymm5
- vpxor 0x20(%rsi),%ymm13,%ymm13
- vmovdqu %ymm5,0x00(%rdi,%rsi)
- vmovdqu %ymm13,0x20(%rdi,%rsi)
- je .Ldone8xvl
- vmovdqa %ymm1,%ymm8
- vmovdqa %ymm9,%ymm0
- lea 64(%rsi),%rsi
-
- cmp $64*3,%rdx
- jb .Less_than_64_8xvl
- vpxor 0x00(%rsi),%ymm1,%ymm1
- vpxor 0x20(%rsi),%ymm9,%ymm9
- vmovdqu %ymm1,0x00(%rdi,%rsi)
- vmovdqu %ymm9,0x20(%rdi,%rsi)
- je .Ldone8xvl
- vmovdqa %ymm2,%ymm8
- vmovdqa %ymm10,%ymm0
- lea 64(%rsi),%rsi
-
- cmp $64*4,%rdx
- jb .Less_than_64_8xvl
- vpxor 0x00(%rsi),%ymm2,%ymm2
- vpxor 0x20(%rsi),%ymm10,%ymm10
- vmovdqu %ymm2,0x00(%rdi,%rsi)
- vmovdqu %ymm10,0x20(%rdi,%rsi)
- je .Ldone8xvl
- vmovdqa32 %ymm18,%ymm8
- vmovdqa %ymm6,%ymm0
- lea 64(%rsi),%rsi
-
- cmp $64*5,%rdx
- jb .Less_than_64_8xvl
- vpxord 0x00(%rsi),%ymm18,%ymm18
- vpxor 0x20(%rsi),%ymm6,%ymm6
- vmovdqu32 %ymm18,0x00(%rdi,%rsi)
- vmovdqu %ymm6,0x20(%rdi,%rsi)
- je .Ldone8xvl
- vmovdqa %ymm7,%ymm8
- vmovdqa %ymm15,%ymm0
- lea 64(%rsi),%rsi
-
- cmp $64*6,%rdx
- jb .Less_than_64_8xvl
- vpxor 0x00(%rsi),%ymm7,%ymm7
- vpxor 0x20(%rsi),%ymm15,%ymm15
- vmovdqu %ymm7,0x00(%rdi,%rsi)
- vmovdqu %ymm15,0x20(%rdi,%rsi)
- je .Ldone8xvl
- vmovdqa %ymm3,%ymm8
- vmovdqa %ymm11,%ymm0
- lea 64(%rsi),%rsi
-
- cmp $64*7,%rdx
- jb .Less_than_64_8xvl
- vpxor 0x00(%rsi),%ymm3,%ymm3
- vpxor 0x20(%rsi),%ymm11,%ymm11
- vmovdqu %ymm3,0x00(%rdi,%rsi)
- vmovdqu %ymm11,0x20(%rdi,%rsi)
- je .Ldone8xvl
- vmovdqa %ymm4,%ymm8
- vmovdqa %ymm12,%ymm0
- lea 64(%rsi),%rsi
-
-.Less_than_64_8xvl:
- vmovdqa %ymm8,0x00(%rsp)
- vmovdqa %ymm0,0x20(%rsp)
- lea (%rdi,%rsi),%rdi
- and $63,%rdx
-
-.Loop_tail8xvl:
- movzb (%rsi,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1(%rdi,%r9)
- dec %rdx
- jnz .Loop_tail8xvl
-
- vpxor %ymm8,%ymm8,%ymm8
- vmovdqa %ymm8,0x00(%rsp)
- vmovdqa %ymm8,0x20(%rsp)
-
-.Ldone8xvl:
- vzeroall
- lea -8(%r10),%rsp
-.L8xvl_epilogue:
- ret
-.size chacha20_8xvl,.-chacha20_8xvl
-#endif
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm-glue.c b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm-glue.c
deleted file mode 100644
index 41e2e79abb2b..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm-glue.c
+++ /dev/null
@@ -1,98 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#if defined(CONFIG_ZINC_ARCH_ARM)
-#include <asm/system_info.h>
-#include <asm/cputype.h>
-#endif
-
-asmlinkage void chacha20_arm(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-asmlinkage void hchacha20_arm(const u32 state[16], u32 out[8]);
-asmlinkage void chacha20_neon(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-
-static bool chacha20_use_neon __ro_after_init; /* written by chacha20_fpu_init(), read by chacha20_arch() to gate the NEON path */
-static bool *const chacha20_nobs[] __initconst = { &chacha20_use_neon }; /* list of pointers to the flag above; not referenced elsewhere in this file -- NOTE(review): presumably lets shared code switch the flag off, confirm against the includer */
-static void __init chacha20_fpu_init(void) /* Init-time backend selection: decides
- * whether chacha20_arch() may use the NEON routine by setting
- * chacha20_use_neon.  When neither ZINC arch option below is defined the
- * body is empty and the flag keeps its static zero (false) value.
- */
-{
-#if defined(CONFIG_ZINC_ARCH_ARM64)
- chacha20_use_neon = cpu_have_named_feature(ASIMD); /* arm64: follow the ASIMD CPU feature */
-#elif defined(CONFIG_ZINC_ARCH_ARM)
- switch (read_cpuid_part()) {
- case ARM_CPU_PART_CORTEX_A7:
- case ARM_CPU_PART_CORTEX_A5:
- /* The Cortex-A7 and Cortex-A5 do not perform well with the NEON
- * implementation but do incredibly with the scalar one and use
- * less power.  The flag is therefore left untouched (false) on
- * these parts, without consulting the NEON hwcap bit.
- */
- break;
- default:
- chacha20_use_neon = elf_hwcap & HWCAP_NEON; /* every other part: NEON iff the hwcap bit is set */
- }
-#endif
-}
-
-static inline bool chacha20_arch(struct chacha20_ctx *ctx, u8 *dst,
- const u8 *src, size_t len,
- simd_context_t *simd_context) /* Runs ChaCha20 over len bytes from src
- * into dst using ctx->key and ctx->counter, choosing between the NEON and
- * the scalar assembly routine, and advances ctx->counter[0] by the number
- * of 64-byte blocks consumed.  Always returns true.
- */
-{
- /* SIMD disables preemption, so relax after processing each page.
- * The build-time check requires PAGE_SIZE to be a non-zero multiple
- * of CHACHA20_BLOCK_SIZE, so a full-page chunk ends on a block boundary.
- */
- BUILD_BUG_ON(PAGE_SIZE < CHACHA20_BLOCK_SIZE ||
- PAGE_SIZE % CHACHA20_BLOCK_SIZE);
-
- for (;;) {
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && chacha20_use_neon &&
- len >= CHACHA20_BLOCK_SIZE * 3 && simd_use(simd_context)) { /* NEON only when built in, selected at init, at least 3 blocks remain and simd_use() agrees */
- const size_t bytes = min_t(size_t, len, PAGE_SIZE); /* at most one page per NEON call */
-
- chacha20_neon(dst, src, bytes, ctx->key, ctx->counter);
- ctx->counter[0] += (bytes + 63) / 64; /* counter is passed as const, so advance it here by ceil(bytes / 64); 64 is presumably CHACHA20_BLOCK_SIZE -- confirm */
- len -= bytes;
- if (!len)
- break;
- dst += bytes;
- src += bytes;
- simd_relax(simd_context); /* between pages, see the comment at the top of the function */
- } else {
- chacha20_arm(dst, src, len, ctx->key, ctx->counter); /* scalar routine takes everything that is left in one call */
- ctx->counter[0] += (len + 63) / 64;
- break;
- }
- }
-
- return true;
-}
-
-static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS],
- const u8 nonce[HCHACHA20_NONCE_SIZE],
- const u8 key[HCHACHA20_KEY_SIZE],
- simd_context_t *simd_context) /* HChaCha20 via the ARM assembly routine.
- * On CONFIG_ZINC_ARCH_ARM builds it fills derived_key and returns true;
- * on every other build it does nothing and returns false
- * (NOTE(review): presumably the caller then uses a generic
- * implementation -- confirm).  simd_context is not used here.
- */
-{
- if (IS_ENABLED(CONFIG_ZINC_ARCH_ARM)) {
- u32 x[] = { CHACHA20_CONSTANT_EXPA, /* 16-word input state: 4 constants, ... */
- CHACHA20_CONSTANT_ND_3,
- CHACHA20_CONSTANT_2_BY,
- CHACHA20_CONSTANT_TE_K,
- get_unaligned_le32(key + 0), /* ... 8 little-endian words read from key (offsets 0..28), ... */
- get_unaligned_le32(key + 4),
- get_unaligned_le32(key + 8),
- get_unaligned_le32(key + 12),
- get_unaligned_le32(key + 16),
- get_unaligned_le32(key + 20),
- get_unaligned_le32(key + 24),
- get_unaligned_le32(key + 28),
- get_unaligned_le32(nonce + 0), /* ... 4 little-endian words read from nonce (offsets 0..12) */
- get_unaligned_le32(nonce + 4),
- get_unaligned_le32(nonce + 8),
- get_unaligned_le32(nonce + 12)
- };
- hchacha20_arm(x, derived_key); /* assembly routine writes the 8 output words */
- return true;
- }
- return false;
-}
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm.pl b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm.pl
deleted file mode 100755
index 6785383ab7bb..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm.pl
+++ /dev/null
@@ -1,1227 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-#
-# This code is taken from the OpenSSL project but the author, Andy Polyakov,
-# has relicensed it under the licenses specified in the SPDX header above.
-# The original headers, including the original license headers, are
-# included below for completeness.
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# December 2014
-#
-# ChaCha20 for ARMv4.
-#
-# September 2018
-#
-# Improve scalar performance per Eric Biggers' suggestion to eliminate
-# separate rotates. This requires b[0..3] and d[0..3] to be maintained
-# pre-rotated, hence odd twists prior inner loop and when accumulating
-# key material. Since amount of instructions is reduced as result, even
-# NEON performance is improved somewhat, most notably by ~9% on low-end
-# Cortex-A5/A7. Full unroll was shown to provide even better scalar
-# performance on Cortex-A5/A7, naturally at the cost of manyfold size
-# increase. We let it be. Oversized code works in benchmarks, but is not
-# necessarily optimal in real life, when it's likely to be out-of-cache
-# upon entry and evict significant part of cache upon completion.
-#
-# Performance in cycles per byte out of large buffer.
-#
-# IALU/gcc-4.4 1xNEON 3xNEON+1xIALU
-#
-# Cortex-A5 14.2(*)/+160% 21.8 12.9(**)
-# Cortex-A8 10.2(*)/+190% 13.9 6.10
-# Cortex-A9 10.8(*)/+150% 14.3 6.50
-# Cortex-A15 11.0/+40% 16.0 4.90
-# Snapdragon S4 13.9(***)/+90% 13.6 4.90
-#
-# (*) most "favourable" result for aligned data on little-endian
-# processor, result for misaligned data is 10-15% lower;
-# (**) pure 4xNEON [with "vertical" layout] was shown to provide ~8%
-# better performance on Cortex-A5/A7, but not on others;
-# (***) it's 17% slower than original, trade-off is considered
-# acceptable, because of improvement on others, specifically
-# +36% on Cortex-A5/A7 and +20% on Cortex-A9;
-
-$flavour = shift; # first command-line argument: flavour, or directly the output file
-if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } # it looks like a file name (has an extension): treat it as the output, no flavour
-else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } # otherwise scan the remaining arguments for the first file-like name
-
-if ($flavour && $flavour ne "void") {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's own path
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open STDOUT,"| \"$^X\" $xlate $flavour $output"; # pipe everything printed through arm-xlate.pl, run by the current perl ($^X)
-} else {
- open STDOUT,">$output"; # no flavour (or "void"): write the generated text straight to $output
-}
-
-sub AUTOLOAD() # thunk [simplified] x86-style perlasm: a call to an undefined &mnemonic(...) lands here and is appended to $code as one assembly line
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; # drop the package prefix; the first "_" becomes "." (e.g. vadd_i32 -> vadd.i32)
- my $arg = pop; # last operand is handled separately so it can be turned into an immediate
- $arg = "#$arg" if ($arg*1 eq $arg); # a purely numeric last operand gets a leading '#'
- $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
-}
-
-my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x")); # state word index -> register name: words 0..7 in r0..r7, 12 in r12, 14 in r14; the "rx" entries are placeholders for words that get no fixed register here
-my @t=map("r$_",(8..11)); # r8..r11, used as temporaries by ROUND and by the assembly templates
-
-sub ROUND { # returns a list of perlasm statement strings, which the callers eval one by one, for one scalar round whose first quarter-round uses state words ($a0,$b0,$c0,$d0)
-my ($a0,$b0,$c0,$d0)=@_;
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); # next quarter-round: same group of four, low two bits advanced by one with wrap-around
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my $odd = $d0&1; # 0 for ROUND(0,4,8,12), 1 for ROUND(0,5,10,15)
-my ($xc,$xc_) = (@t[0..1]); # the current pair of 'c' words lives in two temporaries
-my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]); # of the two 'd' words, one is in its own register and the other in a temporary; which is which depends on $odd
-my @ret;
-
- # Consider order in which variables are addressed by their
- # index:
- #
- # a b c d
- #
- # 0 4 8 12 < even round
- # 1 5 9 13
- # 2 6 10 14
- # 3 7 11 15
- # 0 5 10 15 < odd round
- # 1 6 11 12
- # 2 7 8 13
- # 3 4 9 14
- #
- # 'a', 'b' are permanently allocated in registers, @x[0..7],
- # while 'c's and pair of 'd's are maintained in memory. If
- # you observe 'c' column, you'll notice that pair of 'c's is
- # invariant between rounds. This means that we have to reload
- # them once per round, in the middle. This is why you'll see
- # a bunch of 'c' stores and loads in the middle, but none in
- # the beginning or end. If you observe 'd' column, you'll
- # notice that 15 and 13 are reused in next pair of rounds.
- # This is why these two are chosen for offloading to memory,
- # to make loads count more.
- push @ret,(
- "&add (@x[$a0],@x[$a0],@x[$b0],'ror#13')",
- "&add (@x[$a1],@x[$a1],@x[$b1],'ror#13')",
- "&eor ($xd,@x[$a0],$xd,'ror#24')",
- "&eor ($xd_,@x[$a1],$xd_,'ror#24')",
-
- "&add ($xc,$xc,$xd,'ror#16')",
- "&add ($xc_,$xc_,$xd_,'ror#16')",
- "&eor (@x[$b0],$xc, @x[$b0],'ror#13')",
- "&eor (@x[$b1],$xc_,@x[$b1],'ror#13')",
-
- "&add (@x[$a0],@x[$a0],@x[$b0],'ror#20')",
- "&add (@x[$a1],@x[$a1],@x[$b1],'ror#20')",
- "&eor ($xd,@x[$a0],$xd,'ror#16')",
- "&eor ($xd_,@x[$a1],$xd_,'ror#16')" );
- push @ret,(
- "&str ($xd,'[sp,#4*(16+$d0)]')" ) if ($odd);
- push @ret,(
- "&add ($xc,$xc,$xd,'ror#24')" );
- push @ret,(
- "&ldr ($xd,'[sp,#4*(16+$d2)]')" ) if ($odd);
- push @ret,(
- "&str ($xd_,'[sp,#4*(16+$d1)]')" ) if (!$odd);
- push @ret,(
- "&add ($xc_,$xc_,$xd_,'ror#24')" );
- push @ret,(
- "&ldr ($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd);
- push @ret,(
- "&str ($xc,'[sp,#4*(16+$c0)]')",
- "&eor (@x[$b0],@x[$b0],$xc,'ror#12')",
- "&str ($xc_,'[sp,#4*(16+$c1)]')",
- "&eor (@x[$b1],@x[$b1],$xc_,'ror#12')" );
-
- $xd=@x[$d2] if (!$odd); # second pair of quarter-rounds: rebind the register-resident 'd' operand
- $xd_=@x[$d3] if ($odd);
- push @ret,(
- "&ldr ($xc,'[sp,#4*(16+$c2)]')",
- "&add (@x[$a2],@x[$a2],@x[$b2],'ror#13')",
- "&ldr ($xc_,'[sp,#4*(16+$c3)]')",
- "&add (@x[$a3],@x[$a3],@x[$b3],'ror#13')",
- "&eor ($xd,@x[$a2],$xd,'ror#24')",
- "&eor ($xd_,@x[$a3],$xd_,'ror#24')",
-
- "&add ($xc,$xc,$xd,'ror#16')",
- "&add ($xc_,$xc_,$xd_,'ror#16')",
- "&eor (@x[$b2],$xc, @x[$b2],'ror#13')",
- "&eor (@x[$b3],$xc_,@x[$b3],'ror#13')",
-
- "&add (@x[$a2],@x[$a2],@x[$b2],'ror#20')",
- "&add (@x[$a3],@x[$a3],@x[$b3],'ror#20')",
- "&eor ($xd,@x[$a2],$xd,'ror#16')",
- "&eor ($xd_,@x[$a3],$xd_,'ror#16')",
-
- "&add ($xc,$xc,$xd,'ror#24')",
- "&add ($xc_,$xc_,$xd_,'ror#24')",
- "&eor (@x[$b2],@x[$b2],$xc,'ror#12')",
- "&eor (@x[$b3],@x[$b3],$xc_,'ror#12')" );
-
- @ret;
-}
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
-# define ChaCha20_ctr32 chacha20_arm_cryptogams
-# define ChaCha20_neon chacha20_neon
-#endif
-
-.text
-#if defined(__thumb2__) || defined(__clang__)
-.syntax unified
-# define ldrhsb ldrbhs
-#endif
-#if defined(__thumb2__)
-.thumb
-#else
-.code 32
-#endif
-
-.align 5
-.Lsigma:
-.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
-.Lone:
-.long 1,0,0,0
-.Lrot8:
-.long 0x02010003,0x06050407
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-.LChaCha20_ctr32
-#else
-.word -1
-#endif
-
-.globl ChaCha20_ctr32
-.type ChaCha20_ctr32,%function
-.align 5
-ChaCha20_ctr32:
-.LChaCha20_ctr32:
- ldr r12,[sp,#0] @ pull pointer to counter and nonce
- stmdb sp!,{r0-r2,r4-r11,lr}
-#if __ARM_ARCH__<7 && !defined(__thumb2__)
- sub r14,pc,#16 @ ChaCha20_ctr32
-#else
- adr r14,.LChaCha20_ctr32
-#endif
- cmp r2,#0 @ len==0?
-#ifdef __thumb2__
- itt eq
-#endif
- addeq sp,sp,#4*3
- beq .Lno_data
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- cmp r2,#192 @ test len
- bls .Lshort
- ldr r4,[r14,#-24]
- ldr r4,[r14,r4]
-# ifdef __APPLE__
- ldr r4,[r4]
-# endif
- tst r4,#ARMV7_NEON
- bne .LChaCha20_neon
-.Lshort:
-#endif
- ldmia r12,{r4-r7} @ load counter and nonce
- sub sp,sp,#4*(16) @ off-load area
- sub r14,r14,#64 @ .Lsigma
- stmdb sp!,{r4-r7} @ copy counter and nonce
- ldmia r3,{r4-r11} @ load key
- ldmia r14,{r0-r3} @ load sigma
- stmdb sp!,{r4-r11} @ copy key
- stmdb sp!,{r0-r3} @ copy sigma
- str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
- str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
- b .Loop_outer_enter
-
-.align 4
-.Loop_outer:
- ldmia sp,{r0-r9} @ load key material
- str @t[3],[sp,#4*(32+2)] @ save len
- str r12, [sp,#4*(32+1)] @ save inp
- str r14, [sp,#4*(32+0)] @ save out
-.Loop_outer_enter:
- ldr @t[3], [sp,#4*(15)]
- mov @x[4],@x[4],ror#19 @ twist b[0..3]
- ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
- mov @x[5],@x[5],ror#19
- ldr @t[2], [sp,#4*(13)]
- mov @x[6],@x[6],ror#19
- ldr @x[14],[sp,#4*(14)]
- mov @x[7],@x[7],ror#19
- mov @t[3],@t[3],ror#8 @ twist d[0..3]
- mov @x[12],@x[12],ror#8
- mov @t[2],@t[2],ror#8
- mov @x[14],@x[14],ror#8
- str @t[3], [sp,#4*(16+15)]
- mov @t[3],#10
- b .Loop
-
-.align 4
-.Loop:
- subs @t[3],@t[3],#1
-___
- foreach (&ROUND(0, 4, 8,12)) { eval; }
- foreach (&ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- bne .Loop
-
- ldr @t[3],[sp,#4*(32+2)] @ load len
-
- str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
- str @t[1], [sp,#4*(16+9)]
- str @x[12],[sp,#4*(16+12)]
- str @t[2], [sp,#4*(16+13)]
- str @x[14],[sp,#4*(16+14)]
-
- @ at this point we have first half of 512-bit result in
- @ @x[0-7] and second half at sp+4*(16+8)
-
- cmp @t[3],#64 @ done yet?
-#ifdef __thumb2__
- itete lo
-#endif
- addlo r12,sp,#4*(0) @ shortcut or ...
- ldrhs r12,[sp,#4*(32+1)] @ ... load inp
- addlo r14,sp,#4*(0) @ shortcut or ...
- ldrhs r14,[sp,#4*(32+0)] @ ... load out
-
- ldr @t[0],[sp,#4*(0)] @ load key material
- ldr @t[1],[sp,#4*(1)]
-
-#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
-# if __ARM_ARCH__<7
- orr @t[2],r12,r14
- tst @t[2],#3 @ are input and output aligned?
- ldr @t[2],[sp,#4*(2)]
- bne .Lunaligned
- cmp @t[3],#64 @ restore flags
-# else
- ldr @t[2],[sp,#4*(2)]
-# endif
- ldr @t[3],[sp,#4*(3)]
-
- add @x[0],@x[0],@t[0] @ accumulate key material
- add @x[1],@x[1],@t[1]
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[0],[r12],#16 @ load input
- ldrhs @t[1],[r12,#-12]
-
- add @x[2],@x[2],@t[2]
- add @x[3],@x[3],@t[3]
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[2],[r12,#-8]
- ldrhs @t[3],[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
- rev @x[0],@x[0]
- rev @x[1],@x[1]
- rev @x[2],@x[2]
- rev @x[3],@x[3]
-# endif
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[0],@x[0],@t[0] @ xor with input
- eorhs @x[1],@x[1],@t[1]
- add @t[0],sp,#4*(4)
- str @x[0],[r14],#16 @ store output
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[2],@x[2],@t[2]
- eorhs @x[3],@x[3],@t[3]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
- str @x[1],[r14,#-12]
- str @x[2],[r14,#-8]
- str @x[3],[r14,#-4]
-
- add @x[4],@t[0],@x[4],ror#13 @ accumulate key material
- add @x[5],@t[1],@x[5],ror#13
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[0],[r12],#16 @ load input
- ldrhs @t[1],[r12,#-12]
- add @x[6],@t[2],@x[6],ror#13
- add @x[7],@t[3],@x[7],ror#13
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[2],[r12,#-8]
- ldrhs @t[3],[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
- rev @x[4],@x[4]
- rev @x[5],@x[5]
- rev @x[6],@x[6]
- rev @x[7],@x[7]
-# endif
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[4],@x[4],@t[0]
- eorhs @x[5],@x[5],@t[1]
- add @t[0],sp,#4*(8)
- str @x[4],[r14],#16 @ store output
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[6],@x[6],@t[2]
- eorhs @x[7],@x[7],@t[3]
- str @x[5],[r14,#-12]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
- str @x[6],[r14,#-8]
- add @x[0],sp,#4*(16+8)
- str @x[7],[r14,#-4]
-
- ldmia @x[0],{@x[0]-@x[7]} @ load second half
-
- add @x[0],@x[0],@t[0] @ accumulate key material
- add @x[1],@x[1],@t[1]
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[0],[r12],#16 @ load input
- ldrhs @t[1],[r12,#-12]
-# ifdef __thumb2__
- itt hi
-# endif
- strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
- strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
- add @x[2],@x[2],@t[2]
- add @x[3],@x[3],@t[3]
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[2],[r12,#-8]
- ldrhs @t[3],[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
- rev @x[0],@x[0]
- rev @x[1],@x[1]
- rev @x[2],@x[2]
- rev @x[3],@x[3]
-# endif
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[0],@x[0],@t[0]
- eorhs @x[1],@x[1],@t[1]
- add @t[0],sp,#4*(12)
- str @x[0],[r14],#16 @ store output
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[2],@x[2],@t[2]
- eorhs @x[3],@x[3],@t[3]
- str @x[1],[r14,#-12]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
- str @x[2],[r14,#-8]
- str @x[3],[r14,#-4]
-
- add @x[4],@t[0],@x[4],ror#24 @ accumulate key material
- add @x[5],@t[1],@x[5],ror#24
-# ifdef __thumb2__
- itt hi
-# endif
- addhi @t[0],@t[0],#1 @ next counter value
- strhi @t[0],[sp,#4*(12)] @ save next counter value
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[0],[r12],#16 @ load input
- ldrhs @t[1],[r12,#-12]
- add @x[6],@t[2],@x[6],ror#24
- add @x[7],@t[3],@x[7],ror#24
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhs @t[2],[r12,#-8]
- ldrhs @t[3],[r12,#-4]
-# if __ARM_ARCH__>=6 && defined(__ARMEB__)
- rev @x[4],@x[4]
- rev @x[5],@x[5]
- rev @x[6],@x[6]
- rev @x[7],@x[7]
-# endif
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[4],@x[4],@t[0]
- eorhs @x[5],@x[5],@t[1]
-# ifdef __thumb2__
- it ne
-# endif
- ldrne @t[0],[sp,#4*(32+2)] @ re-load len
-# ifdef __thumb2__
- itt hs
-# endif
- eorhs @x[6],@x[6],@t[2]
- eorhs @x[7],@x[7],@t[3]
- str @x[4],[r14],#16 @ store output
- str @x[5],[r14,#-12]
-# ifdef __thumb2__
- it hs
-# endif
- subhs @t[3],@t[0],#64 @ len-=64
- str @x[6],[r14,#-8]
- str @x[7],[r14,#-4]
- bhi .Loop_outer
-
- beq .Ldone
-# if __ARM_ARCH__<7
- b .Ltail
-
-.align 4
-.Lunaligned: @ unaligned endian-neutral path
- cmp @t[3],#64 @ restore flags
-# endif
-#endif
-#if __ARM_ARCH__<7
- ldr @t[3],[sp,#4*(3)]
-___
-for ($i=0;$i<16;$i+=4) {
-my $j=$i&0x7;
-my $twist="";
-if ($i==4) { $twist = ",ror#13"; }
-elsif ($i==12) { $twist = ",ror#24"; }
-
-$code.=<<___ if ($i==4);
- add @x[0],sp,#4*(16+8)
-___
-$code.=<<___ if ($i==8);
- ldmia @x[0],{@x[0]-@x[7]} @ load second half
-# ifdef __thumb2__
- itt hi
-# endif
- strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]"
- strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]"
-___
-$code.=<<___;
- add @x[$j+0],@t[0],@x[$j+0]$twist @ accumulate key material
-___
-$code.=<<___ if ($i==12);
-# ifdef __thumb2__
- itt hi
-# endif
- addhi @t[0],@t[0],#1 @ next counter value
- strhi @t[0],[sp,#4*(12)] @ save next counter value
-___
-$code.=<<___;
- add @x[$j+1],@t[1],@x[$j+1]$twist
- add @x[$j+2],@t[2],@x[$j+2]$twist
-# ifdef __thumb2__
- itete lo
-# endif
- eorlo @t[0],@t[0],@t[0] @ zero or ...
- ldrhsb @t[0],[r12],#16 @ ... load input
- eorlo @t[1],@t[1],@t[1]
- ldrhsb @t[1],[r12,#-12]
-
- add @x[$j+3],@t[3],@x[$j+3]$twist
-# ifdef __thumb2__
- itete lo
-# endif
- eorlo @t[2],@t[2],@t[2]
- ldrhsb @t[2],[r12,#-8]
- eorlo @t[3],@t[3],@t[3]
- ldrhsb @t[3],[r12,#-4]
-
- eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero)
- eor @x[$j+1],@t[1],@x[$j+1]
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb @t[0],[r12,#-15] @ load more input
- ldrhsb @t[1],[r12,#-11]
- eor @x[$j+2],@t[2],@x[$j+2]
- strb @x[$j+0],[r14],#16 @ store output
- eor @x[$j+3],@t[3],@x[$j+3]
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb @t[2],[r12,#-7]
- ldrhsb @t[3],[r12,#-3]
- strb @x[$j+1],[r14,#-12]
- eor @x[$j+0],@t[0],@x[$j+0],lsr#8
- strb @x[$j+2],[r14,#-8]
- eor @x[$j+1],@t[1],@x[$j+1],lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb @t[0],[r12,#-14] @ load more input
- ldrhsb @t[1],[r12,#-10]
- strb @x[$j+3],[r14,#-4]
- eor @x[$j+2],@t[2],@x[$j+2],lsr#8
- strb @x[$j+0],[r14,#-15]
- eor @x[$j+3],@t[3],@x[$j+3],lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb @t[2],[r12,#-6]
- ldrhsb @t[3],[r12,#-2]
- strb @x[$j+1],[r14,#-11]
- eor @x[$j+0],@t[0],@x[$j+0],lsr#8
- strb @x[$j+2],[r14,#-7]
- eor @x[$j+1],@t[1],@x[$j+1],lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb @t[0],[r12,#-13] @ load more input
- ldrhsb @t[1],[r12,#-9]
- strb @x[$j+3],[r14,#-3]
- eor @x[$j+2],@t[2],@x[$j+2],lsr#8
- strb @x[$j+0],[r14,#-14]
- eor @x[$j+3],@t[3],@x[$j+3],lsr#8
-# ifdef __thumb2__
- itt hs
-# endif
- ldrhsb @t[2],[r12,#-5]
- ldrhsb @t[3],[r12,#-1]
- strb @x[$j+1],[r14,#-10]
- strb @x[$j+2],[r14,#-6]
- eor @x[$j+0],@t[0],@x[$j+0],lsr#8
- strb @x[$j+3],[r14,#-2]
- eor @x[$j+1],@t[1],@x[$j+1],lsr#8
- strb @x[$j+0],[r14,#-13]
- eor @x[$j+2],@t[2],@x[$j+2],lsr#8
- strb @x[$j+1],[r14,#-9]
- eor @x[$j+3],@t[3],@x[$j+3],lsr#8
- strb @x[$j+2],[r14,#-5]
- strb @x[$j+3],[r14,#-1]
-___
-$code.=<<___ if ($i<12);
- add @t[0],sp,#4*(4+$i)
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
-___
-}
-$code.=<<___;
-# ifdef __thumb2__
- it ne
-# endif
- ldrne @t[0],[sp,#4*(32+2)] @ re-load len
-# ifdef __thumb2__
- it hs
-# endif
- subhs @t[3],@t[0],#64 @ len-=64
- bhi .Loop_outer
-
- beq .Ldone
-#endif
-
-.Ltail:
- ldr r12,[sp,#4*(32+1)] @ load inp
- add @t[1],sp,#4*(0)
- ldr r14,[sp,#4*(32+0)] @ load out
-
-.Loop_tail:
- ldrb @t[2],[@t[1]],#1 @ read buffer on stack
- ldrb @t[3],[r12],#1 @ read input
- subs @t[0],@t[0],#1
- eor @t[3],@t[3],@t[2]
- strb @t[3],[r14],#1 @ store output
- bne .Loop_tail
-
-.Ldone:
- add sp,sp,#4*(32+3)
-.Lno_data:
-#if __ARM_ARCH__>=5
- ldmia sp!,{r4-r11,pc}
-#else
- ldmia sp!,{r4-r12,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- .long 0xe12fff1e @ interoperable with Thumb ISA:-)
-#endif
-.size ChaCha20_ctr32,.-ChaCha20_ctr32
-___
-
-{{{
-my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) = # three sets of a/b/c/d names plus four temporaries ...
- map("q$_",(0..15)); # ... bound in order to the register names q0..q15
-
-# This can replace vshr-by-24+vsli-by-8. It gives ~3% improvement on
-# Cortex-A5/A7, but hurts Cortex-A9 by 5% and Snapdragon S4 by 14%!
-sub vperm() # appends two vtbl.8 instructions to $code, one for each half of $dst/$src, both indexed by the low half of $tbl
-{ my ($dst,$src,$tbl) = @_; # register names; NOTE(review): the #lo/#hi suffixes are presumably resolved to D registers by a later pass not visible here -- confirm
- $code .= " vtbl.8 $dst#lo,{$src#lo},$tbl#lo\n";
- $code .= " vtbl.8 $dst#hi,{$src#hi},$tbl#lo\n";
-}
-
-sub NEONROUND { # returns a list of perlasm statement strings for one NEON round on registers ($a,$b,$c,$d) with scratch $t; callers interleave them with the scalar ROUND output
-my $odd = pop; # last argument: 0 or 1, only affects the vext offsets at the end
-my ($a,$b,$c,$d,$t)=@_;
-
- (
- "&vadd_i32 ($a,$a,$b)",
- "&veor ($d,$d,$a)",
- "&vrev32_16 ($d,$d)", # vrot ($d,16)
-
- "&vadd_i32 ($c,$c,$d)",
- "&veor ($t,$b,$c)",
- "&vshr_u32 ($b,$t,20)",
- "&vsli_32 ($b,$t,12)", # shift-right 20 plus shift-left-insert 12: b = t rotated left by 12
-
- "&vadd_i32 ($a,$a,$b)",
- "&veor ($t,$d,$a)",
- "&vshr_u32 ($d,$t,24)",
- "&vsli_32 ($d,$t,8)", # d = t rotated left by 8
- #"&vperm ($d,$t,$t3)",
-
- "&vadd_i32 ($c,$c,$d)",
- "&veor ($t,$b,$c)",
- "&vshr_u32 ($b,$t,25)",
- "&vsli_32 ($b,$t,7)", # b = t rotated left by 7
-
- "&vext_8 ($a,$a,$a,$odd?4:12)", # rotate the a, d and c vectors by whole 32-bit words for the following round; the a and c byte offsets swap with $odd
- "&vext_8 ($d,$d,$d,8)",
- "&vext_8 ($c,$c,$c,$odd?12:4)"
- );
-}
-
-$code.=<<___;
-#if (defined(__KERNEL__) && defined(CONFIG_KERNEL_MODE_NEON)) || (!defined(__KERNEL__) && __ARM_MAX_ARCH__>=7)
-.arch armv7-a
-.fpu neon
-
-# ifdef __KERNEL__
-.globl ChaCha20_neon
-@ For optimal performance it's appropriate for caller to enforce
-@ minimum input length, 193 bytes is suggested.
-# endif
-.type ChaCha20_neon,%function
-.align 5
-ChaCha20_neon:
- ldr r12,[sp,#0] @ pull pointer to counter and nonce
- stmdb sp!,{r0-r2,r4-r11,lr}
-.LChaCha20_neon:
- adr r14,.Lsigma
- vstmdb sp!,{d8-d15} @ ABI spec says so
- stmdb sp!,{r0-r3}
-
- vld1.32 {$b0-$c0},[r3] @ load key
- ldmia r3,{r4-r11} @ load key
-
- sub sp,sp,#4*(16+16)
- vld1.32 {$d0},[r12] @ load counter and nonce
- add r12,sp,#4*8
- ldmia r14,{r0-r3} @ load sigma
- vld1.32 {$a0},[r14]! @ load sigma
- vld1.32 {$t0},[r14]! @ one
- @ vld1.32 {$t3#lo},[r14] @ rot8
- vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce
- vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key
-
- str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
- str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
- vshl.i32 $t1#lo,$t0#lo,#1 @ two
- vstr $t0#lo,[sp,#4*(16+0)]
- vshl.i32 $t2#lo,$t0#lo,#2 @ four
- vstr $t1#lo,[sp,#4*(16+2)]
- vmov $a1,$a0
- vstr $t2#lo,[sp,#4*(16+4)]
- vmov $a2,$a0
- @ vstr $t3#lo,[sp,#4*(16+6)]
- vmov $b1,$b0
- vmov $b2,$b0
- b .Loop_neon_enter
-
-.align 4
-.Loop_neon_outer:
- ldmia sp,{r0-r9} @ load key material
- cmp @t[3],#64*2 @ if len<=64*2
- bls .Lbreak_neon @ switch to integer-only
- @ vldr $t3#lo,[sp,#4*(16+6)] @ rot8
- vmov $a1,$a0
- str @t[3],[sp,#4*(32+2)] @ save len
- vmov $a2,$a0
- str r12, [sp,#4*(32+1)] @ save inp
- vmov $b1,$b0
- str r14, [sp,#4*(32+0)] @ save out
- vmov $b2,$b0
-.Loop_neon_enter:
- ldr @t[3], [sp,#4*(15)]
- mov @x[4],@x[4],ror#19 @ twist b[0..3]
- vadd.i32 $d1,$d0,$t0 @ counter+1
- ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
- mov @x[5],@x[5],ror#19
- vmov $c1,$c0
- ldr @t[2], [sp,#4*(13)]
- mov @x[6],@x[6],ror#19
- vmov $c2,$c0
- ldr @x[14],[sp,#4*(14)]
- mov @x[7],@x[7],ror#19
- vadd.i32 $d2,$d1,$t0 @ counter+2
- add @x[12],@x[12],#3 @ counter+3
- mov @t[3],@t[3],ror#8 @ twist d[0..3]
- mov @x[12],@x[12],ror#8
- mov @t[2],@t[2],ror#8
- mov @x[14],@x[14],ror#8
- str @t[3], [sp,#4*(16+15)]
- mov @t[3],#10
- b .Loop_neon
-
-.align 4
-.Loop_neon:
- subs @t[3],@t[3],#1
-___
- my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
- my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
- my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
- my @thread3=&ROUND(0,4,8,12);
-
- foreach (@thread0) {
- eval; eval(shift(@thread3));
- eval(shift(@thread1)); eval(shift(@thread3));
- eval(shift(@thread2)); eval(shift(@thread3));
- }
-
- @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
- @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
- @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
- @thread3=&ROUND(0,5,10,15);
-
- foreach (@thread0) {
- eval; eval(shift(@thread3));
- eval(shift(@thread1)); eval(shift(@thread3));
- eval(shift(@thread2)); eval(shift(@thread3));
- }
-$code.=<<___;
- bne .Loop_neon
-
- add @t[3],sp,#32
- vld1.32 {$t0-$t1},[sp] @ load key material
- vld1.32 {$t2-$t3},[@t[3]]
-
- ldr @t[3],[sp,#4*(32+2)] @ load len
-
- str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
- str @t[1], [sp,#4*(16+9)]
- str @x[12],[sp,#4*(16+12)]
- str @t[2], [sp,#4*(16+13)]
- str @x[14],[sp,#4*(16+14)]
-
- @ at this point we have first half of 512-bit result in
- @ @x[0-7] and second half at sp+4*(16+8)
-
- ldr r12,[sp,#4*(32+1)] @ load inp
- ldr r14,[sp,#4*(32+0)] @ load out
-
- vadd.i32 $a0,$a0,$t0 @ accumulate key material
- vadd.i32 $a1,$a1,$t0
- vadd.i32 $a2,$a2,$t0
- vldr $t0#lo,[sp,#4*(16+0)] @ one
-
- vadd.i32 $b0,$b0,$t1
- vadd.i32 $b1,$b1,$t1
- vadd.i32 $b2,$b2,$t1
- vldr $t1#lo,[sp,#4*(16+2)] @ two
-
- vadd.i32 $c0,$c0,$t2
- vadd.i32 $c1,$c1,$t2
- vadd.i32 $c2,$c2,$t2
- vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1
- vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2
-
- vadd.i32 $d0,$d0,$t3
- vadd.i32 $d1,$d1,$t3
- vadd.i32 $d2,$d2,$t3
-
- cmp @t[3],#64*4
- blo .Ltail_neon
-
- vld1.8 {$t0-$t1},[r12]! @ load input
- mov @t[3],sp
- vld1.8 {$t2-$t3},[r12]!
- veor $a0,$a0,$t0 @ xor with input
- veor $b0,$b0,$t1
- vld1.8 {$t0-$t1},[r12]!
- veor $c0,$c0,$t2
- veor $d0,$d0,$t3
- vld1.8 {$t2-$t3},[r12]!
-
- veor $a1,$a1,$t0
- vst1.8 {$a0-$b0},[r14]! @ store output
- veor $b1,$b1,$t1
- vld1.8 {$t0-$t1},[r12]!
- veor $c1,$c1,$t2
- vst1.8 {$c0-$d0},[r14]!
- veor $d1,$d1,$t3
- vld1.8 {$t2-$t3},[r12]!
-
- veor $a2,$a2,$t0
- vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration
- veor $t0#hi,$t0#hi,$t0#hi
- vldr $t0#lo,[sp,#4*(16+4)] @ four
- veor $b2,$b2,$t1
- vld1.32 {$c0-$d0},[@t[3]]
- veor $c2,$c2,$t2
- vst1.8 {$a1-$b1},[r14]!
- veor $d2,$d2,$t3
- vst1.8 {$c1-$d1},[r14]!
-
- vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value
- vldr $t0#lo,[sp,#4*(16+0)] @ one
-
- ldmia sp,{@t[0]-@t[3]} @ load key material
- add @x[0],@x[0],@t[0] @ accumulate key material
- ldr @t[0],[r12],#16 @ load input
- vst1.8 {$a2-$b2},[r14]!
- add @x[1],@x[1],@t[1]
- ldr @t[1],[r12,#-12]
- vst1.8 {$c2-$d2},[r14]!
- add @x[2],@x[2],@t[2]
- ldr @t[2],[r12,#-8]
- add @x[3],@x[3],@t[3]
- ldr @t[3],[r12,#-4]
-# ifdef __ARMEB__
- rev @x[0],@x[0]
- rev @x[1],@x[1]
- rev @x[2],@x[2]
- rev @x[3],@x[3]
-# endif
- eor @x[0],@x[0],@t[0] @ xor with input
- add @t[0],sp,#4*(4)
- eor @x[1],@x[1],@t[1]
- str @x[0],[r14],#16 @ store output
- eor @x[2],@x[2],@t[2]
- str @x[1],[r14,#-12]
- eor @x[3],@x[3],@t[3]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
- str @x[2],[r14,#-8]
- str @x[3],[r14,#-4]
-
- add @x[4],@t[0],@x[4],ror#13 @ accumulate key material
- ldr @t[0],[r12],#16 @ load input
- add @x[5],@t[1],@x[5],ror#13
- ldr @t[1],[r12,#-12]
- add @x[6],@t[2],@x[6],ror#13
- ldr @t[2],[r12,#-8]
- add @x[7],@t[3],@x[7],ror#13
- ldr @t[3],[r12,#-4]
-# ifdef __ARMEB__
- rev @x[4],@x[4]
- rev @x[5],@x[5]
- rev @x[6],@x[6]
- rev @x[7],@x[7]
-# endif
- eor @x[4],@x[4],@t[0]
- add @t[0],sp,#4*(8)
- eor @x[5],@x[5],@t[1]
- str @x[4],[r14],#16 @ store output
- eor @x[6],@x[6],@t[2]
- str @x[5],[r14,#-12]
- eor @x[7],@x[7],@t[3]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
- str @x[6],[r14,#-8]
- add @x[0],sp,#4*(16+8)
- str @x[7],[r14,#-4]
-
- ldmia @x[0],{@x[0]-@x[7]} @ load second half
-
- add @x[0],@x[0],@t[0] @ accumulate key material
- ldr @t[0],[r12],#16 @ load input
- add @x[1],@x[1],@t[1]
- ldr @t[1],[r12,#-12]
-# ifdef __thumb2__
- it hi
-# endif
- strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
- add @x[2],@x[2],@t[2]
- ldr @t[2],[r12,#-8]
-# ifdef __thumb2__
- it hi
-# endif
- strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
- add @x[3],@x[3],@t[3]
- ldr @t[3],[r12,#-4]
-# ifdef __ARMEB__
- rev @x[0],@x[0]
- rev @x[1],@x[1]
- rev @x[2],@x[2]
- rev @x[3],@x[3]
-# endif
- eor @x[0],@x[0],@t[0]
- add @t[0],sp,#4*(12)
- eor @x[1],@x[1],@t[1]
- str @x[0],[r14],#16 @ store output
- eor @x[2],@x[2],@t[2]
- str @x[1],[r14,#-12]
- eor @x[3],@x[3],@t[3]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
- str @x[2],[r14,#-8]
- str @x[3],[r14,#-4]
-
- add @x[4],@t[0],@x[4],ror#24 @ accumulate key material
- add @t[0],@t[0],#4 @ next counter value
- add @x[5],@t[1],@x[5],ror#24
- str @t[0],[sp,#4*(12)] @ save next counter value
- ldr @t[0],[r12],#16 @ load input
- add @x[6],@t[2],@x[6],ror#24
- add @x[4],@x[4],#3 @ counter+3
- ldr @t[1],[r12,#-12]
- add @x[7],@t[3],@x[7],ror#24
- ldr @t[2],[r12,#-8]
- ldr @t[3],[r12,#-4]
-# ifdef __ARMEB__
- rev @x[4],@x[4]
- rev @x[5],@x[5]
- rev @x[6],@x[6]
- rev @x[7],@x[7]
-# endif
- eor @x[4],@x[4],@t[0]
-# ifdef __thumb2__
- it hi
-# endif
- ldrhi @t[0],[sp,#4*(32+2)] @ re-load len
- eor @x[5],@x[5],@t[1]
- eor @x[6],@x[6],@t[2]
- str @x[4],[r14],#16 @ store output
- eor @x[7],@x[7],@t[3]
- str @x[5],[r14,#-12]
- sub @t[3],@t[0],#64*4 @ len-=64*4
- str @x[6],[r14,#-8]
- str @x[7],[r14,#-4]
- bhi .Loop_neon_outer
-
- b .Ldone_neon
-
-.align 4
-.Lbreak_neon:
- @ harmonize NEON and integer-only stack frames: load data
- @ from NEON frame, but save to integer-only one; distance
- @ between the two is 4*(32+4+16-32)=4*(20).
-
- str @t[3], [sp,#4*(20+32+2)] @ save len
- add @t[3],sp,#4*(32+4)
- str r12, [sp,#4*(20+32+1)] @ save inp
- str r14, [sp,#4*(20+32+0)] @ save out
-
- ldr @x[12],[sp,#4*(16+10)]
- ldr @x[14],[sp,#4*(16+11)]
- vldmia @t[3],{d8-d15} @ fulfill ABI requirement
- str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]"
- str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]"
-
- ldr @t[3], [sp,#4*(15)]
- mov @x[4],@x[4],ror#19 @ twist b[0..3]
- ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
- mov @x[5],@x[5],ror#19
- ldr @t[2], [sp,#4*(13)]
- mov @x[6],@x[6],ror#19
- ldr @x[14],[sp,#4*(14)]
- mov @x[7],@x[7],ror#19
- mov @t[3],@t[3],ror#8 @ twist d[0..3]
- mov @x[12],@x[12],ror#8
- mov @t[2],@t[2],ror#8
- mov @x[14],@x[14],ror#8
- str @t[3], [sp,#4*(20+16+15)]
- add @t[3],sp,#4*(20)
- vst1.32 {$a0-$b0},[@t[3]]! @ copy key
- add sp,sp,#4*(20) @ switch frame
- vst1.32 {$c0-$d0},[@t[3]]
- mov @t[3],#10
- b .Loop @ go integer-only
-
-.align 4
-.Ltail_neon:
- cmp @t[3],#64*3
- bhs .L192_or_more_neon
- cmp @t[3],#64*2
- bhs .L128_or_more_neon
- cmp @t[3],#64*1
- bhs .L64_or_more_neon
-
- add @t[0],sp,#4*(8)
- vst1.8 {$a0-$b0},[sp]
- add @t[2],sp,#4*(0)
- vst1.8 {$c0-$d0},[@t[0]]
- b .Loop_tail_neon
-
-.align 4
-.L64_or_more_neon:
- vld1.8 {$t0-$t1},[r12]!
- vld1.8 {$t2-$t3},[r12]!
- veor $a0,$a0,$t0
- veor $b0,$b0,$t1
- veor $c0,$c0,$t2
- veor $d0,$d0,$t3
- vst1.8 {$a0-$b0},[r14]!
- vst1.8 {$c0-$d0},[r14]!
-
- beq .Ldone_neon
-
- add @t[0],sp,#4*(8)
- vst1.8 {$a1-$b1},[sp]
- add @t[2],sp,#4*(0)
- vst1.8 {$c1-$d1},[@t[0]]
- sub @t[3],@t[3],#64*1 @ len-=64*1
- b .Loop_tail_neon
-
-.align 4
-.L128_or_more_neon:
- vld1.8 {$t0-$t1},[r12]!
- vld1.8 {$t2-$t3},[r12]!
- veor $a0,$a0,$t0
- veor $b0,$b0,$t1
- vld1.8 {$t0-$t1},[r12]!
- veor $c0,$c0,$t2
- veor $d0,$d0,$t3
- vld1.8 {$t2-$t3},[r12]!
-
- veor $a1,$a1,$t0
- veor $b1,$b1,$t1
- vst1.8 {$a0-$b0},[r14]!
- veor $c1,$c1,$t2
- vst1.8 {$c0-$d0},[r14]!
- veor $d1,$d1,$t3
- vst1.8 {$a1-$b1},[r14]!
- vst1.8 {$c1-$d1},[r14]!
-
- beq .Ldone_neon
-
- add @t[0],sp,#4*(8)
- vst1.8 {$a2-$b2},[sp]
- add @t[2],sp,#4*(0)
- vst1.8 {$c2-$d2},[@t[0]]
- sub @t[3],@t[3],#64*2 @ len-=64*2
- b .Loop_tail_neon
-
-.align 4
-.L192_or_more_neon:
- vld1.8 {$t0-$t1},[r12]!
- vld1.8 {$t2-$t3},[r12]!
- veor $a0,$a0,$t0
- veor $b0,$b0,$t1
- vld1.8 {$t0-$t1},[r12]!
- veor $c0,$c0,$t2
- veor $d0,$d0,$t3
- vld1.8 {$t2-$t3},[r12]!
-
- veor $a1,$a1,$t0
- veor $b1,$b1,$t1
- vld1.8 {$t0-$t1},[r12]!
- veor $c1,$c1,$t2
- vst1.8 {$a0-$b0},[r14]!
- veor $d1,$d1,$t3
- vld1.8 {$t2-$t3},[r12]!
-
- veor $a2,$a2,$t0
- vst1.8 {$c0-$d0},[r14]!
- veor $b2,$b2,$t1
- vst1.8 {$a1-$b1},[r14]!
- veor $c2,$c2,$t2
- vst1.8 {$c1-$d1},[r14]!
- veor $d2,$d2,$t3
- vst1.8 {$a2-$b2},[r14]!
- vst1.8 {$c2-$d2},[r14]!
-
- beq .Ldone_neon
-
- ldmia sp,{@t[0]-@t[3]} @ load key material
- add @x[0],@x[0],@t[0] @ accumulate key material
- add @t[0],sp,#4*(4)
- add @x[1],@x[1],@t[1]
- add @x[2],@x[2],@t[2]
- add @x[3],@x[3],@t[3]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
-
- add @x[4],@t[0],@x[4],ror#13 @ accumulate key material
- add @t[0],sp,#4*(8)
- add @x[5],@t[1],@x[5],ror#13
- add @x[6],@t[2],@x[6],ror#13
- add @x[7],@t[3],@x[7],ror#13
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
-# ifdef __ARMEB__
- rev @x[0],@x[0]
- rev @x[1],@x[1]
- rev @x[2],@x[2]
- rev @x[3],@x[3]
- rev @x[4],@x[4]
- rev @x[5],@x[5]
- rev @x[6],@x[6]
- rev @x[7],@x[7]
-# endif
- stmia sp,{@x[0]-@x[7]}
- add @x[0],sp,#4*(16+8)
-
- ldmia @x[0],{@x[0]-@x[7]} @ load second half
-
- add @x[0],@x[0],@t[0] @ accumulate key material
- add @t[0],sp,#4*(12)
- add @x[1],@x[1],@t[1]
- add @x[2],@x[2],@t[2]
- add @x[3],@x[3],@t[3]
- ldmia @t[0],{@t[0]-@t[3]} @ load key material
-
- add @x[4],@t[0],@x[4],ror#24 @ accumulate key material
- add @t[0],sp,#4*(8)
- add @x[5],@t[1],@x[5],ror#24
- add @x[4],@x[4],#3 @ counter+3
- add @x[6],@t[2],@x[6],ror#24
- add @x[7],@t[3],@x[7],ror#24
- ldr @t[3],[sp,#4*(32+2)] @ re-load len
-# ifdef __ARMEB__
- rev @x[0],@x[0]
- rev @x[1],@x[1]
- rev @x[2],@x[2]
- rev @x[3],@x[3]
- rev @x[4],@x[4]
- rev @x[5],@x[5]
- rev @x[6],@x[6]
- rev @x[7],@x[7]
-# endif
- stmia @t[0],{@x[0]-@x[7]}
- add @t[2],sp,#4*(0)
- sub @t[3],@t[3],#64*3 @ len-=64*3
-
-.Loop_tail_neon:
- ldrb @t[0],[@t[2]],#1 @ read buffer on stack
- ldrb @t[1],[r12],#1 @ read input
- subs @t[3],@t[3],#1
- eor @t[0],@t[0],@t[1]
- strb @t[0],[r14],#1 @ store output
- bne .Loop_tail_neon
-
-.Ldone_neon:
- add sp,sp,#4*(32+4)
- vldmia sp,{d8-d15}
- add sp,sp,#4*(16+3)
- ldmia sp!,{r4-r11,pc}
-.size ChaCha20_neon,.-ChaCha20_neon
-# ifndef __KERNEL__
-.comm OPENSSL_armcap_P,4,4
-# endif
-#endif
-___
-}}}
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/@/ and !/^$/);
- print;
-}
-close SELF;
-
-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/geo;
-
- s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
-
- print $_,"\n";
-}
-close STDOUT;
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm64.pl b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm64.pl
deleted file mode 100755
index ac14a9924165..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm64.pl
+++ /dev/null
@@ -1,1163 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-#
-# This code is taken from the OpenSSL project but the author, Andy Polyakov,
-# has relicensed it under the licenses specified in the SPDX header above.
-# The original headers, including the original license headers, are
-# included below for completeness.
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# June 2015
-#
-# ChaCha20 for ARMv8.
-#
-# Performance in cycles per byte out of large buffer.
-#
-# IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU(*)
-#
-# Apple A7 5.50/+49% 3.33 1.70
-# Cortex-A53 8.40/+80% 4.72 4.72(**)
-# Cortex-A57 8.06/+43% 4.90 4.43(***)
-# Denver 4.50/+82% 2.63 2.67(**)
-# X-Gene 9.50/+46% 8.82 8.89(**)
-# Mongoose 8.00/+44% 3.64 3.25(***)
-# Kryo 8.17/+50% 4.83 4.65(***)
-#
-# (*) since no non-Apple processor exhibits significantly better
-# performance, the code path is #ifdef __APPLE__-ed;
-# (**) it's expected that doubling interleave factor doesn't help
-# all processors, only those with higher NEON latency and
-# higher instruction issue rate;
-# (***) expected improvement was actually higher;
-
-$flavour=shift;
-if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
-
-if ($flavour && $flavour ne "void") {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open STDOUT,"| \"$^X\" $xlate $flavour $output";
-} else {
- open STDOUT,">$output";
-}
-
-sub AUTOLOAD() # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
- my $arg = pop;
- $arg = "#$arg" if ($arg*1 eq $arg);
- $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
-}
-
-my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));
-
-my @x=map("x$_",(5..17,19..21));
-my @d=map("x$_",(22..28,30));
-
-sub ROUND {
-my ($a0,$b0,$c0,$d0)=@_;
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-
- (
- "&add_32 (@x[$a0],@x[$a0],@x[$b0])",
- "&add_32 (@x[$a1],@x[$a1],@x[$b1])",
- "&add_32 (@x[$a2],@x[$a2],@x[$b2])",
- "&add_32 (@x[$a3],@x[$a3],@x[$b3])",
- "&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
- "&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
- "&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
- "&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
- "&ror_32 (@x[$d0],@x[$d0],16)",
- "&ror_32 (@x[$d1],@x[$d1],16)",
- "&ror_32 (@x[$d2],@x[$d2],16)",
- "&ror_32 (@x[$d3],@x[$d3],16)",
-
- "&add_32 (@x[$c0],@x[$c0],@x[$d0])",
- "&add_32 (@x[$c1],@x[$c1],@x[$d1])",
- "&add_32 (@x[$c2],@x[$c2],@x[$d2])",
- "&add_32 (@x[$c3],@x[$c3],@x[$d3])",
- "&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
- "&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
- "&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
- "&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
- "&ror_32 (@x[$b0],@x[$b0],20)",
- "&ror_32 (@x[$b1],@x[$b1],20)",
- "&ror_32 (@x[$b2],@x[$b2],20)",
- "&ror_32 (@x[$b3],@x[$b3],20)",
-
- "&add_32 (@x[$a0],@x[$a0],@x[$b0])",
- "&add_32 (@x[$a1],@x[$a1],@x[$b1])",
- "&add_32 (@x[$a2],@x[$a2],@x[$b2])",
- "&add_32 (@x[$a3],@x[$a3],@x[$b3])",
- "&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
- "&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
- "&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
- "&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
- "&ror_32 (@x[$d0],@x[$d0],24)",
- "&ror_32 (@x[$d1],@x[$d1],24)",
- "&ror_32 (@x[$d2],@x[$d2],24)",
- "&ror_32 (@x[$d3],@x[$d3],24)",
-
- "&add_32 (@x[$c0],@x[$c0],@x[$d0])",
- "&add_32 (@x[$c1],@x[$c1],@x[$d1])",
- "&add_32 (@x[$c2],@x[$c2],@x[$d2])",
- "&add_32 (@x[$c3],@x[$c3],@x[$d3])",
- "&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
- "&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
- "&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
- "&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
- "&ror_32 (@x[$b0],@x[$b0],25)",
- "&ror_32 (@x[$b1],@x[$b1],25)",
- "&ror_32 (@x[$b2],@x[$b2],25)",
- "&ror_32 (@x[$b3],@x[$b3],25)"
- );
-}
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-.extern OPENSSL_armcap_P
-#else
-# define ChaCha20_ctr32 chacha20_arm
-# define ChaCha20_neon chacha20_neon
-#endif
-
-.text
-
-.align 5
-.Lsigma:
-.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
-.Lone:
-.long 1,0,0,0
-#ifndef __KERNEL__
-.LOPENSSL_armcap_P:
-# ifdef __ILP32__
-.long OPENSSL_armcap_P-.
-# else
-.quad OPENSSL_armcap_P-.
-# endif
-#endif
-
-.globl ChaCha20_ctr32
-.type ChaCha20_ctr32,%function
-.align 5
-ChaCha20_ctr32:
- cbz $len,.Labort
-#ifndef __KERNEL__
- adr @x[0],.LOPENSSL_armcap_P
- cmp $len,#192
- b.lo .Lshort
-# ifdef __ILP32__
- ldrsw @x[1],[@x[0]]
-# else
- ldr @x[1],[@x[0]]
-# endif
- ldr w17,[@x[1],@x[0]]
- tst w17,#ARMV7_NEON
- b.ne ChaCha20_neon
-
-.Lshort:
-#endif
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
-
- adr @x[0],.Lsigma
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- sub sp,sp,#64
-
- ldp @d[0],@d[1],[@x[0]] // load sigma
- ldp @d[2],@d[3],[$key] // load key
- ldp @d[4],@d[5],[$key,#16]
- ldp @d[6],@d[7],[$ctr] // load counter
-#ifdef __AARCH64EB__
- ror @d[2],@d[2],#32
- ror @d[3],@d[3],#32
- ror @d[4],@d[4],#32
- ror @d[5],@d[5],#32
- ror @d[6],@d[6],#32
- ror @d[7],@d[7],#32
-#endif
-
-.Loop_outer:
- mov.32 @x[0],@d[0] // unpack key block
- lsr @x[1],@d[0],#32
- mov.32 @x[2],@d[1]
- lsr @x[3],@d[1],#32
- mov.32 @x[4],@d[2]
- lsr @x[5],@d[2],#32
- mov.32 @x[6],@d[3]
- lsr @x[7],@d[3],#32
- mov.32 @x[8],@d[4]
- lsr @x[9],@d[4],#32
- mov.32 @x[10],@d[5]
- lsr @x[11],@d[5],#32
- mov.32 @x[12],@d[6]
- lsr @x[13],@d[6],#32
- mov.32 @x[14],@d[7]
- lsr @x[15],@d[7],#32
-
- mov $ctr,#10
- subs $len,$len,#64
-.Loop:
- sub $ctr,$ctr,#1
-___
- foreach (&ROUND(0, 4, 8,12)) { eval; }
- foreach (&ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- cbnz $ctr,.Loop
-
- add.32 @x[0],@x[0],@d[0] // accumulate key block
- add @x[1],@x[1],@d[0],lsr#32
- add.32 @x[2],@x[2],@d[1]
- add @x[3],@x[3],@d[1],lsr#32
- add.32 @x[4],@x[4],@d[2]
- add @x[5],@x[5],@d[2],lsr#32
- add.32 @x[6],@x[6],@d[3]
- add @x[7],@x[7],@d[3],lsr#32
- add.32 @x[8],@x[8],@d[4]
- add @x[9],@x[9],@d[4],lsr#32
- add.32 @x[10],@x[10],@d[5]
- add @x[11],@x[11],@d[5],lsr#32
- add.32 @x[12],@x[12],@d[6]
- add @x[13],@x[13],@d[6],lsr#32
- add.32 @x[14],@x[14],@d[7]
- add @x[15],@x[15],@d[7],lsr#32
-
- b.lo .Ltail
-
- add @x[0],@x[0],@x[1],lsl#32 // pack
- add @x[2],@x[2],@x[3],lsl#32
- ldp @x[1],@x[3],[$inp,#0] // load input
- add @x[4],@x[4],@x[5],lsl#32
- add @x[6],@x[6],@x[7],lsl#32
- ldp @x[5],@x[7],[$inp,#16]
- add @x[8],@x[8],@x[9],lsl#32
- add @x[10],@x[10],@x[11],lsl#32
- ldp @x[9],@x[11],[$inp,#32]
- add @x[12],@x[12],@x[13],lsl#32
- add @x[14],@x[14],@x[15],lsl#32
- ldp @x[13],@x[15],[$inp,#48]
- add $inp,$inp,#64
-#ifdef __AARCH64EB__
- rev @x[0],@x[0]
- rev @x[2],@x[2]
- rev @x[4],@x[4]
- rev @x[6],@x[6]
- rev @x[8],@x[8]
- rev @x[10],@x[10]
- rev @x[12],@x[12]
- rev @x[14],@x[14]
-#endif
- eor @x[0],@x[0],@x[1]
- eor @x[2],@x[2],@x[3]
- eor @x[4],@x[4],@x[5]
- eor @x[6],@x[6],@x[7]
- eor @x[8],@x[8],@x[9]
- eor @x[10],@x[10],@x[11]
- eor @x[12],@x[12],@x[13]
- eor @x[14],@x[14],@x[15]
-
- stp @x[0],@x[2],[$out,#0] // store output
- add @d[6],@d[6],#1 // increment counter
- stp @x[4],@x[6],[$out,#16]
- stp @x[8],@x[10],[$out,#32]
- stp @x[12],@x[14],[$out,#48]
- add $out,$out,#64
-
- b.hi .Loop_outer
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
-.Labort:
- ret
-
-.align 4
-.Ltail:
- add $len,$len,#64
-.Less_than_64:
- sub $out,$out,#1
- add $inp,$inp,$len
- add $out,$out,$len
- add $ctr,sp,$len
- neg $len,$len
-
- add @x[0],@x[0],@x[1],lsl#32 // pack
- add @x[2],@x[2],@x[3],lsl#32
- add @x[4],@x[4],@x[5],lsl#32
- add @x[6],@x[6],@x[7],lsl#32
- add @x[8],@x[8],@x[9],lsl#32
- add @x[10],@x[10],@x[11],lsl#32
- add @x[12],@x[12],@x[13],lsl#32
- add @x[14],@x[14],@x[15],lsl#32
-#ifdef __AARCH64EB__
- rev @x[0],@x[0]
- rev @x[2],@x[2]
- rev @x[4],@x[4]
- rev @x[6],@x[6]
- rev @x[8],@x[8]
- rev @x[10],@x[10]
- rev @x[12],@x[12]
- rev @x[14],@x[14]
-#endif
- stp @x[0],@x[2],[sp,#0]
- stp @x[4],@x[6],[sp,#16]
- stp @x[8],@x[10],[sp,#32]
- stp @x[12],@x[14],[sp,#48]
-
-.Loop_tail:
- ldrb w10,[$inp,$len]
- ldrb w11,[$ctr,$len]
- add $len,$len,#1
- eor w10,w10,w11
- strb w10,[$out,$len]
- cbnz $len,.Loop_tail
-
- stp xzr,xzr,[sp,#0]
- stp xzr,xzr,[sp,#16]
- stp xzr,xzr,[sp,#32]
- stp xzr,xzr,[sp,#48]
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- ret
-.size ChaCha20_ctr32,.-ChaCha20_ctr32
-___
-
-{{{
-my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
- map("v$_.4s",(0..7,16..23));
-my (@K)=map("v$_.4s",(24..30));
-my $ONE="v31.4s";
-
-sub NEONROUND {
-my $odd = pop;
-my ($a,$b,$c,$d,$t)=@_;
-
- (
- "&add ('$a','$a','$b')",
- "&eor ('$d','$d','$a')",
- "&rev32_16 ('$d','$d')", # vrot ($d,16)
-
- "&add ('$c','$c','$d')",
- "&eor ('$t','$b','$c')",
- "&ushr ('$b','$t',20)",
- "&sli ('$b','$t',12)",
-
- "&add ('$a','$a','$b')",
- "&eor ('$t','$d','$a')",
- "&ushr ('$d','$t',24)",
- "&sli ('$d','$t',8)",
-
- "&add ('$c','$c','$d')",
- "&eor ('$t','$b','$c')",
- "&ushr ('$b','$t',25)",
- "&sli ('$b','$t',7)",
-
- "&ext ('$a','$a','$a',$odd?4:12)",
- "&ext ('$d','$d','$d',8)",
- "&ext ('$c','$c','$c',$odd?12:4)"
- );
-}
-
-$code.=<<___;
-#if !defined(__KERNEL__) || defined(CONFIG_KERNEL_MODE_NEON)
-#ifdef __KERNEL__
-.globl ChaCha20_neon
-.type ChaCha20_neon,%function
-#endif
-.type ChaCha20_neon,%function
-.align 5
-ChaCha20_neon:
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
-
- adr @x[0],.Lsigma
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
-#ifdef __APPLE__
- cmp $len,#512
- b.hs .L512_or_more_neon
-#endif
-
- sub sp,sp,#64
-
- ldp @d[0],@d[1],[@x[0]] // load sigma
- ld1 {@K[0]},[@x[0]],#16
- ldp @d[2],@d[3],[$key] // load key
- ldp @d[4],@d[5],[$key,#16]
- ld1 {@K[1],@K[2]},[$key]
- ldp @d[6],@d[7],[$ctr] // load counter
- ld1 {@K[3]},[$ctr]
- ld1 {$ONE},[@x[0]]
-#ifdef __AARCH64EB__
- rev64 @K[0],@K[0]
- ror @d[2],@d[2],#32
- ror @d[3],@d[3],#32
- ror @d[4],@d[4],#32
- ror @d[5],@d[5],#32
- ror @d[6],@d[6],#32
- ror @d[7],@d[7],#32
-#endif
- add @K[3],@K[3],$ONE // += 1
- add @K[4],@K[3],$ONE
- add @K[5],@K[4],$ONE
- shl $ONE,$ONE,#2 // 1 -> 4
-
-.Loop_outer_neon:
- mov.32 @x[0],@d[0] // unpack key block
- lsr @x[1],@d[0],#32
- mov $A0,@K[0]
- mov.32 @x[2],@d[1]
- lsr @x[3],@d[1],#32
- mov $A1,@K[0]
- mov.32 @x[4],@d[2]
- lsr @x[5],@d[2],#32
- mov $A2,@K[0]
- mov.32 @x[6],@d[3]
- mov $B0,@K[1]
- lsr @x[7],@d[3],#32
- mov $B1,@K[1]
- mov.32 @x[8],@d[4]
- mov $B2,@K[1]
- lsr @x[9],@d[4],#32
- mov $D0,@K[3]
- mov.32 @x[10],@d[5]
- mov $D1,@K[4]
- lsr @x[11],@d[5],#32
- mov $D2,@K[5]
- mov.32 @x[12],@d[6]
- mov $C0,@K[2]
- lsr @x[13],@d[6],#32
- mov $C1,@K[2]
- mov.32 @x[14],@d[7]
- mov $C2,@K[2]
- lsr @x[15],@d[7],#32
-
- mov $ctr,#10
- subs $len,$len,#256
-.Loop_neon:
- sub $ctr,$ctr,#1
-___
- my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
- my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
- my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
- my @thread3=&ROUND(0,4,8,12);
-
- foreach (@thread0) {
- eval; eval(shift(@thread3));
- eval(shift(@thread1)); eval(shift(@thread3));
- eval(shift(@thread2)); eval(shift(@thread3));
- }
-
- @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
- @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
- @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
- @thread3=&ROUND(0,5,10,15);
-
- foreach (@thread0) {
- eval; eval(shift(@thread3));
- eval(shift(@thread1)); eval(shift(@thread3));
- eval(shift(@thread2)); eval(shift(@thread3));
- }
-$code.=<<___;
- cbnz $ctr,.Loop_neon
-
- add.32 @x[0],@x[0],@d[0] // accumulate key block
- add $A0,$A0,@K[0]
- add @x[1],@x[1],@d[0],lsr#32
- add $A1,$A1,@K[0]
- add.32 @x[2],@x[2],@d[1]
- add $A2,$A2,@K[0]
- add @x[3],@x[3],@d[1],lsr#32
- add $C0,$C0,@K[2]
- add.32 @x[4],@x[4],@d[2]
- add $C1,$C1,@K[2]
- add @x[5],@x[5],@d[2],lsr#32
- add $C2,$C2,@K[2]
- add.32 @x[6],@x[6],@d[3]
- add $D0,$D0,@K[3]
- add @x[7],@x[7],@d[3],lsr#32
- add.32 @x[8],@x[8],@d[4]
- add $D1,$D1,@K[4]
- add @x[9],@x[9],@d[4],lsr#32
- add.32 @x[10],@x[10],@d[5]
- add $D2,$D2,@K[5]
- add @x[11],@x[11],@d[5],lsr#32
- add.32 @x[12],@x[12],@d[6]
- add $B0,$B0,@K[1]
- add @x[13],@x[13],@d[6],lsr#32
- add.32 @x[14],@x[14],@d[7]
- add $B1,$B1,@K[1]
- add @x[15],@x[15],@d[7],lsr#32
- add $B2,$B2,@K[1]
-
- b.lo .Ltail_neon
-
- add @x[0],@x[0],@x[1],lsl#32 // pack
- add @x[2],@x[2],@x[3],lsl#32
- ldp @x[1],@x[3],[$inp,#0] // load input
- add @x[4],@x[4],@x[5],lsl#32
- add @x[6],@x[6],@x[7],lsl#32
- ldp @x[5],@x[7],[$inp,#16]
- add @x[8],@x[8],@x[9],lsl#32
- add @x[10],@x[10],@x[11],lsl#32
- ldp @x[9],@x[11],[$inp,#32]
- add @x[12],@x[12],@x[13],lsl#32
- add @x[14],@x[14],@x[15],lsl#32
- ldp @x[13],@x[15],[$inp,#48]
- add $inp,$inp,#64
-#ifdef __AARCH64EB__
- rev @x[0],@x[0]
- rev @x[2],@x[2]
- rev @x[4],@x[4]
- rev @x[6],@x[6]
- rev @x[8],@x[8]
- rev @x[10],@x[10]
- rev @x[12],@x[12]
- rev @x[14],@x[14]
-#endif
- ld1.8 {$T0-$T3},[$inp],#64
- eor @x[0],@x[0],@x[1]
- eor @x[2],@x[2],@x[3]
- eor @x[4],@x[4],@x[5]
- eor @x[6],@x[6],@x[7]
- eor @x[8],@x[8],@x[9]
- eor $A0,$A0,$T0
- eor @x[10],@x[10],@x[11]
- eor $B0,$B0,$T1
- eor @x[12],@x[12],@x[13]
- eor $C0,$C0,$T2
- eor @x[14],@x[14],@x[15]
- eor $D0,$D0,$T3
- ld1.8 {$T0-$T3},[$inp],#64
-
- stp @x[0],@x[2],[$out,#0] // store output
- add @d[6],@d[6],#4 // increment counter
- stp @x[4],@x[6],[$out,#16]
- add @K[3],@K[3],$ONE // += 4
- stp @x[8],@x[10],[$out,#32]
- add @K[4],@K[4],$ONE
- stp @x[12],@x[14],[$out,#48]
- add @K[5],@K[5],$ONE
- add $out,$out,#64
-
- st1.8 {$A0-$D0},[$out],#64
- ld1.8 {$A0-$D0},[$inp],#64
-
- eor $A1,$A1,$T0
- eor $B1,$B1,$T1
- eor $C1,$C1,$T2
- eor $D1,$D1,$T3
- st1.8 {$A1-$D1},[$out],#64
-
- eor $A2,$A2,$A0
- eor $B2,$B2,$B0
- eor $C2,$C2,$C0
- eor $D2,$D2,$D0
- st1.8 {$A2-$D2},[$out],#64
-
- b.hi .Loop_outer_neon
-
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- ret
-
-.Ltail_neon:
- add $len,$len,#256
- cmp $len,#64
- b.lo .Less_than_64
-
- add @x[0],@x[0],@x[1],lsl#32 // pack
- add @x[2],@x[2],@x[3],lsl#32
- ldp @x[1],@x[3],[$inp,#0] // load input
- add @x[4],@x[4],@x[5],lsl#32
- add @x[6],@x[6],@x[7],lsl#32
- ldp @x[5],@x[7],[$inp,#16]
- add @x[8],@x[8],@x[9],lsl#32
- add @x[10],@x[10],@x[11],lsl#32
- ldp @x[9],@x[11],[$inp,#32]
- add @x[12],@x[12],@x[13],lsl#32
- add @x[14],@x[14],@x[15],lsl#32
- ldp @x[13],@x[15],[$inp,#48]
- add $inp,$inp,#64
-#ifdef __AARCH64EB__
- rev @x[0],@x[0]
- rev @x[2],@x[2]
- rev @x[4],@x[4]
- rev @x[6],@x[6]
- rev @x[8],@x[8]
- rev @x[10],@x[10]
- rev @x[12],@x[12]
- rev @x[14],@x[14]
-#endif
- eor @x[0],@x[0],@x[1]
- eor @x[2],@x[2],@x[3]
- eor @x[4],@x[4],@x[5]
- eor @x[6],@x[6],@x[7]
- eor @x[8],@x[8],@x[9]
- eor @x[10],@x[10],@x[11]
- eor @x[12],@x[12],@x[13]
- eor @x[14],@x[14],@x[15]
-
- stp @x[0],@x[2],[$out,#0] // store output
- add @d[6],@d[6],#4 // increment counter
- stp @x[4],@x[6],[$out,#16]
- stp @x[8],@x[10],[$out,#32]
- stp @x[12],@x[14],[$out,#48]
- add $out,$out,#64
- b.eq .Ldone_neon
- sub $len,$len,#64
- cmp $len,#64
- b.lo .Less_than_128
-
- ld1.8 {$T0-$T3},[$inp],#64
- eor $A0,$A0,$T0
- eor $B0,$B0,$T1
- eor $C0,$C0,$T2
- eor $D0,$D0,$T3
- st1.8 {$A0-$D0},[$out],#64
- b.eq .Ldone_neon
- sub $len,$len,#64
- cmp $len,#64
- b.lo .Less_than_192
-
- ld1.8 {$T0-$T3},[$inp],#64
- eor $A1,$A1,$T0
- eor $B1,$B1,$T1
- eor $C1,$C1,$T2
- eor $D1,$D1,$T3
- st1.8 {$A1-$D1},[$out],#64
- b.eq .Ldone_neon
- sub $len,$len,#64
-
- st1.8 {$A2-$D2},[sp]
- b .Last_neon
-
-.Less_than_128:
- st1.8 {$A0-$D0},[sp]
- b .Last_neon
-.Less_than_192:
- st1.8 {$A1-$D1},[sp]
- b .Last_neon
-
-.align 4
-.Last_neon:
- sub $out,$out,#1
- add $inp,$inp,$len
- add $out,$out,$len
- add $ctr,sp,$len
- neg $len,$len
-
-.Loop_tail_neon:
- ldrb w10,[$inp,$len]
- ldrb w11,[$ctr,$len]
- add $len,$len,#1
- eor w10,w10,w11
- strb w10,[$out,$len]
- cbnz $len,.Loop_tail_neon
-
- stp xzr,xzr,[sp,#0]
- stp xzr,xzr,[sp,#16]
- stp xzr,xzr,[sp,#32]
- stp xzr,xzr,[sp,#48]
-
-.Ldone_neon:
- ldp x19,x20,[x29,#16]
- add sp,sp,#64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- ret
-.size ChaCha20_neon,.-ChaCha20_neon
-___
-{
-my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
-my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
- $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));
-
-$code.=<<___;
-#ifdef __APPLE__
-.type ChaCha20_512_neon,%function
-.align 5
-ChaCha20_512_neon:
- stp x29,x30,[sp,#-96]!
- add x29,sp,#0
-
- adr @x[0],.Lsigma
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
-
-.L512_or_more_neon:
- sub sp,sp,#128+64
-
- ldp @d[0],@d[1],[@x[0]] // load sigma
- ld1 {@K[0]},[@x[0]],#16
- ldp @d[2],@d[3],[$key] // load key
- ldp @d[4],@d[5],[$key,#16]
- ld1 {@K[1],@K[2]},[$key]
- ldp @d[6],@d[7],[$ctr] // load counter
- ld1 {@K[3]},[$ctr]
- ld1 {$ONE},[@x[0]]
-# ifdef __AARCH64EB__
- rev64 @K[0],@K[0]
- ror @d[2],@d[2],#32
- ror @d[3],@d[3],#32
- ror @d[4],@d[4],#32
- ror @d[5],@d[5],#32
- ror @d[6],@d[6],#32
- ror @d[7],@d[7],#32
-# endif
- add @K[3],@K[3],$ONE // += 1
- stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part
- add @K[3],@K[3],$ONE // not typo
- str @K[2],[sp,#32]
- add @K[4],@K[3],$ONE
- add @K[5],@K[4],$ONE
- add @K[6],@K[5],$ONE
- shl $ONE,$ONE,#2 // 1 -> 4
-
- stp d8,d9,[sp,#128+0] // meet ABI requirements
- stp d10,d11,[sp,#128+16]
- stp d12,d13,[sp,#128+32]
- stp d14,d15,[sp,#128+48]
-
- sub $len,$len,#512 // not typo
-
-.Loop_outer_512_neon:
- mov $A0,@K[0]
- mov $A1,@K[0]
- mov $A2,@K[0]
- mov $A3,@K[0]
- mov $A4,@K[0]
- mov $A5,@K[0]
- mov $B0,@K[1]
- mov.32 @x[0],@d[0] // unpack key block
- mov $B1,@K[1]
- lsr @x[1],@d[0],#32
- mov $B2,@K[1]
- mov.32 @x[2],@d[1]
- mov $B3,@K[1]
- lsr @x[3],@d[1],#32
- mov $B4,@K[1]
- mov.32 @x[4],@d[2]
- mov $B5,@K[1]
- lsr @x[5],@d[2],#32
- mov $D0,@K[3]
- mov.32 @x[6],@d[3]
- mov $D1,@K[4]
- lsr @x[7],@d[3],#32
- mov $D2,@K[5]
- mov.32 @x[8],@d[4]
- mov $D3,@K[6]
- lsr @x[9],@d[4],#32
- mov $C0,@K[2]
- mov.32 @x[10],@d[5]
- mov $C1,@K[2]
- lsr @x[11],@d[5],#32
- add $D4,$D0,$ONE // +4
- mov.32 @x[12],@d[6]
- add $D5,$D1,$ONE // +4
- lsr @x[13],@d[6],#32
- mov $C2,@K[2]
- mov.32 @x[14],@d[7]
- mov $C3,@K[2]
- lsr @x[15],@d[7],#32
- mov $C4,@K[2]
- stp @K[3],@K[4],[sp,#48] // off-load key block, variable part
- mov $C5,@K[2]
- str @K[5],[sp,#80]
-
- mov $ctr,#5
- subs $len,$len,#512
-.Loop_upper_neon:
- sub $ctr,$ctr,#1
-___
- my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
- my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
- my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
- my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
- my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
- my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
- my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
- my $diff = ($#thread0+1)*6 - $#thread67 - 1;
- my $i = 0;
-
- foreach (@thread0) {
- eval; eval(shift(@thread67));
- eval(shift(@thread1)); eval(shift(@thread67));
- eval(shift(@thread2)); eval(shift(@thread67));
- eval(shift(@thread3)); eval(shift(@thread67));
- eval(shift(@thread4)); eval(shift(@thread67));
- eval(shift(@thread5)); eval(shift(@thread67));
- }
-
- @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
- @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
- @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
- @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
- @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
- @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
- @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
-
- foreach (@thread0) {
- eval; eval(shift(@thread67));
- eval(shift(@thread1)); eval(shift(@thread67));
- eval(shift(@thread2)); eval(shift(@thread67));
- eval(shift(@thread3)); eval(shift(@thread67));
- eval(shift(@thread4)); eval(shift(@thread67));
- eval(shift(@thread5)); eval(shift(@thread67));
- }
-$code.=<<___;
- cbnz $ctr,.Loop_upper_neon
-
- add.32 @x[0],@x[0],@d[0] // accumulate key block
- add @x[1],@x[1],@d[0],lsr#32
- add.32 @x[2],@x[2],@d[1]
- add @x[3],@x[3],@d[1],lsr#32
- add.32 @x[4],@x[4],@d[2]
- add @x[5],@x[5],@d[2],lsr#32
- add.32 @x[6],@x[6],@d[3]
- add @x[7],@x[7],@d[3],lsr#32
- add.32 @x[8],@x[8],@d[4]
- add @x[9],@x[9],@d[4],lsr#32
- add.32 @x[10],@x[10],@d[5]
- add @x[11],@x[11],@d[5],lsr#32
- add.32 @x[12],@x[12],@d[6]
- add @x[13],@x[13],@d[6],lsr#32
- add.32 @x[14],@x[14],@d[7]
- add @x[15],@x[15],@d[7],lsr#32
-
- add @x[0],@x[0],@x[1],lsl#32 // pack
- add @x[2],@x[2],@x[3],lsl#32
- ldp @x[1],@x[3],[$inp,#0] // load input
- add @x[4],@x[4],@x[5],lsl#32
- add @x[6],@x[6],@x[7],lsl#32
- ldp @x[5],@x[7],[$inp,#16]
- add @x[8],@x[8],@x[9],lsl#32
- add @x[10],@x[10],@x[11],lsl#32
- ldp @x[9],@x[11],[$inp,#32]
- add @x[12],@x[12],@x[13],lsl#32
- add @x[14],@x[14],@x[15],lsl#32
- ldp @x[13],@x[15],[$inp,#48]
- add $inp,$inp,#64
-# ifdef __AARCH64EB__
- rev @x[0],@x[0]
- rev @x[2],@x[2]
- rev @x[4],@x[4]
- rev @x[6],@x[6]
- rev @x[8],@x[8]
- rev @x[10],@x[10]
- rev @x[12],@x[12]
- rev @x[14],@x[14]
-# endif
- eor @x[0],@x[0],@x[1]
- eor @x[2],@x[2],@x[3]
- eor @x[4],@x[4],@x[5]
- eor @x[6],@x[6],@x[7]
- eor @x[8],@x[8],@x[9]
- eor @x[10],@x[10],@x[11]
- eor @x[12],@x[12],@x[13]
- eor @x[14],@x[14],@x[15]
-
- stp @x[0],@x[2],[$out,#0] // store output
- add @d[6],@d[6],#1 // increment counter
- mov.32 @x[0],@d[0] // unpack key block
- lsr @x[1],@d[0],#32
- stp @x[4],@x[6],[$out,#16]
- mov.32 @x[2],@d[1]
- lsr @x[3],@d[1],#32
- stp @x[8],@x[10],[$out,#32]
- mov.32 @x[4],@d[2]
- lsr @x[5],@d[2],#32
- stp @x[12],@x[14],[$out,#48]
- add $out,$out,#64
- mov.32 @x[6],@d[3]
- lsr @x[7],@d[3],#32
- mov.32 @x[8],@d[4]
- lsr @x[9],@d[4],#32
- mov.32 @x[10],@d[5]
- lsr @x[11],@d[5],#32
- mov.32 @x[12],@d[6]
- lsr @x[13],@d[6],#32
- mov.32 @x[14],@d[7]
- lsr @x[15],@d[7],#32
-
- mov $ctr,#5
-.Loop_lower_neon:
- sub $ctr,$ctr,#1
-___
- @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
- @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
- @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
- @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
- @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
- @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
- @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
-
- foreach (@thread0) {
- eval; eval(shift(@thread67));
- eval(shift(@thread1)); eval(shift(@thread67));
- eval(shift(@thread2)); eval(shift(@thread67));
- eval(shift(@thread3)); eval(shift(@thread67));
- eval(shift(@thread4)); eval(shift(@thread67));
- eval(shift(@thread5)); eval(shift(@thread67));
- }
-
- @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
- @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
- @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
- @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
- @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
- @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
- @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
-
- foreach (@thread0) {
- eval; eval(shift(@thread67));
- eval(shift(@thread1)); eval(shift(@thread67));
- eval(shift(@thread2)); eval(shift(@thread67));
- eval(shift(@thread3)); eval(shift(@thread67));
- eval(shift(@thread4)); eval(shift(@thread67));
- eval(shift(@thread5)); eval(shift(@thread67));
- }
-$code.=<<___;
- cbnz $ctr,.Loop_lower_neon
-
- add.32 @x[0],@x[0],@d[0] // accumulate key block
- ldp @K[0],@K[1],[sp,#0]
- add @x[1],@x[1],@d[0],lsr#32
- ldp @K[2],@K[3],[sp,#32]
- add.32 @x[2],@x[2],@d[1]
- ldp @K[4],@K[5],[sp,#64]
- add @x[3],@x[3],@d[1],lsr#32
- add $A0,$A0,@K[0]
- add.32 @x[4],@x[4],@d[2]
- add $A1,$A1,@K[0]
- add @x[5],@x[5],@d[2],lsr#32
- add $A2,$A2,@K[0]
- add.32 @x[6],@x[6],@d[3]
- add $A3,$A3,@K[0]
- add @x[7],@x[7],@d[3],lsr#32
- add $A4,$A4,@K[0]
- add.32 @x[8],@x[8],@d[4]
- add $A5,$A5,@K[0]
- add @x[9],@x[9],@d[4],lsr#32
- add $C0,$C0,@K[2]
- add.32 @x[10],@x[10],@d[5]
- add $C1,$C1,@K[2]
- add @x[11],@x[11],@d[5],lsr#32
- add $C2,$C2,@K[2]
- add.32 @x[12],@x[12],@d[6]
- add $C3,$C3,@K[2]
- add @x[13],@x[13],@d[6],lsr#32
- add $C4,$C4,@K[2]
- add.32 @x[14],@x[14],@d[7]
- add $C5,$C5,@K[2]
- add @x[15],@x[15],@d[7],lsr#32
- add $D4,$D4,$ONE // +4
- add @x[0],@x[0],@x[1],lsl#32 // pack
- add $D5,$D5,$ONE // +4
- add @x[2],@x[2],@x[3],lsl#32
- add $D0,$D0,@K[3]
- ldp @x[1],@x[3],[$inp,#0] // load input
- add $D1,$D1,@K[4]
- add @x[4],@x[4],@x[5],lsl#32
- add $D2,$D2,@K[5]
- add @x[6],@x[6],@x[7],lsl#32
- add $D3,$D3,@K[6]
- ldp @x[5],@x[7],[$inp,#16]
- add $D4,$D4,@K[3]
- add @x[8],@x[8],@x[9],lsl#32
- add $D5,$D5,@K[4]
- add @x[10],@x[10],@x[11],lsl#32
- add $B0,$B0,@K[1]
- ldp @x[9],@x[11],[$inp,#32]
- add $B1,$B1,@K[1]
- add @x[12],@x[12],@x[13],lsl#32
- add $B2,$B2,@K[1]
- add @x[14],@x[14],@x[15],lsl#32
- add $B3,$B3,@K[1]
- ldp @x[13],@x[15],[$inp,#48]
- add $B4,$B4,@K[1]
- add $inp,$inp,#64
- add $B5,$B5,@K[1]
-
-# ifdef __AARCH64EB__
- rev @x[0],@x[0]
- rev @x[2],@x[2]
- rev @x[4],@x[4]
- rev @x[6],@x[6]
- rev @x[8],@x[8]
- rev @x[10],@x[10]
- rev @x[12],@x[12]
- rev @x[14],@x[14]
-# endif
- ld1.8 {$T0-$T3},[$inp],#64
- eor @x[0],@x[0],@x[1]
- eor @x[2],@x[2],@x[3]
- eor @x[4],@x[4],@x[5]
- eor @x[6],@x[6],@x[7]
- eor @x[8],@x[8],@x[9]
- eor $A0,$A0,$T0
- eor @x[10],@x[10],@x[11]
- eor $B0,$B0,$T1
- eor @x[12],@x[12],@x[13]
- eor $C0,$C0,$T2
- eor @x[14],@x[14],@x[15]
- eor $D0,$D0,$T3
- ld1.8 {$T0-$T3},[$inp],#64
-
- stp @x[0],@x[2],[$out,#0] // store output
- add @d[6],@d[6],#7 // increment counter
- stp @x[4],@x[6],[$out,#16]
- stp @x[8],@x[10],[$out,#32]
- stp @x[12],@x[14],[$out,#48]
- add $out,$out,#64
- st1.8 {$A0-$D0},[$out],#64
-
- ld1.8 {$A0-$D0},[$inp],#64
- eor $A1,$A1,$T0
- eor $B1,$B1,$T1
- eor $C1,$C1,$T2
- eor $D1,$D1,$T3
- st1.8 {$A1-$D1},[$out],#64
-
- ld1.8 {$A1-$D1},[$inp],#64
- eor $A2,$A2,$A0
- ldp @K[0],@K[1],[sp,#0]
- eor $B2,$B2,$B0
- ldp @K[2],@K[3],[sp,#32]
- eor $C2,$C2,$C0
- eor $D2,$D2,$D0
- st1.8 {$A2-$D2},[$out],#64
-
- ld1.8 {$A2-$D2},[$inp],#64
- eor $A3,$A3,$A1
- eor $B3,$B3,$B1
- eor $C3,$C3,$C1
- eor $D3,$D3,$D1
- st1.8 {$A3-$D3},[$out],#64
-
- ld1.8 {$A3-$D3},[$inp],#64
- eor $A4,$A4,$A2
- eor $B4,$B4,$B2
- eor $C4,$C4,$C2
- eor $D4,$D4,$D2
- st1.8 {$A4-$D4},[$out],#64
-
- shl $A0,$ONE,#1 // 4 -> 8
- eor $A5,$A5,$A3
- eor $B5,$B5,$B3
- eor $C5,$C5,$C3
- eor $D5,$D5,$D3
- st1.8 {$A5-$D5},[$out],#64
-
- add @K[3],@K[3],$A0 // += 8
- add @K[4],@K[4],$A0
- add @K[5],@K[5],$A0
- add @K[6],@K[6],$A0
-
- b.hs .Loop_outer_512_neon
-
- adds $len,$len,#512
- ushr $A0,$ONE,#2 // 4 -> 1
-
- ldp d8,d9,[sp,#128+0] // meet ABI requirements
- ldp d10,d11,[sp,#128+16]
- ldp d12,d13,[sp,#128+32]
- ldp d14,d15,[sp,#128+48]
-
- stp @K[0],$ONE,[sp,#0] // wipe off-load area
- stp @K[0],$ONE,[sp,#32]
- stp @K[0],$ONE,[sp,#64]
-
- b.eq .Ldone_512_neon
-
- cmp $len,#192
- sub @K[3],@K[3],$A0 // -= 1
- sub @K[4],@K[4],$A0
- sub @K[5],@K[5],$A0
- add sp,sp,#128
- b.hs .Loop_outer_neon
-
- eor @K[1],@K[1],@K[1]
- eor @K[2],@K[2],@K[2]
- eor @K[3],@K[3],@K[3]
- eor @K[4],@K[4],@K[4]
- eor @K[5],@K[5],@K[5]
- eor @K[6],@K[6],@K[6]
- b .Loop_outer
-
-.Ldone_512_neon:
- ldp x19,x20,[x29,#16]
- add sp,sp,#128+64
- ldp x21,x22,[x29,#32]
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldp x29,x30,[sp],#96
- ret
-.size ChaCha20_512_neon,.-ChaCha20_512_neon
-#endif
-#endif
-___
-}
-}}}
-
-# Echo this script's own leading comment block to the output, rewriting the
-# leading '#' of each line to '//'. The shebang line is skipped, blank lines
-# are passed through, and copying stops at the first line that is neither a
-# comment nor blank.
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/\/\// and !/^$/);
- print;
-}
-close SELF;
-
-# Print the accumulated $code one line at a time, expanding the
-# pseudo-syntax used while it was being built.
-foreach (split("\n",$code)) {
- # Replace every `...` span with the result of evaluating it as Perl.
- s/\`([^\`]*)\`/eval $1/geo;
-
- # Only the first alternative whose leading pattern matches is applied;
- # the "or 1" keeps an alternative true even if its inner substitution
- # finds nothing to change.
- #   <op>.32         -> <op>, and xN operands are renamed to wN
- #   eor / ext / mov -> .4s is rewritten to .16b
- #   ld1.8 / st1.8   -> ld1 / st1, and .4s is rewritten to .16b
- #   ldr/ldp/str/stp -> vN.4s operands are renamed to qN
- #   rev32.16        -> rev32, and .4s is rewritten to .8h
- (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or
- (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or
- (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or
- (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or
- (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));
-
- print $_,"\n";
-}
-close STDOUT; # flush
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips-glue.c b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips-glue.c
deleted file mode 100644
index 96ce01e2c133..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips-glue.c
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-asmlinkage void chacha20_mips(u32 state[16], u8 *out, const u8 *in,
- const size_t len);
-static bool *const chacha20_nobs[] __initconst = { };
-/*
- * Init hook for the MIPS glue. The body is empty: this file assigns no
- * feature flags (the chacha20_nobs table above has no entries either).
- */
-static void __init chacha20_fpu_init(void)
-{
-}
-
-/*
- * MIPS backend for ChaCha20: passes ctx->state together with dst, src and
- * len straight to the chacha20_mips() assembly routine and always returns
- * true. simd_context is accepted but not used here.
- */
-static inline bool chacha20_arch(struct chacha20_ctx *ctx, u8 *dst,
- const u8 *src, size_t len,
- simd_context_t *simd_context)
-{
- chacha20_mips(ctx->state, dst, src, len);
- return true;
-}
-
-/*
- * HChaCha20 hook for the MIPS glue: always returns false and leaves
- * derived_key untouched; none of the arguments are read.
- * NOTE(review): presumably a false return makes the caller use a generic
- * implementation -- confirm against the call site.
- */
-static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS],
- const u8 nonce[HCHACHA20_NONCE_SIZE],
- const u8 key[HCHACHA20_KEY_SIZE],
- simd_context_t *simd_context)
-{
- return false;
-}
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips.S b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips.S
deleted file mode 100644
index a81e02db95e7..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-mips.S
+++ /dev/null
@@ -1,424 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#define MASK_U32 0x3c
-#define CHACHA20_BLOCK_SIZE 64
-#define STACK_SIZE 32
-
-#define X0 $t0
-#define X1 $t1
-#define X2 $t2
-#define X3 $t3
-#define X4 $t4
-#define X5 $t5
-#define X6 $t6
-#define X7 $t7
-#define X8 $t8
-#define X9 $t9
-#define X10 $v1
-#define X11 $s6
-#define X12 $s5
-#define X13 $s4
-#define X14 $s3
-#define X15 $s2
-/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
-#define T0 $s1
-#define T1 $s0
-#define T(n) T ## n
-#define X(n) X ## n
-
-/* Input arguments */
-#define STATE $a0
-#define OUT $a1
-#define IN $a2
-#define BYTES $a3
-
-/* Output argument */
-/* NONCE[0] is kept in a register and not in memory.
- * We don't want to touch original value in memory.
- * Must be incremented every loop iteration.
- */
-#define NONCE_0 $v0
-
-/* SAVED_X and SAVED_CA are set in the jump table.
- * Use regs which are overwritten on exit so we don't leak clear data.
- * They are used for handling the last bytes, which are not a multiple of 4.
- */
-#define SAVED_X X15
-#define SAVED_CA $s7
-
-#define IS_UNALIGNED $s7
-
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-#define MSB 0
-#define LSB 3
-#define ROTx rotl
-#define ROTR(n) rotr n, 24
-#define CPU_TO_LE32(n) \
- wsbh n; \
- rotr n, 16;
-#else
-#define MSB 3
-#define LSB 0
-#define ROTx rotr
-#define CPU_TO_LE32(n)
-#define ROTR(n)
-#endif
-
-#define FOR_EACH_WORD(x) \
- x( 0); \
- x( 1); \
- x( 2); \
- x( 3); \
- x( 4); \
- x( 5); \
- x( 6); \
- x( 7); \
- x( 8); \
- x( 9); \
- x(10); \
- x(11); \
- x(12); \
- x(13); \
- x(14); \
- x(15);
-
-#define FOR_EACH_WORD_REV(x) \
- x(15); \
- x(14); \
- x(13); \
- x(12); \
- x(11); \
- x(10); \
- x( 9); \
- x( 8); \
- x( 7); \
- x( 6); \
- x( 5); \
- x( 4); \
- x( 3); \
- x( 2); \
- x( 1); \
- x( 0);
-
-#define PLUS_ONE_0 1
-#define PLUS_ONE_1 2
-#define PLUS_ONE_2 3
-#define PLUS_ONE_3 4
-#define PLUS_ONE_4 5
-#define PLUS_ONE_5 6
-#define PLUS_ONE_6 7
-#define PLUS_ONE_7 8
-#define PLUS_ONE_8 9
-#define PLUS_ONE_9 10
-#define PLUS_ONE_10 11
-#define PLUS_ONE_11 12
-#define PLUS_ONE_12 13
-#define PLUS_ONE_13 14
-#define PLUS_ONE_14 15
-#define PLUS_ONE_15 16
-#define PLUS_ONE(x) PLUS_ONE_ ## x
-#define _CONCAT3(a,b,c) a ## b ## c
-#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
-
-#define STORE_UNALIGNED(x) \
-CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
- .if (x != 12); \
- lw T0, (x*4)(STATE); \
- .endif; \
- lwl T1, (x*4)+MSB ## (IN); \
- lwr T1, (x*4)+LSB ## (IN); \
- .if (x == 12); \
- addu X ## x, NONCE_0; \
- .else; \
- addu X ## x, T0; \
- .endif; \
- CPU_TO_LE32(X ## x); \
- xor X ## x, T1; \
- swl X ## x, (x*4)+MSB ## (OUT); \
- swr X ## x, (x*4)+LSB ## (OUT);
-
-#define STORE_ALIGNED(x) \
-CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
- .if (x != 12); \
- lw T0, (x*4)(STATE); \
- .endif; \
- lw T1, (x*4) ## (IN); \
- .if (x == 12); \
- addu X ## x, NONCE_0; \
- .else; \
- addu X ## x, T0; \
- .endif; \
- CPU_TO_LE32(X ## x); \
- xor X ## x, T1; \
- sw X ## x, (x*4) ## (OUT);
-
-/* Jump table macro.
- * Used for setup and handling the last bytes, which are not multiple of 4.
- * X15 is free to store Xn
- * Every jumptable entry must be equal in size.
- */
-#define JMPTBL_ALIGNED(x) \
-.Lchacha20_mips_jmptbl_aligned_ ## x: ; \
- .set noreorder; \
- b .Lchacha20_mips_xor_aligned_ ## x ## _b; \
- .if (x == 12); \
- addu SAVED_X, X ## x, NONCE_0; \
- .else; \
- addu SAVED_X, X ## x, SAVED_CA; \
- .endif; \
- .set reorder
-
-#define JMPTBL_UNALIGNED(x) \
-.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \
- .set noreorder; \
- b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \
- .if (x == 12); \
- addu SAVED_X, X ## x, NONCE_0; \
- .else; \
- addu SAVED_X, X ## x, SAVED_CA; \
- .endif; \
- .set reorder
-
-#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
- addu X(A), X(K); \
- addu X(B), X(L); \
- addu X(C), X(M); \
- addu X(D), X(N); \
- xor X(V), X(A); \
- xor X(W), X(B); \
- xor X(Y), X(C); \
- xor X(Z), X(D); \
- rotl X(V), S; \
- rotl X(W), S; \
- rotl X(Y), S; \
- rotl X(Z), S;
-
-/*
- * chacha20_mips
- *
- * Arguments arrive in STATE ($a0), OUT ($a1), IN ($a2) and BYTES ($a3), as
- * named by the defines above.
- *
- * The word at byte offset 48 of STATE is loaded into NONCE_0 on entry, is
- * used in place of state word 12 for every block, is incremented per block,
- * and is stored back to 48(STATE) before returning. STATE is otherwise only
- * read. $s0-$s7 are saved in a STACK_SIZE-byte stack frame and restored on
- * exit. $at is used explicitly (under .set noat) as the remaining-round
- * counter and, in the partial-block paths, as a byte offset.
- *
- * Each block is produced by loading the 16 state words into X0..X15,
- * running the AXR sequence until $at (starting at 20, decremented by 2 per
- * pass) reaches zero, then adding the state back, XORing with IN and
- * storing to OUT. Separate store paths exist for word-aligned and unaligned
- * IN/OUT, and a jump table handles a final block shorter than 64 bytes.
- */
-.text
-.set reorder
-.set noat
-.globl chacha20_mips
-.ent chacha20_mips
-chacha20_mips:
- .frame $sp, STACK_SIZE, $ra
-
- addiu $sp, -STACK_SIZE
-
- /* Nothing to do when BYTES == 0: skip straight to the epilogue. */
- beqz BYTES, .Lchacha20_mips_end
-
- lw NONCE_0, 48(STATE)
-
- /* Save s0-s7 */
- sw $s0, 0($sp)
- sw $s1, 4($sp)
- sw $s2, 8($sp)
- sw $s3, 12($sp)
- sw $s4, 16($sp)
- sw $s5, 20($sp)
- sw $s6, 24($sp)
- sw $s7, 28($sp)
-
- /* Test IN or OUT is unaligned.
- * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
- */
- or IS_UNALIGNED, IN, OUT
- andi IS_UNALIGNED, 0x3
-
- /* Set number of rounds */
- li $at, 20
-
- b .Lchacha20_rounds_start
-
-.align 4
-.Loop_chacha20_rounds:
- /* Advance to the next 64-byte block. */
- addiu IN, CHACHA20_BLOCK_SIZE
- addiu OUT, CHACHA20_BLOCK_SIZE
- addiu NONCE_0, 1
-
-.Lchacha20_rounds_start:
- /* Load the working copy of the state; word 12 comes from NONCE_0. */
- lw X0, 0(STATE)
- lw X1, 4(STATE)
- lw X2, 8(STATE)
- lw X3, 12(STATE)
-
- lw X4, 16(STATE)
- lw X5, 20(STATE)
- lw X6, 24(STATE)
- lw X7, 28(STATE)
- lw X8, 32(STATE)
- lw X9, 36(STATE)
- lw X10, 40(STATE)
- lw X11, 44(STATE)
-
- move X12, NONCE_0
- lw X13, 52(STATE)
- lw X14, 56(STATE)
- lw X15, 60(STATE)
-
-.Loop_chacha20_xor_rounds:
- /* One pass = 8 AXR steps; $at counts down by 2 per pass from 20. */
- addiu $at, -2
- AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
- AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
- AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
- AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
- AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
- AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
- AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
- AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
- bnez $at, .Loop_chacha20_xor_rounds
-
- /* BYTES now holds (remaining bytes - 64); negative means a short block. */
- addiu BYTES, -(CHACHA20_BLOCK_SIZE)
-
- /* Is data src/dst unaligned? Jump */
- bnez IS_UNALIGNED, .Loop_chacha20_unaligned
-
- /* Set number rounds here to fill delayslot. */
- li $at, 20
-
- /* BYTES < 0, it has no full block. */
- bltz BYTES, .Lchacha20_mips_no_full_block_aligned
-
- FOR_EACH_WORD_REV(STORE_ALIGNED)
-
- /* BYTES > 0? Loop again. */
- bgtz BYTES, .Loop_chacha20_rounds
-
- /* Place this here to fill delay slot */
- addiu NONCE_0, 1
-
- /* BYTES < 0? Handle last bytes */
- bltz BYTES, .Lchacha20_mips_xor_bytes
-
-.Lchacha20_mips_xor_done:
- /* Restore used registers */
- lw $s0, 0($sp)
- lw $s1, 4($sp)
- lw $s2, 8($sp)
- lw $s3, 12($sp)
- lw $s4, 16($sp)
- lw $s5, 20($sp)
- lw $s6, 24($sp)
- lw $s7, 28($sp)
-
- /* Write NONCE_0 back to right location in state */
- sw NONCE_0, 48(STATE)
-
-.Lchacha20_mips_end:
- addiu $sp, STACK_SIZE
- jr $ra
-
-.Lchacha20_mips_no_full_block_aligned:
- /* Restore the offset on BYTES */
- addiu BYTES, CHACHA20_BLOCK_SIZE
-
- /* Get number of full WORDS */
- andi $at, BYTES, MASK_U32
-
- /* Load upper half of jump table addr */
- lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
-
- /* Calculate lower half jump table offset.
- * $at is 4 * (number of full words); inserting it at bit 1 doubles it,
- * matching the two instructions emitted per jump table entry.
- */
- ins T0, $at, 1, 6
-
- /* Add offset to STATE */
- addu T1, STATE, $at
-
- /* Add lower half jump table addr */
- addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
-
- /* Read value from STATE */
- lw SAVED_CA, 0(T1)
-
- /* Store remaining bytecounter as negative value */
- subu BYTES, $at, BYTES
-
- jr T0
-
- /* Jump table */
- FOR_EACH_WORD(JMPTBL_ALIGNED)
-
-
-.Loop_chacha20_unaligned:
- /* Set number rounds here to fill delayslot. */
- li $at, 20
-
- /* BYTES < 0, it has no full block. */
- bltz BYTES, .Lchacha20_mips_no_full_block_unaligned
-
- FOR_EACH_WORD_REV(STORE_UNALIGNED)
-
- /* BYTES > 0? Loop again. */
- bgtz BYTES, .Loop_chacha20_rounds
-
- /* Write NONCE_0 back to right location in state */
- sw NONCE_0, 48(STATE)
-
- .set noreorder
- /* Fall through to byte handling */
- bgez BYTES, .Lchacha20_mips_xor_done
-.Lchacha20_mips_xor_unaligned_0_b:
-.Lchacha20_mips_xor_aligned_0_b:
- /* Place this here to fill delay slot */
- addiu NONCE_0, 1
- .set reorder
-
-.Lchacha20_mips_xor_bytes:
- /* $at is the byte offset of the trailing partial word; BYTES is the
- * negated count of trailing bytes, so BYTES + k == 0 after k bytes.
- */
- addu IN, $at
- addu OUT, $at
- /* First byte */
- lbu T1, 0(IN)
- addiu $at, BYTES, 1
- CPU_TO_LE32(SAVED_X)
- ROTR(SAVED_X)
- xor T1, SAVED_X
- sb T1, 0(OUT)
- beqz $at, .Lchacha20_mips_xor_done
- /* Second byte */
- lbu T1, 1(IN)
- addiu $at, BYTES, 2
- ROTx SAVED_X, 8
- xor T1, SAVED_X
- sb T1, 1(OUT)
- beqz $at, .Lchacha20_mips_xor_done
- /* Third byte */
- lbu T1, 2(IN)
- ROTx SAVED_X, 8
- xor T1, SAVED_X
- sb T1, 2(OUT)
- b .Lchacha20_mips_xor_done
-
-.Lchacha20_mips_no_full_block_unaligned:
- /* Restore the offset on BYTES */
- addiu BYTES, CHACHA20_BLOCK_SIZE
-
- /* Get number of full WORDS */
- andi $at, BYTES, MASK_U32
-
- /* Load upper half of jump table addr */
- lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
-
- /* Calculate lower half jump table offset.
- * $at is 4 * (number of full words); inserting it at bit 1 doubles it,
- * matching the two instructions emitted per jump table entry.
- */
- ins T0, $at, 1, 6
-
- /* Add offset to STATE */
- addu T1, STATE, $at
-
- /* Add lower half jump table addr */
- addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
-
- /* Read value from STATE */
- lw SAVED_CA, 0(T1)
-
- /* Store remaining bytecounter as negative value */
- subu BYTES, $at, BYTES
-
- jr T0
-
- /* Jump table */
- FOR_EACH_WORD(JMPTBL_UNALIGNED)
-.end chacha20_mips
-.set at
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64-glue.c b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64-glue.c
deleted file mode 100644
index 1bccec70845c..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64-glue.c
+++ /dev/null
@@ -1,132 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-#ifdef __linux__
-#include <asm/fpu/api.h>
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-#include <asm/intel-family.h>
-#else
-#include <sys/simd-x86_64.h>
-#endif
-
-asmlinkage void hchacha20_ssse3(u32 *derived_key, const u8 *nonce,
- const u8 *key);
-asmlinkage void chacha20_ssse3(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-asmlinkage void chacha20_avx2(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-asmlinkage void chacha20_avx512(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-asmlinkage void chacha20_avx512vl(u8 *out, const u8 *in, const size_t len,
- const u32 key[8], const u32 counter[4]);
-
-static bool chacha20_use_ssse3 __ro_after_init;
-static bool chacha20_use_avx2 __ro_after_init;
-static bool chacha20_use_avx512 __ro_after_init;
-static bool chacha20_use_avx512vl __ro_after_init;
-static bool *const chacha20_nobs[] __initconst = {
- &chacha20_use_ssse3, &chacha20_use_avx2, &chacha20_use_avx512,
- &chacha20_use_avx512vl };
-
-/*
- * Set the chacha20_use_{ssse3,avx2,avx512,avx512vl} flags from CPU feature
- * bits, then print the four flags when bootverbose is set.
- *
- * Two detection paths are compiled alternatively:
- *  - __linux__: boot_cpu_has()/cpu_has_xfeatures(); the AVX-512 flags are
- *    only assigned when COMPAT_CANNOT_USE_AVX512 is not defined, and
- *    chacha20_use_avx512 is additionally refused on INTEL_FAM6_SKYLAKE_X.
- *  - otherwise: cpu_feature2/cpu_stdext_feature bit tests combined with
- *    __ymm_enabled()/__zmm_enabled(). Each flag builds on the previous one
- *    (avx512 requires avx2, avx512vl requires avx512). This path has no
- *    CPU-model check corresponding to the Skylake-X exclusion above.
- */
-static void __init chacha20_fpu_init(void)
-{
-#ifdef __linux__
- chacha20_use_ssse3 = boot_cpu_has(X86_FEATURE_SSSE3);
- chacha20_use_avx2 =
- boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
-#ifndef COMPAT_CANNOT_USE_AVX512
- chacha20_use_avx512 =
- boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_AVX512F) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
- XFEATURE_MASK_AVX512, NULL) &&
- /* Skylake downclocks unacceptably much when using zmm. */
- boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X;
- chacha20_use_avx512vl =
- boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_AVX512F) &&
- boot_cpu_has(X86_FEATURE_AVX512VL) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
- XFEATURE_MASK_AVX512, NULL);
-#endif
-#else
- chacha20_use_ssse3 = !!(cpu_feature2 & CPUID2_SSSE3);
- chacha20_use_avx2 = !!(cpu_feature2 & CPUID2_AVX) &&
- !!(cpu_stdext_feature & CPUID_STDEXT_AVX2) &&
- __ymm_enabled();
- chacha20_use_avx512 = chacha20_use_avx2 &&
- !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) &&
- __zmm_enabled();
- /* The AVX512F test below repeats one already folded into chacha20_use_avx512. */
- chacha20_use_avx512vl = chacha20_use_avx512 &&
- !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) &&
- !!(cpu_stdext_feature & CPUID_STDEXT_AVX512VL);
-#endif
- if (bootverbose)
- printf("ssse3: %d avx2: %d avx512: %d avx512vl: %d\n",
- chacha20_use_ssse3,
- chacha20_use_avx2,
- chacha20_use_avx512,
- chacha20_use_avx512vl);
-}
-
-/*
- * x86_64 SIMD backend for ChaCha20.
- *
- * Returns false without touching dst when SSSE3 is unavailable, when len is
- * at most one block, or when simd_use() refuses the SIMD context (checked
- * in that order). Otherwise the input is processed in chunks of at most
- * PAGE_SIZE bytes, ctx->counter[0] is advanced by the number of 64-byte
- * blocks covered by each chunk (rounded up), and true is returned.
- */
-static inline bool chacha20_arch(struct chacha20_ctx *ctx, u8 *dst,
-				 const u8 *src, size_t len,
-				 simd_context_t *simd_context)
-{
-	/* Chunks are page sized, so a page must hold a whole number of blocks. */
-	BUILD_BUG_ON(PAGE_SIZE < CHACHA20_BLOCK_SIZE ||
-		     PAGE_SIZE % CHACHA20_BLOCK_SIZE);
-
-	if (!chacha20_use_ssse3 || len <= CHACHA20_BLOCK_SIZE ||
-	    !simd_use(simd_context))
-		return false;
-
-	do {
-		const size_t chunk = min_t(size_t, len, PAGE_SIZE);
-		/* Path selection looks at the total still pending, not the chunk. */
-		const bool at_least_8 = len >= CHACHA20_BLOCK_SIZE * 8;
-		const bool at_least_4 = len >= CHACHA20_BLOCK_SIZE * 4;
-
-		if (chacha20_use_avx512 && at_least_8) {
-			chacha20_avx512(dst, src, chunk, ctx->key, ctx->counter);
-		} else if (chacha20_use_avx512vl && at_least_4) {
-			chacha20_avx512vl(dst, src, chunk, ctx->key, ctx->counter);
-		} else if (chacha20_use_avx2 && at_least_4) {
-			chacha20_avx2(dst, src, chunk, ctx->key, ctx->counter);
-		} else {
-			chacha20_ssse3(dst, src, chunk, ctx->key, ctx->counter);
-		}
-
-		ctx->counter[0] += (chunk + 63) / 64;
-		len -= chunk;
-		if (len) {
-			dst += chunk;
-			src += chunk;
-			/* SIMD disables preemption; yield between pages. */
-			simd_relax(simd_context);
-		}
-	} while (len);
-
-	return true;
-}
-
-/*
- * x86_64 SIMD backend for HChaCha20.
- *
- * Runs hchacha20_ssse3() and returns true when CONFIG_AS_SSSE3 is enabled,
- * SSSE3 was detected, and simd_use() grants the SIMD context (tested in
- * that order). Returns false, leaving derived_key untouched, otherwise.
- */
-static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS],
-				  const u8 nonce[HCHACHA20_NONCE_SIZE],
-				  const u8 key[HCHACHA20_KEY_SIZE],
-				  simd_context_t *simd_context)
-{
-	if (!IS_ENABLED(CONFIG_AS_SSSE3) || !chacha20_use_ssse3 ||
-	    !simd_use(simd_context))
-		return false;
-
-	hchacha20_ssse3(derived_key, nonce, key);
-	return true;
-}
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64.pl b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64.pl
deleted file mode 100755
index 29906a66b8b7..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-x86_64.pl
+++ /dev/null
@@ -1,4106 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-#
-# Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
-# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
-# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
-#
-# This code is taken from the OpenSSL project but the author, Andy Polyakov,
-# has relicensed it under the licenses specified in the SPDX header above.
-# The original headers, including the original license headers, are
-# included below for completeness.
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# November 2014
-#
-# ChaCha20 for x86_64.
-#
-# December 2016
-#
-# Add AVX512F code path.
-#
-# December 2017
-#
-# Add AVX512VL code path.
-#
-# Performance in cycles per byte out of large buffer.
-#
-# IALU/gcc 4.8(i) 1x/2xSSSE3(ii) 4xSSSE3 NxAVX(v)
-#
-# P4 9.48/+99% - -
-# Core2 7.83/+55% 7.90/5.76 4.35
-# Westmere 7.19/+50% 5.60/4.50 3.00
-# Sandy Bridge 8.31/+42% 5.45/4.00 2.72
-# Ivy Bridge 6.71/+46% 5.40/? 2.41
-# Haswell 5.92/+43% 5.20/3.45 2.42 1.23
-# Skylake[-X] 5.87/+39% 4.70/3.22 2.31 1.19[0.80(vi)]
-# Silvermont 12.0/+33% 7.75/6.90 7.03(iii)
-# Knights L 11.7/- ? 9.60(iii) 0.80
-# Goldmont 10.6/+17% 5.10/3.52 3.28
-# Sledgehammer 7.28/+52% - -
-# Bulldozer 9.66/+28% 9.85/5.35(iv) 3.06(iv)
-# Ryzen 5.96/+50% 5.19/3.00 2.40 2.09
-# VIA Nano 10.5/+46% 6.72/6.88 6.05
-#
-# (i) compared to older gcc 3.x one can observe >2x improvement on
-# most platforms;
-# (ii) 2xSSSE3 is code path optimized specifically for 128 bytes used
-# by chacha20_poly1305_tls_cipher, results are EVP-free;
-# (iii) this is not optimal result for Atom because of MSROM
-# limitations, SSE2 can do better, but gain is considered too
-# low to justify the [maintenance] effort;
-# (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20
-# and 4.85 for 128-byte inputs;
-# (v) 8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable;
-# (vi) even though Skylake-X can execute AVX512F code and deliver 0.57
-# cpb in single thread, the corresponding capability is suppressed;
-
-# Command line: [flavour] [output]. If the first argument contains a '.',
-# it is taken as the output name and the flavour is left undefined.
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
-# $win64:  flavour mentions nasm/masm/mingw64, or the output ends in ".asm".
-# $kernel: neither a flavour nor an output was given.
-$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-$kernel=0; $kernel=1 if (!$flavour && !$output);
-
-if (!$kernel) {
- # Locate x86_64-xlate.pl next to this script or under ../../perlasm,
- # then pipe everything written to STDOUT through it.
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
- die "can't locate x86_64-xlate.pl";
-
- open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
- *STDOUT=*OUT;
-
- # Toolchain probes. $avx becomes the number of version thresholds the
- # detected assembler meets; each later probe runs only while $avx is
- # still false (unset or 0). The first probe asks $ENV{CC} for the GNU
- # assembler version.
- if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
- =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
- $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
- }
-
- # nasm (win64 only); version 2.11 with patch level >= 8 counts one extra.
- if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
- `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
- $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
- $avx += 1 if ($1==2.11 && $2>=8);
- }
-
- # ml64 (win64 only).
- if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
- `ml64 2>&1` =~ /Version ([0-9]+)\./) {
- $avx = ($1>=10) + ($1>=11);
- }
-
- # clang / LLVM-based compiler reported by $ENV{CC} -v.
- if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
- $avx = ($2>=3.0) + ($2>3.0);
- }
-} else {
- $avx = 4; # The kernel uses ifdefs for this.
-}
-
-# input parameter block
-($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
-
-$code.=<<___ if $kernel;
-#include <linux/linkage.h>
-___
-
-# Append a labelled data definition ".L<name>: .<type> <payload>" to $code.
-# In kernel mode the definition is preceded by its own mergeable .rodata
-# section directive and an .align, both derived from $size.
-sub declare_variable() {
-	my ($label, $bytes, $directive, $values) = @_;
-
-	$code .= ".section .rodata.cst$bytes.L$label, \"aM\", \@progbits, $bytes\n"
-	       . ".align $bytes\n" if $kernel;
-	$code .= ".L$label:\n";
-	$code .= ".$directive $values\n";
-}
-
-# Append a function prologue for $name to $code.
-# Kernel mode:  .align, SYM_FUNC_START(name), then a local ".L<name>:" label.
-# Otherwise:    .globl, .type (with the argument count), .align, "<name>:".
-sub declare_function() {
-	my ($sym, $alignment, $argc) = @_;
-
-	my @lines = $kernel
-		? (".align $alignment", "SYM_FUNC_START($sym)", ".L$sym:")
-		: (".globl $sym", ".type $sym,\@function,$argc", ".align $alignment", "$sym:");
-
-	$code .= join('', map { "$_\n" } @lines);
-}
-
-# Append the function epilogue marker for $name to $code:
-# SYM_FUNC_END(name) in kernel mode, a .size directive otherwise.
-sub end_function() {
-	my $sym = shift;
-
-	$code .= $kernel ? "SYM_FUNC_END($sym)\n" : ".size $sym,.-$sym\n";
-}
-
-if(!$kernel) {
- $code .= ".text\n";
-}
-&declare_variable('zero', 16, 'long', '0,0,0,0');
-&declare_variable('one', 16, 'long', '1,0,0,0');
-&declare_variable('inc', 16, 'long', '0,1,2,3');
-&declare_variable('four', 16, 'long', '4,4,4,4');
-&declare_variable('incy', 32, 'long', '0,2,4,6,1,3,5,7');
-&declare_variable('eight', 32, 'long', '8,8,8,8,8,8,8,8');
-&declare_variable('rot16', 16, 'byte', '0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd');
-&declare_variable('rot24', 16, 'byte', '0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe');
-&declare_variable('twoy', 32, 'long', '2,0,0,0, 2,0,0,0');
-&declare_variable('zeroz', 64, 'long', '0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0');
-&declare_variable('fourz', 64, 'long', '4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0');
-&declare_variable('incz', 64, 'long', '0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15');
-&declare_variable('sixteen', 64, 'long', '16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16');
-&declare_variable('sigma', 16, 'ascii', '"expand 32-byte k"');
-
-$code.=<<___ if !$kernel;
-.asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
-___
-$code.=".text\n";
-
-# Catch-all for calls to subs that are not defined, e.g. &add(...) or
-# &rol(...): the called name becomes the instruction mnemonic and the call's
-# arguments become its operands, appended to $code as one line.
-sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; # strip the package qualifier
- my $arg = pop; # last argument of the call
- $arg = "\$$arg" if ($arg*1 eq $arg); # purely numeric: prefix with '$'
- # Operands are emitted last-argument-first.
- $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
-}
-
-@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
- "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
-@t=("%esi","%edi");
-
-sub ROUND { # critical path is 24 cycles per round
-my ($a0,$b0,$c0,$d0)=@_;
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my ($xc,$xc_)=map("\"$_\"",@t);
-my @x=map("\"$_\"",@x);
-
- # Consider order in which variables are addressed by their
- # index:
- #
- # a b c d
- #
- # 0 4 8 12 < even round
- # 1 5 9 13
- # 2 6 10 14
- # 3 7 11 15
- # 0 5 10 15 < odd round
- # 1 6 11 12
- # 2 7 8 13
- # 3 4 9 14
- #
- # 'a', 'b' and 'd's are permanently allocated in registers,
- # @x[0..7,12..15], while 'c's are maintained in memory. If
- # you observe 'c' column, you'll notice that pair of 'c's is
- # invariant between rounds. This means that we have to reload
- # them once per round, in the middle. This is why you'll see
- # bunch of 'c' stores and loads in the middle, but none in
- # the beginning or end.
-
- # Normally instructions would be interleaved to favour in-order
- # execution. Generally out-of-order cores manage it gracefully,
- # but not this time for some reason. As in-order execution
- # cores are dying breed, old Atom is the only one around,
- # instructions are left uninterleaved. Besides, Atom is better
- # off executing 1xSSSE3 code anyway...
-
- (
- "&add (@x[$a0],@x[$b0])", # Q1
- "&xor (@x[$d0],@x[$a0])",
- "&rol (@x[$d0],16)",
- "&add (@x[$a1],@x[$b1])", # Q2
- "&xor (@x[$d1],@x[$a1])",
- "&rol (@x[$d1],16)",
-
- "&add ($xc,@x[$d0])",
- "&xor (@x[$b0],$xc)",
- "&rol (@x[$b0],12)",
- "&add ($xc_,@x[$d1])",
- "&xor (@x[$b1],$xc_)",
- "&rol (@x[$b1],12)",
-
- "&add (@x[$a0],@x[$b0])",
- "&xor (@x[$d0],@x[$a0])",
- "&rol (@x[$d0],8)",
- "&add (@x[$a1],@x[$b1])",
- "&xor (@x[$d1],@x[$a1])",
- "&rol (@x[$d1],8)",
-
- "&add ($xc,@x[$d0])",
- "&xor (@x[$b0],$xc)",
- "&rol (@x[$b0],7)",
- "&add ($xc_,@x[$d1])",
- "&xor (@x[$b1],$xc_)",
- "&rol (@x[$b1],7)",
-
- "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
- "&mov (\"4*$c1(%rsp)\",$xc_)",
- "&mov ($xc,\"4*$c2(%rsp)\")",
- "&mov ($xc_,\"4*$c3(%rsp)\")",
-
- "&add (@x[$a2],@x[$b2])", # Q3
- "&xor (@x[$d2],@x[$a2])",
- "&rol (@x[$d2],16)",
- "&add (@x[$a3],@x[$b3])", # Q4
- "&xor (@x[$d3],@x[$a3])",
- "&rol (@x[$d3],16)",
-
- "&add ($xc,@x[$d2])",
- "&xor (@x[$b2],$xc)",
- "&rol (@x[$b2],12)",
- "&add ($xc_,@x[$d3])",
- "&xor (@x[$b3],$xc_)",
- "&rol (@x[$b3],12)",
-
- "&add (@x[$a2],@x[$b2])",
- "&xor (@x[$d2],@x[$a2])",
- "&rol (@x[$d2],8)",
- "&add (@x[$a3],@x[$b3])",
- "&xor (@x[$d3],@x[$a3])",
- "&rol (@x[$d3],8)",
-
- "&add ($xc,@x[$d2])",
- "&xor (@x[$b2],$xc)",
- "&rol (@x[$b2],7)",
- "&add ($xc_,@x[$d3])",
- "&xor (@x[$b3],$xc_)",
- "&rol (@x[$b3],7)"
- );
-}
-
-########################################################################
-# Generic code path that handles all lengths on pre-SSSE3 processors.
-if(!$kernel) {
-&declare_function("chacha20_ctr32", 64, 5);
-$code.=<<___;
-.cfi_startproc
- cmp \$0,$len
- je .Lno_data
- mov OPENSSL_ia32cap_P+4(%rip),%r9
-___
-$code.=<<___ if ($avx>2);
- bt \$48,%r9 # check for AVX512F
- jc .Lchacha20_avx512
- test %r9,%r9 # check for AVX512VL
- js .Lchacha20_avx512vl
-___
-$code.=<<___;
- test \$`1<<(41-32)`,%r9d
- jnz .Lchacha20_ssse3
-___
-$code.=<<___;
- push %rbx
-.cfi_push %rbx
- push %rbp
-.cfi_push %rbp
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
- sub \$64+24,%rsp
-.cfi_adjust_cfa_offset 64+24
-.Lctr32_body:
-
- #movdqa .Lsigma(%rip),%xmm0
- movdqu ($key),%xmm1
- movdqu 16($key),%xmm2
- movdqu ($counter),%xmm3
- movdqa .Lone(%rip),%xmm4
-
- #movdqa %xmm0,4*0(%rsp) # key[0]
- movdqa %xmm1,4*4(%rsp) # key[1]
- movdqa %xmm2,4*8(%rsp) # key[2]
- movdqa %xmm3,4*12(%rsp) # key[3]
- mov $len,%rbp # reassign $len
- jmp .Loop_outer
-
-.align 32
-.Loop_outer:
- mov \$0x61707865,@x[0] # 'expa'
- mov \$0x3320646e,@x[1] # 'nd 3'
- mov \$0x79622d32,@x[2] # '2-by'
- mov \$0x6b206574,@x[3] # 'te k'
- mov 4*4(%rsp),@x[4]
- mov 4*5(%rsp),@x[5]
- mov 4*6(%rsp),@x[6]
- mov 4*7(%rsp),@x[7]
- movd %xmm3,@x[12]
- mov 4*13(%rsp),@x[13]
- mov 4*14(%rsp),@x[14]
- mov 4*15(%rsp),@x[15]
-
- mov %rbp,64+0(%rsp) # save len
- mov \$10,%ebp
- mov $inp,64+8(%rsp) # save inp
- movq %xmm2,%rsi # "@x[8]"
- mov $out,64+16(%rsp) # save out
- mov %rsi,%rdi
- shr \$32,%rdi # "@x[9]"
- jmp .Loop
-
-.align 32
-.Loop:
-___
- foreach (&ROUND (0, 4, 8,12)) { eval; }
- foreach (&ROUND (0, 5,10,15)) { eval; }
- &dec ("%ebp");
- &jnz (".Loop");
-
-$code.=<<___;
- mov @t[1],4*9(%rsp) # modulo-scheduled
- mov @t[0],4*8(%rsp)
- mov 64(%rsp),%rbp # load len
- movdqa %xmm2,%xmm1
- mov 64+8(%rsp),$inp # load inp
- paddd %xmm4,%xmm3 # increment counter
- mov 64+16(%rsp),$out # load out
-
- add \$0x61707865,@x[0] # 'expa'
- add \$0x3320646e,@x[1] # 'nd 3'
- add \$0x79622d32,@x[2] # '2-by'
- add \$0x6b206574,@x[3] # 'te k'
- add 4*4(%rsp),@x[4]
- add 4*5(%rsp),@x[5]
- add 4*6(%rsp),@x[6]
- add 4*7(%rsp),@x[7]
- add 4*12(%rsp),@x[12]
- add 4*13(%rsp),@x[13]
- add 4*14(%rsp),@x[14]
- add 4*15(%rsp),@x[15]
- paddd 4*8(%rsp),%xmm1
-
- cmp \$64,%rbp
- jb .Ltail
-
- xor 4*0($inp),@x[0] # xor with input
- xor 4*1($inp),@x[1]
- xor 4*2($inp),@x[2]
- xor 4*3($inp),@x[3]
- xor 4*4($inp),@x[4]
- xor 4*5($inp),@x[5]
- xor 4*6($inp),@x[6]
- xor 4*7($inp),@x[7]
- movdqu 4*8($inp),%xmm0
- xor 4*12($inp),@x[12]
- xor 4*13($inp),@x[13]
- xor 4*14($inp),@x[14]
- xor 4*15($inp),@x[15]
- lea 4*16($inp),$inp # inp+=64
- pxor %xmm1,%xmm0
-
- movdqa %xmm2,4*8(%rsp)
- movd %xmm3,4*12(%rsp)
-
- mov @x[0],4*0($out) # write output
- mov @x[1],4*1($out)
- mov @x[2],4*2($out)
- mov @x[3],4*3($out)
- mov @x[4],4*4($out)
- mov @x[5],4*5($out)
- mov @x[6],4*6($out)
- mov @x[7],4*7($out)
- movdqu %xmm0,4*8($out)
- mov @x[12],4*12($out)
- mov @x[13],4*13($out)
- mov @x[14],4*14($out)
- mov @x[15],4*15($out)
- lea 4*16($out),$out # out+=64
-
- sub \$64,%rbp
- jnz .Loop_outer
-
- jmp .Ldone
-
-.align 16
-.Ltail:
- mov @x[0],4*0(%rsp)
- mov @x[1],4*1(%rsp)
- xor %rbx,%rbx
- mov @x[2],4*2(%rsp)
- mov @x[3],4*3(%rsp)
- mov @x[4],4*4(%rsp)
- mov @x[5],4*5(%rsp)
- mov @x[6],4*6(%rsp)
- mov @x[7],4*7(%rsp)
- movdqa %xmm1,4*8(%rsp)
- mov @x[12],4*12(%rsp)
- mov @x[13],4*13(%rsp)
- mov @x[14],4*14(%rsp)
- mov @x[15],4*15(%rsp)
-
-.Loop_tail:
- movzb ($inp,%rbx),%eax
- movzb (%rsp,%rbx),%edx
- lea 1(%rbx),%rbx
- xor %edx,%eax
- mov %al,-1($out,%rbx)
- dec %rbp
- jnz .Loop_tail
-
-.Ldone:
- add \$64+24,%rsp
-.cfi_adjust_cfa_offset -64-24
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbp
-.cfi_restore %rbp
- pop %rbx
-.cfi_restore %rbx
-.Lno_data:
- ret
-.cfi_endproc
-___
-&end_function("chacha20_ctr32");
-}
-
-########################################################################
-# SSSE3 code path that handles shorter lengths
-{
-my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
-
-sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
- # Emits one round over the four state rows $a,$b,$c,$d. Both halves
- # are the same nine instructions; they differ only in the pshufb mask
- # applied to $d and in the right/left shift pair applied to $b.
- foreach my $half ([$rot16,20,12], [$rot24,25,7]) {
-  my ($mask,$right,$left)=@$half;
-
-  &paddd ($a,$b);
-  &pxor ($d,$a);
-  &pshufb ($d,$mask);
-
-  &paddd ($c,$d);
-  &pxor ($b,$c);
-  &movdqa ($t,$b);
-  &psrld ($b,$right);
-  &pslld ($t,$left);
-  &por ($b,$t);
- }
-}
-
-my $xframe = $win64 ? 32+8 : 8; # extra frame bytes chacha20_ssse3 subtracts from %rsp; the Win64 value leaves room for the xmm6/xmm7 save slots
-
-if($kernel) { # kernel builds wrap the SSSE3 code in a CONFIG_AS_SSSE3 preprocessor guard
- $code .= "#ifdef CONFIG_AS_SSSE3\n";
-}
-
-if($kernel) { # hchacha20_ssse3 is generated for kernel builds only
-&declare_function("hchacha20_ssse3", 32, 5);
-$code.=<<___; # load rows: .Lsigma, 32 bytes at ($len), 16 bytes at ($inp); load both shuffle masks; set up 10 loop iterations
- movdqa .Lsigma(%rip),$a
- movdqu ($len),$b
- movdqu 16($len),$c
- movdqu ($inp),$d
- # This code is only used when targeting kernel.
- # If targeting win64, xmm{6,7} preserving needs to be added.
- movdqa .Lrot16(%rip),$rot16
- movdqa .Lrot24(%rip),$rot24
- mov \$10,$counter # reuse $counter
- jmp 1f
-.align 32
-1:
-___
- &SSSE3ROUND(); # first round of the iteration, on the rows as loaded
- &pshufd ($a,$a,0b10010011); # permute the dword lanes of $a, $d and $c ...
- &pshufd ($d,$d,0b01001110);
- &pshufd ($c,$c,0b00111001); # ... so the next round pairs different lanes
- &nop ();
-
- &SSSE3ROUND(); # second round, on the permuted rows
- &pshufd ($a,$a,0b00111001); # inverse permutations: lanes return to their original order
- &pshufd ($d,$d,0b01001110);
- &pshufd ($c,$c,0b10010011);
-
- &dec ($counter); # ten passes through the loop body
- &jnz ("1b");
-
-$code.=<<___; # write rows $a and $d (32 bytes) to ($out); no feed-forward addition is emitted here
- movdqu $a, ($out)
- movdqu $d, 16($out)
- ret
-___
-&end_function("hchacha20_ssse3");
-}
-
-&declare_function("chacha20_ssse3", 32, 5);
-# Prologue of the single-block SSSE3 path. %r10 becomes the frame pointer
-# here, so the CPU capability word that chacha20_ctr32 loaded before
-# branching to this code is only available in %r9.
-$code.=<<___;
-.cfi_startproc
- lea 8(%rsp),%r10 # frame pointer
-.cfi_def_cfa_register %r10
-___
-# Non-kernel builds only: divert to the XOP path when capability bit 43 is
-# set. The test has to read %r9d: %r10 was overwritten with the frame
-# pointer just above, so testing %r10d would branch on a stack-address bit
-# instead of a CPU feature bit.
-$code.=<<___ if ($avx && !$kernel);
- test \$`1<<(43-32)`,%r9d
- jnz .Lchacha20_4xop # XOP is fastest even if we use 1/4
-___
-# Length dispatch: exactly 128 bytes goes to chacha20_128, more than 128
-# goes to chacha20_4x, anything shorter falls through. chacha20_4x may
-# jump back to .Ldo_ssse3_after_all. The frame is 64 bytes for the saved
-# state rows plus $xframe, aligned down to 16 bytes.
-$code.=<<___;
- cmp \$128,$len # we might throw away some data,
- je .Lchacha20_128
- ja .Lchacha20_4x # but overall it won't be slower
-
-.Ldo_ssse3_after_all:
- sub \$64+$xframe,%rsp
- and \$-16,%rsp
-___
-$code.=<<___ if ($win64); # Win64 only: save xmm6/xmm7 below the frame pointer before they are used as mask registers
- movaps %xmm6,-0x30(%r10)
- movaps %xmm7,-0x20(%r10)
-.Lssse3_body:
-___
-$code.=<<___; # load the four state rows and both shuffle masks, keep a copy of the rows at 0x00..0x30(%rsp); .Loop_outer_ssse3 reloads them and adds .Lone to the saved counter row
- movdqa .Lsigma(%rip),$a
- movdqu ($key),$b
- movdqu 16($key),$c
- movdqu ($counter),$d
- movdqa .Lrot16(%rip),$rot16
- movdqa .Lrot24(%rip),$rot24
-
- movdqa $a,0x00(%rsp)
- movdqa $b,0x10(%rsp)
- movdqa $c,0x20(%rsp)
- movdqa $d,0x30(%rsp)
- mov \$10,$counter # reuse $counter
- jmp .Loop_ssse3
-
-.align 32
-.Loop_outer_ssse3:
- movdqa .Lone(%rip),$d
- movdqa 0x00(%rsp),$a
- movdqa 0x10(%rsp),$b
- movdqa 0x20(%rsp),$c
- paddd 0x30(%rsp),$d
- mov \$10,$counter
- movdqa $d,0x30(%rsp)
- jmp .Loop_ssse3
-
-.align 32
-.Loop_ssse3:
-___
- &SSSE3ROUND(); # first round of the iteration, on the rows as loaded
- &pshufd ($a,$a,0b10010011); # permute the dword lanes of $a, $d and $c ...
- &pshufd ($d,$d,0b01001110);
- &pshufd ($c,$c,0b00111001); # ... so the next round pairs different lanes
- &nop ();
-
- &SSSE3ROUND(); # second round, on the permuted rows
- &pshufd ($a,$a,0b00111001); # inverse permutations: lanes return to their original order
- &pshufd ($d,$d,0b01001110);
- &pshufd ($c,$c,0b10010011);
-
- &dec ($counter); # $counter was set to 10 before entering .Loop_ssse3
- &jnz (".Loop_ssse3");
-
-$code.=<<___; # add the saved input rows back in, then either XOR a full 64-byte block and loop, or spill the rows to the stack and XOR the remaining bytes one at a time
- paddd 0x00(%rsp),$a
- paddd 0x10(%rsp),$b
- paddd 0x20(%rsp),$c
- paddd 0x30(%rsp),$d
-
- cmp \$64,$len
- jb .Ltail_ssse3
-
- movdqu 0x00($inp),$t
- movdqu 0x10($inp),$t1
- pxor $t,$a # xor with input
- movdqu 0x20($inp),$t
- pxor $t1,$b
- movdqu 0x30($inp),$t1
- lea 0x40($inp),$inp # inp+=64
- pxor $t,$c
- pxor $t1,$d
-
- movdqu $a,0x00($out) # write output
- movdqu $b,0x10($out)
- movdqu $c,0x20($out)
- movdqu $d,0x30($out)
- lea 0x40($out),$out # out+=64
-
- sub \$64,$len
- jnz .Loop_outer_ssse3
-
- jmp .Ldone_ssse3
-
-.align 16
-.Ltail_ssse3:
- movdqa $a,0x00(%rsp)
- movdqa $b,0x10(%rsp)
- movdqa $c,0x20(%rsp)
- movdqa $d,0x30(%rsp)
- xor $counter,$counter
-
-.Loop_tail_ssse3:
- movzb ($inp,$counter),%eax
- movzb (%rsp,$counter),%ecx
- lea 1($counter),$counter
- xor %ecx,%eax
- mov %al,-1($out,$counter)
- dec $len
- jnz .Loop_tail_ssse3
-
-.Ldone_ssse3:
-___
-$code.=<<___ if ($win64); # Win64 only: restore xmm6/xmm7 from the slots written in the prologue
- movaps -0x30(%r10),%xmm6
- movaps -0x20(%r10),%xmm7
-___
-$code.=<<___; # %r10 holds entry %rsp+8, so this puts %rsp back at its entry value before returning
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.Lssse3_epilogue:
- ret
-.cfi_endproc
-___
-} # closes the lexical scope that declared the row, temporary and mask register names
-&end_function("chacha20_ssse3");
-
-########################################################################
-# SSSE3 code path that handles 128-byte inputs
-{
-my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
-my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));
-
-sub SSSE3ROUND_2x {
- # Emits one round for two independent states, ($a,$b,$c,$d) and
- # ($a1,$b1,$c1,$d1), with their instructions interleaved. Both halves
- # use the same instruction order; only the pshufb mask and the
- # right/left shift pair change.
- foreach my $half ([$rot16,20,12], [$rot24,25,7]) {
-  my ($mask,$right,$left)=@$half;
-
-  &paddd ($a,$b);
-  &pxor ($d,$a);
-  &paddd ($a1,$b1);
-  &pxor ($d1,$a1);
-  &pshufb ($d,$mask);
-  &pshufb ($d1,$mask);
-
-  &paddd ($c,$d);
-  &paddd ($c1,$d1);
-  &pxor ($b,$c);
-  &pxor ($b1,$c1);
-  &movdqa ($t,$b);
-  &psrld ($b,$right);
-  &movdqa ($t1,$b1);
-  &pslld ($t,$left);
-  &psrld ($b1,$right);
-  &por ($b,$t);
-  &pslld ($t1,$left);
-  &por ($b1,$t1);
- }
-}
-
-my $xframe = $win64 ? 0x68 : 8; # extra frame bytes; the Win64 value leaves room for the six xmm6..xmm11 save slots
-
-$code.=<<___; # entry: %r10 = entry %rsp+8 as frame pointer, then a 16-byte-aligned frame of 64+$xframe bytes
-.type chacha20_128,\@function,5
-.align 32
-chacha20_128:
-.cfi_startproc
-.Lchacha20_128:
- lea 8(%rsp),%r10 # frame pointer
-.cfi_def_cfa_register %r10
- sub \$64+$xframe,%rsp
- and \$-16,%rsp
-___
-$code.=<<___ if ($win64); # Win64 only: save xmm6..xmm11 below the frame pointer
- movaps %xmm6,-0x70(%r10)
- movaps %xmm7,-0x60(%r10)
- movaps %xmm8,-0x50(%r10)
- movaps %xmm9,-0x40(%r10)
- movaps %xmm10,-0x30(%r10)
- movaps %xmm11,-0x20(%r10)
-.L128_body:
-___
-$code.=<<___; # load the state rows, duplicate them into the second register set with its counter row advanced by .Lone, and keep the first block's rows at 0x00..0x30(%rsp)
- movdqa .Lsigma(%rip),$a
- movdqu ($key),$b
- movdqu 16($key),$c
- movdqu ($counter),$d
- movdqa .Lone(%rip),$d1
- movdqa .Lrot16(%rip),$rot16
- movdqa .Lrot24(%rip),$rot24
-
- movdqa $a,$a1
- movdqa $a,0x00(%rsp)
- movdqa $b,$b1
- movdqa $b,0x10(%rsp)
- movdqa $c,$c1
- movdqa $c,0x20(%rsp)
- paddd $d,$d1
- movdqa $d,0x30(%rsp)
- mov \$10,$counter # reuse $counter
- jmp .Loop_128
-
-.align 32
-.Loop_128:
-___
- &SSSE3ROUND_2x(); # first round of the iteration, both states at once
- &pshufd ($a,$a,0b10010011); # permute lanes of $a/$d/$c in the first state ...
- &pshufd ($d,$d,0b01001110);
- &pshufd ($c,$c,0b00111001);
- &pshufd ($a1,$a1,0b10010011); # ... and identically in the second state
- &pshufd ($d1,$d1,0b01001110);
- &pshufd ($c1,$c1,0b00111001);
-
- &SSSE3ROUND_2x(); # second round, on the permuted rows
- &pshufd ($a,$a,0b00111001); # inverse permutations restore the lane order of both states
- &pshufd ($d,$d,0b01001110);
- &pshufd ($c,$c,0b10010011);
- &pshufd ($a1,$a1,0b00111001);
- &pshufd ($d1,$d1,0b01001110);
- &pshufd ($c1,$c1,0b10010011);
-
- &dec ($counter); # $counter was set to 10 before entering .Loop_128
- &jnz (".Loop_128");
-
-$code.=<<___; # add the saved rows to both states (the second also gets .Lone, matching its initial counter row), XOR 128 input bytes and store 128 output bytes; there is no loop or tail handling
- paddd 0x00(%rsp),$a
- paddd 0x10(%rsp),$b
- paddd 0x20(%rsp),$c
- paddd 0x30(%rsp),$d
- paddd .Lone(%rip),$d1
- paddd 0x00(%rsp),$a1
- paddd 0x10(%rsp),$b1
- paddd 0x20(%rsp),$c1
- paddd 0x30(%rsp),$d1
-
- movdqu 0x00($inp),$t
- movdqu 0x10($inp),$t1
- pxor $t,$a # xor with input
- movdqu 0x20($inp),$t
- pxor $t1,$b
- movdqu 0x30($inp),$t1
- pxor $t,$c
- movdqu 0x40($inp),$t
- pxor $t1,$d
- movdqu 0x50($inp),$t1
- pxor $t,$a1
- movdqu 0x60($inp),$t
- pxor $t1,$b1
- movdqu 0x70($inp),$t1
- pxor $t,$c1
- pxor $t1,$d1
-
- movdqu $a,0x00($out) # write output
- movdqu $b,0x10($out)
- movdqu $c,0x20($out)
- movdqu $d,0x30($out)
- movdqu $a1,0x40($out)
- movdqu $b1,0x50($out)
- movdqu $c1,0x60($out)
- movdqu $d1,0x70($out)
-___
-$code.=<<___ if ($win64); # Win64 only: restore xmm6..xmm11 from the slots written in the prologue
- movaps -0x70(%r10),%xmm6
- movaps -0x60(%r10),%xmm7
- movaps -0x50(%r10),%xmm8
- movaps -0x40(%r10),%xmm9
- movaps -0x30(%r10),%xmm10
- movaps -0x20(%r10),%xmm11
-___
-$code.=<<___; # %r10 holds entry %rsp+8, so this puts %rsp back at its entry value before returning
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.L128_epilogue:
- ret
-.cfi_endproc
-.size chacha20_128,.-chacha20_128
-___
-} # closes the lexical scope that declared the two register sets used by the 128-byte path
-
-########################################################################
-# SSSE3 code path that handles longer messages.
-{
-# assign variables to favor Atom front-end
-my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
- $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
-my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
-
-sub SSSE3_lane_ROUND { # returns a list of perlasm call strings for one full round (quarter-rounds Q1..Q4); the caller eval's each string in order
-my ($a0,$b0,$c0,$d0)=@_; # state-word indices (0..15) of the first quarter-round
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); # next quarter-round: same row (index&~3), column advanced by one modulo 4
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); # quoted register names: the two 'c' words currently held in registers, plus two temporaries
-my @x=map("\"$_\"",@xx); # quoted register name per state index; slots 8..11 of @xx are "%nox" placeholders and are never indexed below
-
- # Consider order in which variables are addressed by their
- # index:
- #
- # a b c d
- #
- # 0 4 8 12 < even round
- # 1 5 9 13
- # 2 6 10 14
- # 3 7 11 15
- # 0 5 10 15 < odd round
- # 1 6 11 12
- # 2 7 8 13
- # 3 4 9 14
- #
- # 'a', 'b' and 'd's are permanently allocated in registers,
- # @x[0..7,12..15], while 'c's are maintained in memory. If
- # you observe 'c' column, you'll notice that pair of 'c's is
- # invariant between rounds. This means that we have to reload
- # them once per round, in the middle. This is why you'll see
- # bunch of 'c' stores and loads in the middle, but none in
- # the beginning or end.
-
- ( # on entry $t1 is used as the pshufb mask for the first rotation; each half reloads it from (%r9) before finishing
- "&paddd (@x[$a0],@x[$b0])", # Q1
- "&paddd (@x[$a1],@x[$b1])", # Q2
- "&pxor (@x[$d0],@x[$a0])",
- "&pxor (@x[$d1],@x[$a1])",
- "&pshufb (@x[$d0],$t1)",
- "&pshufb (@x[$d1],$t1)",
-
- "&paddd ($xc,@x[$d0])",
- "&paddd ($xc_,@x[$d1])",
- "&pxor (@x[$b0],$xc)",
- "&pxor (@x[$b1],$xc_)",
- "&movdqa ($t0,@x[$b0])",
- "&pslld (@x[$b0],12)",
- "&psrld ($t0,20)",
- "&movdqa ($t1,@x[$b1])",
- "&pslld (@x[$b1],12)",
- "&por (@x[$b0],$t0)",
- "&psrld ($t1,20)",
- "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
- "&por (@x[$b1],$t1)",
-
- "&paddd (@x[$a0],@x[$b0])",
- "&paddd (@x[$a1],@x[$b1])",
- "&pxor (@x[$d0],@x[$a0])",
- "&pxor (@x[$d1],@x[$a1])",
- "&pshufb (@x[$d0],$t0)",
- "&pshufb (@x[$d1],$t0)",
-
- "&paddd ($xc,@x[$d0])",
- "&paddd ($xc_,@x[$d1])",
- "&pxor (@x[$b0],$xc)",
- "&pxor (@x[$b1],$xc_)",
- "&movdqa ($t1,@x[$b0])",
- "&pslld (@x[$b0],7)",
- "&psrld ($t1,25)",
- "&movdqa ($t0,@x[$b1])",
- "&pslld (@x[$b1],7)",
- "&por (@x[$b0],$t1)",
- "&psrld ($t0,25)",
- "&movdqa ($t1,'(%r9)')", # .Lrot16(%rip)
- "&por (@x[$b1],$t0)",
-
- "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
- "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)",
- "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")",
- "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")",
-
- "&paddd (@x[$a2],@x[$b2])", # Q3
- "&paddd (@x[$a3],@x[$b3])", # Q4
- "&pxor (@x[$d2],@x[$a2])",
- "&pxor (@x[$d3],@x[$a3])",
- "&pshufb (@x[$d2],$t1)",
- "&pshufb (@x[$d3],$t1)",
-
- "&paddd ($xc,@x[$d2])",
- "&paddd ($xc_,@x[$d3])",
- "&pxor (@x[$b2],$xc)",
- "&pxor (@x[$b3],$xc_)",
- "&movdqa ($t0,@x[$b2])",
- "&pslld (@x[$b2],12)",
- "&psrld ($t0,20)",
- "&movdqa ($t1,@x[$b3])",
- "&pslld (@x[$b3],12)",
- "&por (@x[$b2],$t0)",
- "&psrld ($t1,20)",
- "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
- "&por (@x[$b3],$t1)",
-
- "&paddd (@x[$a2],@x[$b2])",
- "&paddd (@x[$a3],@x[$b3])",
- "&pxor (@x[$d2],@x[$a2])",
- "&pxor (@x[$d3],@x[$a3])",
- "&pshufb (@x[$d2],$t0)",
- "&pshufb (@x[$d3],$t0)",
-
- "&paddd ($xc,@x[$d2])",
- "&paddd ($xc_,@x[$d3])",
- "&pxor (@x[$b2],$xc)",
- "&pxor (@x[$b3],$xc_)",
- "&movdqa ($t1,@x[$b2])",
- "&pslld (@x[$b2],7)",
- "&psrld ($t1,25)",
- "&movdqa ($t0,@x[$b3])",
- "&pslld (@x[$b3],7)",
- "&por (@x[$b2],$t1)",
- "&psrld ($t0,25)",
- "&movdqa ($t1,'(%r9)')", # .Lrot16(%rip)
- "&por (@x[$b3],$t0)"
- );
-}
-
-my $xframe = $win64 ? 0xa8 : 8; # extra frame bytes; the Win64 value leaves room for the ten xmm6..xmm15 save slots
-
-$code.=<<___; # entry: %r10 = entry %rsp+8 as frame pointer
-.type chacha20_4x,\@function,5
-.align 32
-chacha20_4x:
-.cfi_startproc
-.Lchacha20_4x:
- lea 8(%rsp),%r10 # frame pointer
-.cfi_def_cfa_register %r10
-___
-$code.=<<___ if (!$kernel); # non-kernel builds: keep a copy of the capability word from %r9 in %r11 for the Atom check below
- mov %r9,%r11
-___
-$code.=<<___ if ($avx>1 && !$kernel); # non-kernel builds with AVX2 code generated: hand over to the 8x path when the CPU reports AVX2
- shr \$32,%r9 # OPENSSL_ia32cap_P+8
- test \$`1<<5`,%r9 # test AVX2
- jnz .Lchacha20_8x
-___
-$code.=<<___; # more than 192 bytes always takes the 4-lane path
- cmp \$192,$len
- ja .Lproceed4x
-___
-$code.=<<___ if (!$kernel); # non-kernel builds: for 192 bytes or fewer, CPUs reporting MOVBE without XSAVE go back to the single-block SSSE3 code
- and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE
- cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE
- je .Ldo_ssse3_after_all # to detect Atom
-___
-$code.=<<___; # allocate the 0x140-byte working area described below plus $xframe, aligned down to 16 bytes
-.Lproceed4x:
- sub \$0x140+$xframe,%rsp
- and \$-16,%rsp
-___
- ################ stack layout
- # +0x00 SIMD equivalent of @x[8-11]
- # ...
- # +0x40 constant copy of key[0-2] smashed by lanes
- # ...
- # +0x100 SIMD counters (with nonce smashed by lanes)
- # ...
- # +0x140
-$code.=<<___ if ($win64); # Win64 only: save xmm6..xmm15 below the frame pointer
- movaps %xmm6,-0xb0(%r10)
- movaps %xmm7,-0xa0(%r10)
- movaps %xmm8,-0x90(%r10)
- movaps %xmm9,-0x80(%r10)
- movaps %xmm10,-0x70(%r10)
- movaps %xmm11,-0x60(%r10)
- movaps %xmm12,-0x50(%r10)
- movaps %xmm13,-0x40(%r10)
- movaps %xmm14,-0x30(%r10)
- movaps %xmm15,-0x20(%r10)
-.L4x_body:
-___
-$code.=<<___; # broadcast every 32-bit state word across a vector and store the copies on the stack; %rcx = %rsp+0x100, %r9 and %r11 point at .Lrot16 and .Lrot24; .Loop_outer4x reloads the copies and adds .Lfour to the counter vector
- movdqa .Lsigma(%rip),$xa3 # key[0]
- movdqu ($key),$xb3 # key[1]
- movdqu 16($key),$xt3 # key[2]
- movdqu ($counter),$xd3 # key[3]
- lea 0x100(%rsp),%rcx # size optimization
- lea .Lrot16(%rip),%r9
- lea .Lrot24(%rip),%r11
-
- pshufd \$0x00,$xa3,$xa0 # smash key by lanes...
- pshufd \$0x55,$xa3,$xa1
- movdqa $xa0,0x40(%rsp) # ... and offload
- pshufd \$0xaa,$xa3,$xa2
- movdqa $xa1,0x50(%rsp)
- pshufd \$0xff,$xa3,$xa3
- movdqa $xa2,0x60(%rsp)
- movdqa $xa3,0x70(%rsp)
-
- pshufd \$0x00,$xb3,$xb0
- pshufd \$0x55,$xb3,$xb1
- movdqa $xb0,0x80-0x100(%rcx)
- pshufd \$0xaa,$xb3,$xb2
- movdqa $xb1,0x90-0x100(%rcx)
- pshufd \$0xff,$xb3,$xb3
- movdqa $xb2,0xa0-0x100(%rcx)
- movdqa $xb3,0xb0-0x100(%rcx)
-
- pshufd \$0x00,$xt3,$xt0 # "$xc0"
- pshufd \$0x55,$xt3,$xt1 # "$xc1"
- movdqa $xt0,0xc0-0x100(%rcx)
- pshufd \$0xaa,$xt3,$xt2 # "$xc2"
- movdqa $xt1,0xd0-0x100(%rcx)
- pshufd \$0xff,$xt3,$xt3 # "$xc3"
- movdqa $xt2,0xe0-0x100(%rcx)
- movdqa $xt3,0xf0-0x100(%rcx)
-
- pshufd \$0x00,$xd3,$xd0
- pshufd \$0x55,$xd3,$xd1
- paddd .Linc(%rip),$xd0 # don't save counters yet
- pshufd \$0xaa,$xd3,$xd2
- movdqa $xd1,0x110-0x100(%rcx)
- pshufd \$0xff,$xd3,$xd3
- movdqa $xd2,0x120-0x100(%rcx)
- movdqa $xd3,0x130-0x100(%rcx)
-
- jmp .Loop_enter4x
-
-.align 32
-.Loop_outer4x:
- movdqa 0x40(%rsp),$xa0 # re-load smashed key
- movdqa 0x50(%rsp),$xa1
- movdqa 0x60(%rsp),$xa2
- movdqa 0x70(%rsp),$xa3
- movdqa 0x80-0x100(%rcx),$xb0
- movdqa 0x90-0x100(%rcx),$xb1
- movdqa 0xa0-0x100(%rcx),$xb2
- movdqa 0xb0-0x100(%rcx),$xb3
- movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
- movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
- movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
- movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
- movdqa 0x100-0x100(%rcx),$xd0
- movdqa 0x110-0x100(%rcx),$xd1
- movdqa 0x120-0x100(%rcx),$xd2
- movdqa 0x130-0x100(%rcx),$xd3
- paddd .Lfour(%rip),$xd0 # next SIMD counters
-
-.Loop_enter4x:
- movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]"
- movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]"
- movdqa (%r9),$xt3 # .Lrot16(%rip)
- mov \$10,%eax
- movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
- jmp .Loop4x
-
-.align 32
-.Loop4x:
-___
- foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; } # emit the round whose first quarter-round uses state words 0,4,8,12
- foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; } # emit the round whose first quarter-round uses state words 0,5,10,15
-$code.=<<___; # close the 10-iteration loop, add the saved 'a' words and transpose the four 'a' vectors with the unpack instructions
- dec %eax
- jnz .Loop4x
-
- paddd 0x40(%rsp),$xa0 # accumulate key material
- paddd 0x50(%rsp),$xa1
- paddd 0x60(%rsp),$xa2
- paddd 0x70(%rsp),$xa3
-
- movdqa $xa0,$xt2 # "de-interlace" data
- punpckldq $xa1,$xa0
- movdqa $xa2,$xt3
- punpckldq $xa3,$xa2
- punpckhdq $xa1,$xt2
- punpckhdq $xa3,$xt3
- movdqa $xa0,$xa1
- punpcklqdq $xa2,$xa0 # "a0"
- movdqa $xt2,$xa3
- punpcklqdq $xt3,$xt2 # "a2"
- punpckhqdq $xa2,$xa1 # "a1"
- punpckhqdq $xt3,$xa3 # "a3"
-___
- ($xa2,$xt2)=($xt2,$xa2); # generator-side rename only: the "a2" result was produced in the register previously named $xt2
-$code.=<<___; # same for the 'b' vectors; meanwhile spill "a0"/"a1" to 0x00/0x10(%rsp) and reload the two 'c' vectors parked at 0x20/0x30(%rsp)
- paddd 0x80-0x100(%rcx),$xb0
- paddd 0x90-0x100(%rcx),$xb1
- paddd 0xa0-0x100(%rcx),$xb2
- paddd 0xb0-0x100(%rcx),$xb3
-
- movdqa $xa0,0x00(%rsp) # offload $xaN
- movdqa $xa1,0x10(%rsp)
- movdqa 0x20(%rsp),$xa0 # "xc2"
- movdqa 0x30(%rsp),$xa1 # "xc3"
-
- movdqa $xb0,$xt2
- punpckldq $xb1,$xb0
- movdqa $xb2,$xt3
- punpckldq $xb3,$xb2
- punpckhdq $xb1,$xt2
- punpckhdq $xb3,$xt3
- movdqa $xb0,$xb1
- punpcklqdq $xb2,$xb0 # "b0"
- movdqa $xt2,$xb3
- punpcklqdq $xt3,$xt2 # "b2"
- punpckhqdq $xb2,$xb1 # "b1"
- punpckhqdq $xt3,$xb3 # "b3"
-___
- ($xb2,$xt2)=($xt2,$xb2); # generator-side rename: "b2" was produced in the register previously named $xt2
- my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); # the 'c' vectors: two in $xt0/$xt1, two just reloaded into the registers named $xa0/$xa1
-$code.=<<___; # same for the 'c' vectors; "a2"/"a3" are spilled to 0x20/0x30(%rsp) so their registers can be reused
- paddd 0xc0-0x100(%rcx),$xc0
- paddd 0xd0-0x100(%rcx),$xc1
- paddd 0xe0-0x100(%rcx),$xc2
- paddd 0xf0-0x100(%rcx),$xc3
-
- movdqa $xa2,0x20(%rsp) # keep offloading $xaN
- movdqa $xa3,0x30(%rsp)
-
- movdqa $xc0,$xt2
- punpckldq $xc1,$xc0
- movdqa $xc2,$xt3
- punpckldq $xc3,$xc2
- punpckhdq $xc1,$xt2
- punpckhdq $xc3,$xt3
- movdqa $xc0,$xc1
- punpcklqdq $xc2,$xc0 # "c0"
- movdqa $xt2,$xc3
- punpcklqdq $xt3,$xt2 # "c2"
- punpckhqdq $xc2,$xc1 # "c1"
- punpckhqdq $xt3,$xc3 # "c3"
-___
- ($xc2,$xt2)=($xt2,$xc2); # generator-side rename: "c2" was produced in the register previously named $xt2
- ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary
-$code.=<<___; # same for the 'd' vectors, adding the saved counter/nonce vectors from 0x100..0x130 relative to %rsp
- paddd 0x100-0x100(%rcx),$xd0
- paddd 0x110-0x100(%rcx),$xd1
- paddd 0x120-0x100(%rcx),$xd2
- paddd 0x130-0x100(%rcx),$xd3
-
- movdqa $xd0,$xt2
- punpckldq $xd1,$xd0
- movdqa $xd2,$xt3
- punpckldq $xd3,$xd2
- punpckhdq $xd1,$xt2
- punpckhdq $xd3,$xt3
- movdqa $xd0,$xd1
- punpcklqdq $xd2,$xd0 # "d0"
- movdqa $xt2,$xd3
- punpcklqdq $xt3,$xt2 # "d2"
- punpckhqdq $xd2,$xd1 # "d1"
- punpckhqdq $xt3,$xd3 # "d3"
-___
- ($xd2,$xt2)=($xt2,$xd2); # generator-side rename: "d2" was produced in the register previously named $xt2
-$code.=<<___; # with 256 or more bytes left: XOR and store four 64-byte blocks, then loop; otherwise emit as many whole 64-byte blocks as fit, park the next block's key stream at 0x00..0x30(%rsp) and XOR the remaining bytes one at a time
- cmp \$64*4,$len
- jb .Ltail4x
-
- movdqu 0x00($inp),$xt0 # xor with input
- movdqu 0x10($inp),$xt1
- movdqu 0x20($inp),$xt2
- movdqu 0x30($inp),$xt3
- pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
- pxor $xb0,$xt1
- pxor $xc0,$xt2
- pxor $xd0,$xt3
-
- movdqu $xt0,0x00($out)
- movdqu 0x40($inp),$xt0
- movdqu $xt1,0x10($out)
- movdqu 0x50($inp),$xt1
- movdqu $xt2,0x20($out)
- movdqu 0x60($inp),$xt2
- movdqu $xt3,0x30($out)
- movdqu 0x70($inp),$xt3
- lea 0x80($inp),$inp # size optimization
- pxor 0x10(%rsp),$xt0
- pxor $xb1,$xt1
- pxor $xc1,$xt2
- pxor $xd1,$xt3
-
- movdqu $xt0,0x40($out)
- movdqu 0x00($inp),$xt0
- movdqu $xt1,0x50($out)
- movdqu 0x10($inp),$xt1
- movdqu $xt2,0x60($out)
- movdqu 0x20($inp),$xt2
- movdqu $xt3,0x70($out)
- lea 0x80($out),$out # size optimization
- movdqu 0x30($inp),$xt3
- pxor 0x20(%rsp),$xt0
- pxor $xb2,$xt1
- pxor $xc2,$xt2
- pxor $xd2,$xt3
-
- movdqu $xt0,0x00($out)
- movdqu 0x40($inp),$xt0
- movdqu $xt1,0x10($out)
- movdqu 0x50($inp),$xt1
- movdqu $xt2,0x20($out)
- movdqu 0x60($inp),$xt2
- movdqu $xt3,0x30($out)
- movdqu 0x70($inp),$xt3
- lea 0x80($inp),$inp # inp+=64*4
- pxor 0x30(%rsp),$xt0
- pxor $xb3,$xt1
- pxor $xc3,$xt2
- pxor $xd3,$xt3
- movdqu $xt0,0x40($out)
- movdqu $xt1,0x50($out)
- movdqu $xt2,0x60($out)
- movdqu $xt3,0x70($out)
- lea 0x80($out),$out # out+=64*4
-
- sub \$64*4,$len
- jnz .Loop_outer4x
-
- jmp .Ldone4x
-
-.Ltail4x:
- cmp \$192,$len
- jae .L192_or_more4x
- cmp \$128,$len
- jae .L128_or_more4x
- cmp \$64,$len
- jae .L64_or_more4x
-
- #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
- xor %r9,%r9
- #movdqa $xt0,0x00(%rsp)
- movdqa $xb0,0x10(%rsp)
- movdqa $xc0,0x20(%rsp)
- movdqa $xd0,0x30(%rsp)
- jmp .Loop_tail4x
-
-.align 32
-.L64_or_more4x:
- movdqu 0x00($inp),$xt0 # xor with input
- movdqu 0x10($inp),$xt1
- movdqu 0x20($inp),$xt2
- movdqu 0x30($inp),$xt3
- pxor 0x00(%rsp),$xt0 # $xaxN is offloaded, remember?
- pxor $xb0,$xt1
- pxor $xc0,$xt2
- pxor $xd0,$xt3
- movdqu $xt0,0x00($out)
- movdqu $xt1,0x10($out)
- movdqu $xt2,0x20($out)
- movdqu $xt3,0x30($out)
- je .Ldone4x
-
- movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember?
- lea 0x40($inp),$inp # inp+=64*1
- xor %r9,%r9
- movdqa $xt0,0x00(%rsp)
- movdqa $xb1,0x10(%rsp)
- lea 0x40($out),$out # out+=64*1
- movdqa $xc1,0x20(%rsp)
- sub \$64,$len # len-=64*1
- movdqa $xd1,0x30(%rsp)
- jmp .Loop_tail4x
-
-.align 32
-.L128_or_more4x:
- movdqu 0x00($inp),$xt0 # xor with input
- movdqu 0x10($inp),$xt1
- movdqu 0x20($inp),$xt2
- movdqu 0x30($inp),$xt3
- pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
- pxor $xb0,$xt1
- pxor $xc0,$xt2
- pxor $xd0,$xt3
-
- movdqu $xt0,0x00($out)
- movdqu 0x40($inp),$xt0
- movdqu $xt1,0x10($out)
- movdqu 0x50($inp),$xt1
- movdqu $xt2,0x20($out)
- movdqu 0x60($inp),$xt2
- movdqu $xt3,0x30($out)
- movdqu 0x70($inp),$xt3
- pxor 0x10(%rsp),$xt0
- pxor $xb1,$xt1
- pxor $xc1,$xt2
- pxor $xd1,$xt3
- movdqu $xt0,0x40($out)
- movdqu $xt1,0x50($out)
- movdqu $xt2,0x60($out)
- movdqu $xt3,0x70($out)
- je .Ldone4x
-
- movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember?
- lea 0x80($inp),$inp # inp+=64*2
- xor %r9,%r9
- movdqa $xt0,0x00(%rsp)
- movdqa $xb2,0x10(%rsp)
- lea 0x80($out),$out # out+=64*2
- movdqa $xc2,0x20(%rsp)
- sub \$128,$len # len-=64*2
- movdqa $xd2,0x30(%rsp)
- jmp .Loop_tail4x
-
-.align 32
-.L192_or_more4x:
- movdqu 0x00($inp),$xt0 # xor with input
- movdqu 0x10($inp),$xt1
- movdqu 0x20($inp),$xt2
- movdqu 0x30($inp),$xt3
- pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
- pxor $xb0,$xt1
- pxor $xc0,$xt2
- pxor $xd0,$xt3
-
- movdqu $xt0,0x00($out)
- movdqu 0x40($inp),$xt0
- movdqu $xt1,0x10($out)
- movdqu 0x50($inp),$xt1
- movdqu $xt2,0x20($out)
- movdqu 0x60($inp),$xt2
- movdqu $xt3,0x30($out)
- movdqu 0x70($inp),$xt3
- lea 0x80($inp),$inp # size optimization
- pxor 0x10(%rsp),$xt0
- pxor $xb1,$xt1
- pxor $xc1,$xt2
- pxor $xd1,$xt3
-
- movdqu $xt0,0x40($out)
- movdqu 0x00($inp),$xt0
- movdqu $xt1,0x50($out)
- movdqu 0x10($inp),$xt1
- movdqu $xt2,0x60($out)
- movdqu 0x20($inp),$xt2
- movdqu $xt3,0x70($out)
- lea 0x80($out),$out # size optimization
- movdqu 0x30($inp),$xt3
- pxor 0x20(%rsp),$xt0
- pxor $xb2,$xt1
- pxor $xc2,$xt2
- pxor $xd2,$xt3
- movdqu $xt0,0x00($out)
- movdqu $xt1,0x10($out)
- movdqu $xt2,0x20($out)
- movdqu $xt3,0x30($out)
- je .Ldone4x
-
- movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember?
- lea 0x40($inp),$inp # inp+=64*3
- xor %r9,%r9
- movdqa $xt0,0x00(%rsp)
- movdqa $xb3,0x10(%rsp)
- lea 0x40($out),$out # out+=64*3
- movdqa $xc3,0x20(%rsp)
- sub \$192,$len # len-=64*3
- movdqa $xd3,0x30(%rsp)
-
-.Loop_tail4x:
- movzb ($inp,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1($out,%r9)
- dec $len
- jnz .Loop_tail4x
-
-.Ldone4x:
-___
-$code.=<<___ if ($win64); # Win64 only: restore xmm6..xmm15 from the slots written in the prologue
- movaps -0xb0(%r10),%xmm6
- movaps -0xa0(%r10),%xmm7
- movaps -0x90(%r10),%xmm8
- movaps -0x80(%r10),%xmm9
- movaps -0x70(%r10),%xmm10
- movaps -0x60(%r10),%xmm11
- movaps -0x50(%r10),%xmm12
- movaps -0x40(%r10),%xmm13
- movaps -0x30(%r10),%xmm14
- movaps -0x20(%r10),%xmm15
-___
-$code.=<<___; # %r10 holds entry %rsp+8, so this puts %rsp back at its entry value before returning
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.L4x_epilogue:
- ret
-.cfi_endproc
-.size chacha20_4x,.-chacha20_4x
-___
-} # closes the lexical scope that declared the 4-lane register names
-if($kernel) { # kernel builds: close the CONFIG_AS_SSSE3 guard opened before hchacha20_ssse3
- $code .= "#endif\n";
-}
-
-########################################################################
-# XOP code path that handles all lengths.
-if ($avx && !$kernel) {
-# There is some "anomaly" observed depending on instructions' size or
-# alignment. If you look closely at below code you'll notice that
-# sometimes argument order varies. The order affects instruction
-# encoding by making it larger, and such fiddling gives 5% performance
-# improvement. This is on FX-4100...
-
-my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
- $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
-my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
-
-sub XOP_lane_ROUND { # returns a list of perlasm call strings for one full round with Q1..Q4 interleaved; the caller eval's each string in order
-my ($a0,$b0,$c0,$d0)=@_; # state-word indices (0..15) of the first quarter-round
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); # next quarter-round: same row (index&~3), column advanced by one modulo 4
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my @x=map("\"$_\"",@xx); # quoted register name per state index; unlike the SSSE3 variant all 16 words have a register here
-
- ( # rotations use vprotd with immediate counts 16, 12, 8 and 7, so no mask registers or shift/or pairs are needed
- "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
- "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
- "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
- "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
- "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
- "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
- "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
- "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
- "&vprotd (@x[$d0],@x[$d0],16)",
- "&vprotd (@x[$d1],@x[$d1],16)",
- "&vprotd (@x[$d2],@x[$d2],16)",
- "&vprotd (@x[$d3],@x[$d3],16)",
-
- "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
- "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
- "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
- "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
- "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
- "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
- "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
- "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
- "&vprotd (@x[$b0],@x[$b0],12)",
- "&vprotd (@x[$b1],@x[$b1],12)",
- "&vprotd (@x[$b2],@x[$b2],12)",
- "&vprotd (@x[$b3],@x[$b3],12)",
-
- "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip
- "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip
- "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
- "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
- "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
- "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
- "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
- "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
- "&vprotd (@x[$d0],@x[$d0],8)",
- "&vprotd (@x[$d1],@x[$d1],8)",
- "&vprotd (@x[$d2],@x[$d2],8)",
- "&vprotd (@x[$d3],@x[$d3],8)",
-
- "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
- "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
- "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
- "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
- "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
- "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
- "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
- "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
- "&vprotd (@x[$b0],@x[$b0],7)",
- "&vprotd (@x[$b1],@x[$b1],7)",
- "&vprotd (@x[$b2],@x[$b2],7)",
- "&vprotd (@x[$b3],@x[$b3],7)"
- );
-}
-
-my $xframe = $win64 ? 0xa8 : 8;
-
-&declare_function("chacha20_xop", 32, 5);
-$code.=<<___;
-.cfi_startproc
-.Lchacha20_4xop:
- lea 8(%rsp),%r10 # frame pointer
-.cfi_def_cfa_register %r10
- sub \$0x140+$xframe,%rsp
- and \$-16,%rsp
-___
- ################ stack layout
- # +0x00 SIMD equivalent of @x[8-12]
- # ...
- # +0x40 constant copy of key[0-2] smashed by lanes
- # ...
- # +0x100 SIMD counters (with nonce smashed by lanes)
- # ...
- # +0x140
-$code.=<<___ if ($win64);
- movaps %xmm6,-0xb0(%r10)
- movaps %xmm7,-0xa0(%r10)
- movaps %xmm8,-0x90(%r10)
- movaps %xmm9,-0x80(%r10)
- movaps %xmm10,-0x70(%r10)
- movaps %xmm11,-0x60(%r10)
- movaps %xmm12,-0x50(%r10)
- movaps %xmm13,-0x40(%r10)
- movaps %xmm14,-0x30(%r10)
- movaps %xmm15,-0x20(%r10)
-.L4xop_body:
-___
-$code.=<<___;
- vzeroupper
-
- vmovdqa .Lsigma(%rip),$xa3 # key[0]
- vmovdqu ($key),$xb3 # key[1]
- vmovdqu 16($key),$xt3 # key[2]
- vmovdqu ($counter),$xd3 # key[3]
- lea 0x100(%rsp),%rcx # size optimization
-
- vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
- vpshufd \$0x55,$xa3,$xa1
- vmovdqa $xa0,0x40(%rsp) # ... and offload
- vpshufd \$0xaa,$xa3,$xa2
- vmovdqa $xa1,0x50(%rsp)
- vpshufd \$0xff,$xa3,$xa3
- vmovdqa $xa2,0x60(%rsp)
- vmovdqa $xa3,0x70(%rsp)
-
- vpshufd \$0x00,$xb3,$xb0
- vpshufd \$0x55,$xb3,$xb1
- vmovdqa $xb0,0x80-0x100(%rcx)
- vpshufd \$0xaa,$xb3,$xb2
- vmovdqa $xb1,0x90-0x100(%rcx)
- vpshufd \$0xff,$xb3,$xb3
- vmovdqa $xb2,0xa0-0x100(%rcx)
- vmovdqa $xb3,0xb0-0x100(%rcx)
-
- vpshufd \$0x00,$xt3,$xt0 # "$xc0"
- vpshufd \$0x55,$xt3,$xt1 # "$xc1"
- vmovdqa $xt0,0xc0-0x100(%rcx)
- vpshufd \$0xaa,$xt3,$xt2 # "$xc2"
- vmovdqa $xt1,0xd0-0x100(%rcx)
- vpshufd \$0xff,$xt3,$xt3 # "$xc3"
- vmovdqa $xt2,0xe0-0x100(%rcx)
- vmovdqa $xt3,0xf0-0x100(%rcx)
-
- vpshufd \$0x00,$xd3,$xd0
- vpshufd \$0x55,$xd3,$xd1
- vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet
- vpshufd \$0xaa,$xd3,$xd2
- vmovdqa $xd1,0x110-0x100(%rcx)
- vpshufd \$0xff,$xd3,$xd3
- vmovdqa $xd2,0x120-0x100(%rcx)
- vmovdqa $xd3,0x130-0x100(%rcx)
-
- jmp .Loop_enter4xop
-
-.align 32
-.Loop_outer4xop:
- vmovdqa 0x40(%rsp),$xa0 # re-load smashed key
- vmovdqa 0x50(%rsp),$xa1
- vmovdqa 0x60(%rsp),$xa2
- vmovdqa 0x70(%rsp),$xa3
- vmovdqa 0x80-0x100(%rcx),$xb0
- vmovdqa 0x90-0x100(%rcx),$xb1
- vmovdqa 0xa0-0x100(%rcx),$xb2
- vmovdqa 0xb0-0x100(%rcx),$xb3
- vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
- vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
- vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
- vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
- vmovdqa 0x100-0x100(%rcx),$xd0
- vmovdqa 0x110-0x100(%rcx),$xd1
- vmovdqa 0x120-0x100(%rcx),$xd2
- vmovdqa 0x130-0x100(%rcx),$xd3
- vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters
-
-.Loop_enter4xop:
- mov \$10,%eax
- vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
- jmp .Loop4xop
-
-.align 32
-.Loop4xop:
-___
- foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
- foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- dec %eax
- jnz .Loop4xop
-
- vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material
- vpaddd 0x50(%rsp),$xa1,$xa1
- vpaddd 0x60(%rsp),$xa2,$xa2
- vpaddd 0x70(%rsp),$xa3,$xa3
-
- vmovdqa $xt2,0x20(%rsp) # offload $xc2,3
- vmovdqa $xt3,0x30(%rsp)
-
- vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
- vpunpckldq $xa3,$xa2,$xt3
- vpunpckhdq $xa1,$xa0,$xa0
- vpunpckhdq $xa3,$xa2,$xa2
- vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
- vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
- vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
-___
- ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
-$code.=<<___;
- vpaddd 0x80-0x100(%rcx),$xb0,$xb0
- vpaddd 0x90-0x100(%rcx),$xb1,$xb1
- vpaddd 0xa0-0x100(%rcx),$xb2,$xb2
- vpaddd 0xb0-0x100(%rcx),$xb3,$xb3
-
- vmovdqa $xa0,0x00(%rsp) # offload $xa0,1
- vmovdqa $xa1,0x10(%rsp)
- vmovdqa 0x20(%rsp),$xa0 # "xc2"
- vmovdqa 0x30(%rsp),$xa1 # "xc3"
-
- vpunpckldq $xb1,$xb0,$xt2
- vpunpckldq $xb3,$xb2,$xt3
- vpunpckhdq $xb1,$xb0,$xb0
- vpunpckhdq $xb3,$xb2,$xb2
- vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
- vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
- vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
-___
- ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
- my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
-$code.=<<___;
- vpaddd 0xc0-0x100(%rcx),$xc0,$xc0
- vpaddd 0xd0-0x100(%rcx),$xc1,$xc1
- vpaddd 0xe0-0x100(%rcx),$xc2,$xc2
- vpaddd 0xf0-0x100(%rcx),$xc3,$xc3
-
- vpunpckldq $xc1,$xc0,$xt2
- vpunpckldq $xc3,$xc2,$xt3
- vpunpckhdq $xc1,$xc0,$xc0
- vpunpckhdq $xc3,$xc2,$xc2
- vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
- vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
- vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
-___
- ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
-$code.=<<___;
- vpaddd 0x100-0x100(%rcx),$xd0,$xd0
- vpaddd 0x110-0x100(%rcx),$xd1,$xd1
- vpaddd 0x120-0x100(%rcx),$xd2,$xd2
- vpaddd 0x130-0x100(%rcx),$xd3,$xd3
-
- vpunpckldq $xd1,$xd0,$xt2
- vpunpckldq $xd3,$xd2,$xt3
- vpunpckhdq $xd1,$xd0,$xd0
- vpunpckhdq $xd3,$xd2,$xd2
- vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
- vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
- vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
-___
- ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
- ($xa0,$xa1)=($xt2,$xt3);
-$code.=<<___;
- vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1
- vmovdqa 0x10(%rsp),$xa1
-
- cmp \$64*4,$len
- jb .Ltail4xop
-
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x10($inp),$xb0,$xb0
- vpxor 0x20($inp),$xc0,$xc0
- vpxor 0x30($inp),$xd0,$xd0
- vpxor 0x40($inp),$xa1,$xa1
- vpxor 0x50($inp),$xb1,$xb1
- vpxor 0x60($inp),$xc1,$xc1
- vpxor 0x70($inp),$xd1,$xd1
- lea 0x80($inp),$inp # size optimization
- vpxor 0x00($inp),$xa2,$xa2
- vpxor 0x10($inp),$xb2,$xb2
- vpxor 0x20($inp),$xc2,$xc2
- vpxor 0x30($inp),$xd2,$xd2
- vpxor 0x40($inp),$xa3,$xa3
- vpxor 0x50($inp),$xb3,$xb3
- vpxor 0x60($inp),$xc3,$xc3
- vpxor 0x70($inp),$xd3,$xd3
- lea 0x80($inp),$inp # inp+=64*4
-
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x10($out)
- vmovdqu $xc0,0x20($out)
- vmovdqu $xd0,0x30($out)
- vmovdqu $xa1,0x40($out)
- vmovdqu $xb1,0x50($out)
- vmovdqu $xc1,0x60($out)
- vmovdqu $xd1,0x70($out)
- lea 0x80($out),$out # size optimization
- vmovdqu $xa2,0x00($out)
- vmovdqu $xb2,0x10($out)
- vmovdqu $xc2,0x20($out)
- vmovdqu $xd2,0x30($out)
- vmovdqu $xa3,0x40($out)
- vmovdqu $xb3,0x50($out)
- vmovdqu $xc3,0x60($out)
- vmovdqu $xd3,0x70($out)
- lea 0x80($out),$out # out+=64*4
-
- sub \$64*4,$len
- jnz .Loop_outer4xop
-
- jmp .Ldone4xop
-
-.align 32
-.Ltail4xop:
- cmp \$192,$len
- jae .L192_or_more4xop
- cmp \$128,$len
- jae .L128_or_more4xop
- cmp \$64,$len
- jae .L64_or_more4xop
-
- xor %r9,%r9
- vmovdqa $xa0,0x00(%rsp)
- vmovdqa $xb0,0x10(%rsp)
- vmovdqa $xc0,0x20(%rsp)
- vmovdqa $xd0,0x30(%rsp)
- jmp .Loop_tail4xop
-
-.align 32
-.L64_or_more4xop:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x10($inp),$xb0,$xb0
- vpxor 0x20($inp),$xc0,$xc0
- vpxor 0x30($inp),$xd0,$xd0
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x10($out)
- vmovdqu $xc0,0x20($out)
- vmovdqu $xd0,0x30($out)
- je .Ldone4xop
-
- lea 0x40($inp),$inp # inp+=64*1
- vmovdqa $xa1,0x00(%rsp)
- xor %r9,%r9
- vmovdqa $xb1,0x10(%rsp)
- lea 0x40($out),$out # out+=64*1
- vmovdqa $xc1,0x20(%rsp)
- sub \$64,$len # len-=64*1
- vmovdqa $xd1,0x30(%rsp)
- jmp .Loop_tail4xop
-
-.align 32
-.L128_or_more4xop:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x10($inp),$xb0,$xb0
- vpxor 0x20($inp),$xc0,$xc0
- vpxor 0x30($inp),$xd0,$xd0
- vpxor 0x40($inp),$xa1,$xa1
- vpxor 0x50($inp),$xb1,$xb1
- vpxor 0x60($inp),$xc1,$xc1
- vpxor 0x70($inp),$xd1,$xd1
-
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x10($out)
- vmovdqu $xc0,0x20($out)
- vmovdqu $xd0,0x30($out)
- vmovdqu $xa1,0x40($out)
- vmovdqu $xb1,0x50($out)
- vmovdqu $xc1,0x60($out)
- vmovdqu $xd1,0x70($out)
- je .Ldone4xop
-
- lea 0x80($inp),$inp # inp+=64*2
- vmovdqa $xa2,0x00(%rsp)
- xor %r9,%r9
- vmovdqa $xb2,0x10(%rsp)
- lea 0x80($out),$out # out+=64*2
- vmovdqa $xc2,0x20(%rsp)
- sub \$128,$len # len-=64*2
- vmovdqa $xd2,0x30(%rsp)
- jmp .Loop_tail4xop
-
-.align 32
-.L192_or_more4xop:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x10($inp),$xb0,$xb0
- vpxor 0x20($inp),$xc0,$xc0
- vpxor 0x30($inp),$xd0,$xd0
- vpxor 0x40($inp),$xa1,$xa1
- vpxor 0x50($inp),$xb1,$xb1
- vpxor 0x60($inp),$xc1,$xc1
- vpxor 0x70($inp),$xd1,$xd1
- lea 0x80($inp),$inp # size optimization
- vpxor 0x00($inp),$xa2,$xa2
- vpxor 0x10($inp),$xb2,$xb2
- vpxor 0x20($inp),$xc2,$xc2
- vpxor 0x30($inp),$xd2,$xd2
-
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x10($out)
- vmovdqu $xc0,0x20($out)
- vmovdqu $xd0,0x30($out)
- vmovdqu $xa1,0x40($out)
- vmovdqu $xb1,0x50($out)
- vmovdqu $xc1,0x60($out)
- vmovdqu $xd1,0x70($out)
- lea 0x80($out),$out # size optimization
- vmovdqu $xa2,0x00($out)
- vmovdqu $xb2,0x10($out)
- vmovdqu $xc2,0x20($out)
- vmovdqu $xd2,0x30($out)
- je .Ldone4xop
-
- lea 0x40($inp),$inp # inp+=64*3
- vmovdqa $xa3,0x00(%rsp)
- xor %r9,%r9
- vmovdqa $xb3,0x10(%rsp)
- lea 0x40($out),$out # out+=64*3
- vmovdqa $xc3,0x20(%rsp)
- sub \$192,$len # len-=64*3
- vmovdqa $xd3,0x30(%rsp)
-
-.Loop_tail4xop:
- movzb ($inp,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1($out,%r9)
- dec $len
- jnz .Loop_tail4xop
-
-.Ldone4xop:
- vzeroupper
-___
-$code.=<<___ if ($win64);
- movaps -0xb0(%r10),%xmm6
- movaps -0xa0(%r10),%xmm7
- movaps -0x90(%r10),%xmm8
- movaps -0x80(%r10),%xmm9
- movaps -0x70(%r10),%xmm10
- movaps -0x60(%r10),%xmm11
- movaps -0x50(%r10),%xmm12
- movaps -0x40(%r10),%xmm13
- movaps -0x30(%r10),%xmm14
- movaps -0x20(%r10),%xmm15
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.L4xop_epilogue:
- ret
-.cfi_endproc
-___
-&end_function("chacha20_xop");
-}
-
-########################################################################
-# AVX2 code path
-if ($avx>1) {
-
-if($kernel) { # kernel builds wrap the AVX2 code in a preprocessor guard
- $code .= "#ifdef CONFIG_AS_AVX2\n";
-}
-
-my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, # note the order: b's get %ymm0-3, d's get %ymm4-7,
- $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15)); # a's get %ymm8-11 and the scratch registers %ymm12-15
-my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, # register name for each of the 16 state-word indices;
- "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); # indices 8..11 (the 'c' words) have no register, hence the "%nox" placeholder
-
-sub AVX2_lane_ROUND { # returns a list of perlasm statements (as strings, for the caller to eval) covering four quarter-rounds
-my ($a0,$b0,$c0,$d0)=@_; # state-word indices used by quarter-round Q1
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); # Q2: stay in the same group of four ($_&~3), advance the position by one modulo 4
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); # Q3: one more step from Q2
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); # Q4: one more step from Q3
-my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); # quoted scratch registers: $xc/$xc_ carry the two 'c' words in use, $t0/$t1 are temporaries and shuffle masks
-my @x=map("\"$_\"",@xx); # quoted register name per state-word index (entries 8..11 are "%nox" placeholders)
-
- # Consider order in which variables are addressed by their
- # index:
- #
- # a b c d
- #
- # 0 4 8 12 < even round
- # 1 5 9 13
- # 2 6 10 14
- # 3 7 11 15
- # 0 5 10 15 < odd round
- # 1 6 11 12
- # 2 7 8 13
- # 3 4 9 14
- #
- # 'a', 'b' and 'd's are permanently allocated in registers,
- # @x[0..7,12..15], while 'c's are maintained in memory. If
- # you observe 'c' column, you'll notice that pair of 'c's is
- # invariant between rounds. This means that we have to reload
- # them once per round, in the middle. This is why you'll see
- # bunch of 'c' stores and loads in the middle, but none in
- # the beginning or end.
-
- (
- "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1: a += b
- "&vpxor (@x[$d0],@x[$a0],@x[$d0])", # d ^= a
- "&vpshufb (@x[$d0],@x[$d0],$t1)", # byte-shuffle d with the mask in $t1 (loaded from (%r9), annotated .Lrot16 below)
- "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2: same three steps, interleaved with Q1
- "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
- "&vpshufb (@x[$d1],@x[$d1],$t1)",
-
- "&vpaddd ($xc,$xc,@x[$d0])", # c += d, using the first 'c' scratch register
- "&vpxor (@x[$b0],$xc,@x[$b0])", # b ^= c
- "&vpslld ($t0,@x[$b0],12)", # rotate b left by 12: shift left 12 into $t0 ...
- "&vpsrld (@x[$b0],@x[$b0],20)", # ... shift right 20 in place ...
- "&vpor (@x[$b0],$t0,@x[$b0])", # ... and OR the halves together
- "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
- "&vpaddd ($xc_,$xc_,@x[$d1])", # same c/b steps for Q2, with the second 'c' scratch register
- "&vpxor (@x[$b1],$xc_,@x[$b1])",
- "&vpslld ($t1,@x[$b1],12)", # $t1 is reused as a temporary here, so its mask is lost until reloaded below
- "&vpsrld (@x[$b1],@x[$b1],20)",
- "&vpor (@x[$b1],$t1,@x[$b1])",
-
- "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # second half of Q1/Q2: a += b
- "&vpxor (@x[$d0],@x[$a0],@x[$d0])", # d ^= a
- "&vpshufb (@x[$d0],@x[$d0],$t0)", # byte-shuffle d with the mask just loaded from (%r11)
- "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
- "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
- "&vpshufb (@x[$d1],@x[$d1],$t0)",
-
- "&vpaddd ($xc,$xc,@x[$d0])", # c += d
- "&vpxor (@x[$b0],$xc,@x[$b0])", # b ^= c
- "&vpslld ($t1,@x[$b0],7)", # rotate b left by 7: shift left 7, shift right 25, OR
- "&vpsrld (@x[$b0],@x[$b0],25)",
- "&vpor (@x[$b0],$t1,@x[$b0])",
- "&vbroadcasti128($t1,'(%r9)')", # .Lrot16(%rip)
- "&vpaddd ($xc_,$xc_,@x[$d1])",
- "&vpxor (@x[$b1],$xc_,@x[$b1])",
- "&vpslld ($t0,@x[$b1],7)",
- "&vpsrld (@x[$b1],@x[$b1],25)",
- "&vpor (@x[$b1],$t0,@x[$b1])",
-
- "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's: first store the two just used (destination operand comes first) ...
- "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)", # ... each 'c' word has a 32-byte stack slot at 32*(index-8)(%rsp)
- "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")", # ... then load the pair needed by Q3/Q4
- "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")",
-
- "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3: identical sequence to Q1, on the third set of indices
- "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
- "&vpshufb (@x[$d2],@x[$d2],$t1)",
- "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4: identical sequence to Q2, on the fourth set of indices
- "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
- "&vpshufb (@x[$d3],@x[$d3],$t1)",
-
- "&vpaddd ($xc,$xc,@x[$d2])",
- "&vpxor (@x[$b2],$xc,@x[$b2])",
- "&vpslld ($t0,@x[$b2],12)",
- "&vpsrld (@x[$b2],@x[$b2],20)",
- "&vpor (@x[$b2],$t0,@x[$b2])",
- "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
- "&vpaddd ($xc_,$xc_,@x[$d3])",
- "&vpxor (@x[$b3],$xc_,@x[$b3])",
- "&vpslld ($t1,@x[$b3],12)",
- "&vpsrld (@x[$b3],@x[$b3],20)",
- "&vpor (@x[$b3],$t1,@x[$b3])",
-
- "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
- "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
- "&vpshufb (@x[$d2],@x[$d2],$t0)",
- "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
- "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
- "&vpshufb (@x[$d3],@x[$d3],$t0)",
-
- "&vpaddd ($xc,$xc,@x[$d2])",
- "&vpxor (@x[$b2],$xc,@x[$b2])",
- "&vpslld ($t1,@x[$b2],7)",
- "&vpsrld (@x[$b2],@x[$b2],25)",
- "&vpor (@x[$b2],$t1,@x[$b2])",
- "&vbroadcasti128($t1,'(%r9)')", # .Lrot16(%rip); leaves $t1 holding this mask for the next call's first vpshufb
- "&vpaddd ($xc_,$xc_,@x[$d3])",
- "&vpxor (@x[$b3],$xc_,@x[$b3])",
- "&vpslld ($t0,@x[$b3],7)",
- "&vpsrld (@x[$b3],@x[$b3],25)",
- "&vpor (@x[$b3],$t0,@x[$b3])"
- );
-}
-
-my $xframe = $win64 ? 0xa8 : 8; # extra stack bytes: on win64 this leaves room for the ten xmm6-15 saves at -0xb0..-0x20(%r10)
-
-&declare_function("chacha20_avx2", 32, 5); # NOTE(review): 32 and 5 look like alignment and argument count (cf. ".align 32" / "\@function,5" on chacha20_16x) - confirm against declare_function
-$code.=<<___;
-.cfi_startproc
-.Lchacha20_8x:
- lea 8(%rsp),%r10 # frame register
-.cfi_def_cfa_register %r10
- sub \$0x280+$xframe,%rsp
- and \$-32,%rsp
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,-0xb0(%r10)
- movaps %xmm7,-0xa0(%r10)
- movaps %xmm8,-0x90(%r10)
- movaps %xmm9,-0x80(%r10)
- movaps %xmm10,-0x70(%r10)
- movaps %xmm11,-0x60(%r10)
- movaps %xmm12,-0x50(%r10)
- movaps %xmm13,-0x40(%r10)
- movaps %xmm14,-0x30(%r10)
- movaps %xmm15,-0x20(%r10)
-.L8x_body:
-___
-$code.=<<___;
- vzeroupper
-
- ################ stack layout
- # +0x00 SIMD equivalent of @x[8-12]
- # ...
- # +0x80 constant copy of key[0-2] smashed by lanes
- # ...
- # +0x200 SIMD counters (with nonce smashed by lanes)
- # ...
- # +0x280
-
- vbroadcasti128 .Lsigma(%rip),$xa3 # key[0]
- vbroadcasti128 ($key),$xb3 # key[1]
- vbroadcasti128 16($key),$xt3 # key[2]
- vbroadcasti128 ($counter),$xd3 # key[3]
- lea 0x100(%rsp),%rcx # size optimization
- lea 0x200(%rsp),%rax # size optimization
- lea .Lrot16(%rip),%r9
- lea .Lrot24(%rip),%r11
-
- vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
- vpshufd \$0x55,$xa3,$xa1
- vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload
- vpshufd \$0xaa,$xa3,$xa2
- vmovdqa $xa1,0xa0-0x100(%rcx)
- vpshufd \$0xff,$xa3,$xa3
- vmovdqa $xa2,0xc0-0x100(%rcx)
- vmovdqa $xa3,0xe0-0x100(%rcx)
-
- vpshufd \$0x00,$xb3,$xb0
- vpshufd \$0x55,$xb3,$xb1
- vmovdqa $xb0,0x100-0x100(%rcx)
- vpshufd \$0xaa,$xb3,$xb2
- vmovdqa $xb1,0x120-0x100(%rcx)
- vpshufd \$0xff,$xb3,$xb3
- vmovdqa $xb2,0x140-0x100(%rcx)
- vmovdqa $xb3,0x160-0x100(%rcx)
-
- vpshufd \$0x00,$xt3,$xt0 # "xc0"
- vpshufd \$0x55,$xt3,$xt1 # "xc1"
- vmovdqa $xt0,0x180-0x200(%rax)
- vpshufd \$0xaa,$xt3,$xt2 # "xc2"
- vmovdqa $xt1,0x1a0-0x200(%rax)
- vpshufd \$0xff,$xt3,$xt3 # "xc3"
- vmovdqa $xt2,0x1c0-0x200(%rax)
- vmovdqa $xt3,0x1e0-0x200(%rax)
-
- vpshufd \$0x00,$xd3,$xd0
- vpshufd \$0x55,$xd3,$xd1
- vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
- vpshufd \$0xaa,$xd3,$xd2
- vmovdqa $xd1,0x220-0x200(%rax)
- vpshufd \$0xff,$xd3,$xd3
- vmovdqa $xd2,0x240-0x200(%rax)
- vmovdqa $xd3,0x260-0x200(%rax)
-
- jmp .Loop_enter8x
-
-.align 32
-.Loop_outer8x:
- vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key
- vmovdqa 0xa0-0x100(%rcx),$xa1
- vmovdqa 0xc0-0x100(%rcx),$xa2
- vmovdqa 0xe0-0x100(%rcx),$xa3
- vmovdqa 0x100-0x100(%rcx),$xb0
- vmovdqa 0x120-0x100(%rcx),$xb1
- vmovdqa 0x140-0x100(%rcx),$xb2
- vmovdqa 0x160-0x100(%rcx),$xb3
- vmovdqa 0x180-0x200(%rax),$xt0 # "xc0"
- vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1"
- vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2"
- vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3"
- vmovdqa 0x200-0x200(%rax),$xd0
- vmovdqa 0x220-0x200(%rax),$xd1
- vmovdqa 0x240-0x200(%rax),$xd2
- vmovdqa 0x260-0x200(%rax),$xd3
- vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters
-
-.Loop_enter8x:
- vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]"
- vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]"
- vbroadcasti128 (%r9),$xt3
- vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters
- mov \$10,%eax
- jmp .Loop8x
-
-.align 32
-.Loop8x:
-___
- foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; } # even round: quarter-rounds start from indices 0,4,8,12; each returned string is eval'ed in turn
- foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; } # odd round: quarter-rounds start from indices 0,5,10,15
-$code.=<<___;
- dec %eax
- jnz .Loop8x
-
- lea 0x200(%rsp),%rax # size optimization
- vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key
- vpaddd 0xa0-0x100(%rcx),$xa1,$xa1
- vpaddd 0xc0-0x100(%rcx),$xa2,$xa2
- vpaddd 0xe0-0x100(%rcx),$xa3,$xa3
-
- vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
- vpunpckldq $xa3,$xa2,$xt3
- vpunpckhdq $xa1,$xa0,$xa0
- vpunpckhdq $xa3,$xa2,$xa2
- vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
- vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
- vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
-___
- ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); # rename only, no code emitted: "a0".."a3" were written to $xa1,$xt2,$xa3,$xa0 above; the old $xa2 becomes the new scratch $xt2
-$code.=<<___;
- vpaddd 0x100-0x100(%rcx),$xb0,$xb0
- vpaddd 0x120-0x100(%rcx),$xb1,$xb1
- vpaddd 0x140-0x100(%rcx),$xb2,$xb2
- vpaddd 0x160-0x100(%rcx),$xb3,$xb3
-
- vpunpckldq $xb1,$xb0,$xt2
- vpunpckldq $xb3,$xb2,$xt3
- vpunpckhdq $xb1,$xb0,$xb0
- vpunpckhdq $xb3,$xb2,$xb2
- vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
- vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
- vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
-___
- ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); # rename only: "b0".."b3" were written to $xb1,$xt2,$xb3,$xb0 above; the old $xb2 becomes scratch
-$code.=<<___;
- vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further
- vperm2i128 \$0x31,$xb0,$xa0,$xb0
- vperm2i128 \$0x20,$xb1,$xa1,$xa0
- vperm2i128 \$0x31,$xb1,$xa1,$xb1
- vperm2i128 \$0x20,$xb2,$xa2,$xa1
- vperm2i128 \$0x31,$xb2,$xa2,$xb2
- vperm2i128 \$0x20,$xb3,$xa3,$xa2
- vperm2i128 \$0x31,$xb3,$xa3,$xb3
-___
- ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); # rename only: the \$0x20 results of the vperm2i128 pass went to $xt3,$xa0,$xa1,$xa2; the old $xa3 becomes scratch
- my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); # names for the 'c' words: two are in $xt0/$xt1, the other two get loaded from the stack into the registers now called $xa0/$xa1 (which are offloaded first)
-$code.=<<___;
- vmovdqa $xa0,0x00(%rsp) # offload $xaN
- vmovdqa $xa1,0x20(%rsp)
- vmovdqa 0x40(%rsp),$xc2 # $xa0
- vmovdqa 0x60(%rsp),$xc3 # $xa1
-
- vpaddd 0x180-0x200(%rax),$xc0,$xc0
- vpaddd 0x1a0-0x200(%rax),$xc1,$xc1
- vpaddd 0x1c0-0x200(%rax),$xc2,$xc2
- vpaddd 0x1e0-0x200(%rax),$xc3,$xc3
-
- vpunpckldq $xc1,$xc0,$xt2
- vpunpckldq $xc3,$xc2,$xt3
- vpunpckhdq $xc1,$xc0,$xc0
- vpunpckhdq $xc3,$xc2,$xc2
- vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
- vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
- vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
-___
- ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); # rename only: "c0".."c3" were written to $xc1,$xt2,$xc3,$xc0 above; the old $xc2 becomes scratch
-$code.=<<___;
- vpaddd 0x200-0x200(%rax),$xd0,$xd0
- vpaddd 0x220-0x200(%rax),$xd1,$xd1
- vpaddd 0x240-0x200(%rax),$xd2,$xd2
- vpaddd 0x260-0x200(%rax),$xd3,$xd3
-
- vpunpckldq $xd1,$xd0,$xt2
- vpunpckldq $xd3,$xd2,$xt3
- vpunpckhdq $xd1,$xd0,$xd0
- vpunpckhdq $xd3,$xd2,$xd2
- vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
- vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
- vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
-___
- ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); # rename only: "d0".."d3" were written to $xd1,$xt2,$xd3,$xd0 above; the old $xd2 becomes scratch
-$code.=<<___;
- vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
- vperm2i128 \$0x31,$xd0,$xc0,$xd0
- vperm2i128 \$0x20,$xd1,$xc1,$xc0
- vperm2i128 \$0x31,$xd1,$xc1,$xd1
- vperm2i128 \$0x20,$xd2,$xc2,$xc1
- vperm2i128 \$0x31,$xd2,$xc2,$xd2
- vperm2i128 \$0x20,$xd3,$xc3,$xc2
- vperm2i128 \$0x31,$xd3,$xc3,$xd3
-___
- ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); # rename only: the \$0x20 results of the second vperm2i128 pass went to $xt3,$xc0,$xc1,$xc2
- ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= # exchange the b* and c* names; NOTE(review): presumably so the a,b,c,d order of the
- ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); # XOR/store sequence that follows matches the byte order of the output - confirm
- ($xa0,$xa1)=($xt2,$xt3); # the two offloaded 'a' registers are reloaded from the stack into the current scratch registers
-$code.=<<___;
- vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember?
- vmovdqa 0x20(%rsp),$xa1
-
- cmp \$64*8,$len
- jb .Ltail8x
-
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- lea 0x80($inp),$inp # size optimization
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- lea 0x80($out),$out # size optimization
-
- vpxor 0x00($inp),$xa1,$xa1
- vpxor 0x20($inp),$xb1,$xb1
- vpxor 0x40($inp),$xc1,$xc1
- vpxor 0x60($inp),$xd1,$xd1
- lea 0x80($inp),$inp # size optimization
- vmovdqu $xa1,0x00($out)
- vmovdqu $xb1,0x20($out)
- vmovdqu $xc1,0x40($out)
- vmovdqu $xd1,0x60($out)
- lea 0x80($out),$out # size optimization
-
- vpxor 0x00($inp),$xa2,$xa2
- vpxor 0x20($inp),$xb2,$xb2
- vpxor 0x40($inp),$xc2,$xc2
- vpxor 0x60($inp),$xd2,$xd2
- lea 0x80($inp),$inp # size optimization
- vmovdqu $xa2,0x00($out)
- vmovdqu $xb2,0x20($out)
- vmovdqu $xc2,0x40($out)
- vmovdqu $xd2,0x60($out)
- lea 0x80($out),$out # size optimization
-
- vpxor 0x00($inp),$xa3,$xa3
- vpxor 0x20($inp),$xb3,$xb3
- vpxor 0x40($inp),$xc3,$xc3
- vpxor 0x60($inp),$xd3,$xd3
- lea 0x80($inp),$inp # size optimization
- vmovdqu $xa3,0x00($out)
- vmovdqu $xb3,0x20($out)
- vmovdqu $xc3,0x40($out)
- vmovdqu $xd3,0x60($out)
- lea 0x80($out),$out # size optimization
-
- sub \$64*8,$len
- jnz .Loop_outer8x
-
- jmp .Ldone8x
-
-.Ltail8x:
- cmp \$448,$len
- jae .L448_or_more8x
- cmp \$384,$len
- jae .L384_or_more8x
- cmp \$320,$len
- jae .L320_or_more8x
- cmp \$256,$len
- jae .L256_or_more8x
- cmp \$192,$len
- jae .L192_or_more8x
- cmp \$128,$len
- jae .L128_or_more8x
- cmp \$64,$len
- jae .L64_or_more8x
-
- xor %r9,%r9
- vmovdqa $xa0,0x00(%rsp)
- vmovdqa $xb0,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L64_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- je .Ldone8x
-
- lea 0x40($inp),$inp # inp+=64*1
- xor %r9,%r9
- vmovdqa $xc0,0x00(%rsp)
- lea 0x40($out),$out # out+=64*1
- sub \$64,$len # len-=64*1
- vmovdqa $xd0,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L128_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- je .Ldone8x
-
- lea 0x80($inp),$inp # inp+=64*2
- xor %r9,%r9
- vmovdqa $xa1,0x00(%rsp)
- lea 0x80($out),$out # out+=64*2
- sub \$128,$len # len-=64*2
- vmovdqa $xb1,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L192_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- vpxor 0x80($inp),$xa1,$xa1
- vpxor 0xa0($inp),$xb1,$xb1
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- vmovdqu $xa1,0x80($out)
- vmovdqu $xb1,0xa0($out)
- je .Ldone8x
-
- lea 0xc0($inp),$inp # inp+=64*3
- xor %r9,%r9
- vmovdqa $xc1,0x00(%rsp)
- lea 0xc0($out),$out # out+=64*3
- sub \$192,$len # len-=64*3
- vmovdqa $xd1,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L256_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- vpxor 0x80($inp),$xa1,$xa1
- vpxor 0xa0($inp),$xb1,$xb1
- vpxor 0xc0($inp),$xc1,$xc1
- vpxor 0xe0($inp),$xd1,$xd1
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- vmovdqu $xa1,0x80($out)
- vmovdqu $xb1,0xa0($out)
- vmovdqu $xc1,0xc0($out)
- vmovdqu $xd1,0xe0($out)
- je .Ldone8x
-
- lea 0x100($inp),$inp # inp+=64*4
- xor %r9,%r9
- vmovdqa $xa2,0x00(%rsp)
- lea 0x100($out),$out # out+=64*4
- sub \$256,$len # len-=64*4
- vmovdqa $xb2,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L320_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- vpxor 0x80($inp),$xa1,$xa1
- vpxor 0xa0($inp),$xb1,$xb1
- vpxor 0xc0($inp),$xc1,$xc1
- vpxor 0xe0($inp),$xd1,$xd1
- vpxor 0x100($inp),$xa2,$xa2
- vpxor 0x120($inp),$xb2,$xb2
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- vmovdqu $xa1,0x80($out)
- vmovdqu $xb1,0xa0($out)
- vmovdqu $xc1,0xc0($out)
- vmovdqu $xd1,0xe0($out)
- vmovdqu $xa2,0x100($out)
- vmovdqu $xb2,0x120($out)
- je .Ldone8x
-
- lea 0x140($inp),$inp # inp+=64*5
- xor %r9,%r9
- vmovdqa $xc2,0x00(%rsp)
- lea 0x140($out),$out # out+=64*5
- sub \$320,$len # len-=64*5
- vmovdqa $xd2,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L384_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- vpxor 0x80($inp),$xa1,$xa1
- vpxor 0xa0($inp),$xb1,$xb1
- vpxor 0xc0($inp),$xc1,$xc1
- vpxor 0xe0($inp),$xd1,$xd1
- vpxor 0x100($inp),$xa2,$xa2
- vpxor 0x120($inp),$xb2,$xb2
- vpxor 0x140($inp),$xc2,$xc2
- vpxor 0x160($inp),$xd2,$xd2
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- vmovdqu $xa1,0x80($out)
- vmovdqu $xb1,0xa0($out)
- vmovdqu $xc1,0xc0($out)
- vmovdqu $xd1,0xe0($out)
- vmovdqu $xa2,0x100($out)
- vmovdqu $xb2,0x120($out)
- vmovdqu $xc2,0x140($out)
- vmovdqu $xd2,0x160($out)
- je .Ldone8x
-
- lea 0x180($inp),$inp # inp+=64*6
- xor %r9,%r9
- vmovdqa $xa3,0x00(%rsp)
- lea 0x180($out),$out # out+=64*6
- sub \$384,$len # len-=64*6
- vmovdqa $xb3,0x20(%rsp)
- jmp .Loop_tail8x
-
-.align 32
-.L448_or_more8x:
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- vpxor 0x80($inp),$xa1,$xa1
- vpxor 0xa0($inp),$xb1,$xb1
- vpxor 0xc0($inp),$xc1,$xc1
- vpxor 0xe0($inp),$xd1,$xd1
- vpxor 0x100($inp),$xa2,$xa2
- vpxor 0x120($inp),$xb2,$xb2
- vpxor 0x140($inp),$xc2,$xc2
- vpxor 0x160($inp),$xd2,$xd2
- vpxor 0x180($inp),$xa3,$xa3
- vpxor 0x1a0($inp),$xb3,$xb3
- vmovdqu $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- vmovdqu $xa1,0x80($out)
- vmovdqu $xb1,0xa0($out)
- vmovdqu $xc1,0xc0($out)
- vmovdqu $xd1,0xe0($out)
- vmovdqu $xa2,0x100($out)
- vmovdqu $xb2,0x120($out)
- vmovdqu $xc2,0x140($out)
- vmovdqu $xd2,0x160($out)
- vmovdqu $xa3,0x180($out)
- vmovdqu $xb3,0x1a0($out)
- je .Ldone8x
-
- lea 0x1c0($inp),$inp # inp+=64*7
- xor %r9,%r9
- vmovdqa $xc3,0x00(%rsp)
- lea 0x1c0($out),$out # out+=64*7
- sub \$448,$len # len-=64*7
- vmovdqa $xd3,0x20(%rsp)
-
-.Loop_tail8x:
- movzb ($inp,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1($out,%r9)
- dec $len
- jnz .Loop_tail8x
-
-.Ldone8x:
- vzeroall
-___
-$code.=<<___ if ($win64);
- movaps -0xb0(%r10),%xmm6
- movaps -0xa0(%r10),%xmm7
- movaps -0x90(%r10),%xmm8
- movaps -0x80(%r10),%xmm9
- movaps -0x70(%r10),%xmm10
- movaps -0x60(%r10),%xmm11
- movaps -0x50(%r10),%xmm12
- movaps -0x40(%r10),%xmm13
- movaps -0x30(%r10),%xmm14
- movaps -0x20(%r10),%xmm15
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.L8x_epilogue:
- ret
-.cfi_endproc
-___
-&end_function("chacha20_avx2");
-if($kernel) { # kernel builds: close the "#ifdef CONFIG_AS_AVX2" emitted at the start of the AVX2 code path
- $code .= "#endif\n";
-}
-}
-
-########################################################################
-# AVX512 code paths
-if ($avx>2) {
-# This one handles shorter inputs...
-if($kernel) { # kernel builds wrap the AVX512 code in a preprocessor guard
- $code .= "#ifdef CONFIG_AS_AVX512\n";
-}
-
-my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20)); # working rows in %zmm0-3; %zmm16-19 keep copies of the initial rows and %zmm20 the counter increment
-my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); # 128-bit temporaries used when XORing with input and storing output
-
- sub vpxord() # size optimization: emit the plain vpxor mnemonic unless one of the operands forces vpxord
-{ my $opcode = "vpxor"; # adhere to vpxor when possible
-
- foreach (@_) {
- if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) { # any %zmm operand, or a %ymm operand numbered 16 or above (%xmm names are not matched)
- $opcode = "vpxord";
- last; # one such operand is enough to decide
- }
- }
-
- $code .= "\t$opcode\t".join(',',reverse @_)."\n"; # append the instruction with the argument order reversed, so the first argument ends up last
-}
-
-sub AVX512ROUND { # emits one add/xor/rotate round over the rows $a,$b,$c,$d; critical path is 14 "SIMD ticks" per round
- &vpaddd ($a,$a,$b); # a += b
- &vpxord ($d,$d,$a); # d ^= a
- &vprold ($d,$d,16); # rotate d left by 16
-
- &vpaddd ($c,$c,$d); # c += d
- &vpxord ($b,$b,$c); # b ^= c
- &vprold ($b,$b,12); # rotate b left by 12
-
- &vpaddd ($a,$a,$b); # a += b
- &vpxord ($d,$d,$a); # d ^= a
- &vprold ($d,$d,8); # rotate d left by 8
-
- &vpaddd ($c,$c,$d); # c += d
- &vpxord ($b,$b,$c); # b ^= c
- &vprold ($b,$b,7); # rotate b left by 7
-}
-
-my $xframe = $win64 ? 32+8 : 8; # extra stack bytes: on win64 the additional 32 hold the xmm6/xmm7 saves at -0x30/-0x20(%r10)
-
-&declare_function("chacha20_avx512", 32, 5); # NOTE(review): 32 and 5 look like alignment and argument count - confirm against declare_function
-$code.=<<___;
-.cfi_startproc
-.Lchacha20_avx512:
- lea 8(%rsp),%r10 # frame pointer
-.cfi_def_cfa_register %r10
- cmp \$512,$len
- ja .Lchacha20_16x
-
- sub \$64+$xframe,%rsp
- and \$-64,%rsp
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,-0x30(%r10)
- movaps %xmm7,-0x20(%r10)
-.Lavx512_body:
-___
-$code.=<<___;
- vbroadcasti32x4 .Lsigma(%rip),$a
- vbroadcasti32x4 ($key),$b
- vbroadcasti32x4 16($key),$c
- vbroadcasti32x4 ($counter),$d
-
- vmovdqa32 $a,$a_
- vmovdqa32 $b,$b_
- vmovdqa32 $c,$c_
- vpaddd .Lzeroz(%rip),$d,$d
- vmovdqa32 .Lfourz(%rip),$fourz
- mov \$10,$counter # reuse $counter
- vmovdqa32 $d,$d_
- jmp .Loop_avx512
-
-.align 16
-.Loop_outer_avx512:
- vmovdqa32 $a_,$a
- vmovdqa32 $b_,$b
- vmovdqa32 $c_,$c
- vpaddd $fourz,$d_,$d
- mov \$10,$counter
- vmovdqa32 $d,$d_
- jmp .Loop_avx512
-
-.align 32
-.Loop_avx512:
-___
- &AVX512ROUND(); # first round of the pair, on the rows as they stand
- &vpshufd ($c,$c,0b01001110); # re-align rows for the second round: rotate c's dwords by two positions within each 128-bit lane
- &vpshufd ($b,$b,0b00111001); # dword i of b takes dword i+1 (mod 4)
- &vpshufd ($d,$d,0b10010011); # dword i of d takes dword i-1 (mod 4)
-
- &AVX512ROUND(); # second round of the pair, on the re-aligned rows
- &vpshufd ($c,$c,0b01001110); # undo the re-alignment: c rotated by two again
- &vpshufd ($b,$b,0b10010011); # b gets the shuffle d had above (the inverse of its own)
- &vpshufd ($d,$d,0b00111001); # d gets the shuffle b had above (the inverse of its own)
-
- &dec ($counter); # $counter was set to 10 before the loop, so ten double rounds run
- &jnz (".Loop_avx512");
-
-$code.=<<___;
- vpaddd $a_,$a,$a
- vpaddd $b_,$b,$b
- vpaddd $c_,$c,$c
- vpaddd $d_,$d,$d
-
- sub \$64,$len
- jb .Ltail64_avx512
-
- vpxor 0x00($inp),%x#$a,$t0 # xor with input
- vpxor 0x10($inp),%x#$b,$t1
- vpxor 0x20($inp),%x#$c,$t2
- vpxor 0x30($inp),%x#$d,$t3
- lea 0x40($inp),$inp # inp+=64
-
- vmovdqu $t0,0x00($out) # write output
- vmovdqu $t1,0x10($out)
- vmovdqu $t2,0x20($out)
- vmovdqu $t3,0x30($out)
- lea 0x40($out),$out # out+=64
-
- jz .Ldone_avx512
-
- vextracti32x4 \$1,$a,$t0
- vextracti32x4 \$1,$b,$t1
- vextracti32x4 \$1,$c,$t2
- vextracti32x4 \$1,$d,$t3
-
- sub \$64,$len
- jb .Ltail_avx512
-
- vpxor 0x00($inp),$t0,$t0 # xor with input
- vpxor 0x10($inp),$t1,$t1
- vpxor 0x20($inp),$t2,$t2
- vpxor 0x30($inp),$t3,$t3
- lea 0x40($inp),$inp # inp+=64
-
- vmovdqu $t0,0x00($out) # write output
- vmovdqu $t1,0x10($out)
- vmovdqu $t2,0x20($out)
- vmovdqu $t3,0x30($out)
- lea 0x40($out),$out # out+=64
-
- jz .Ldone_avx512
-
- vextracti32x4 \$2,$a,$t0
- vextracti32x4 \$2,$b,$t1
- vextracti32x4 \$2,$c,$t2
- vextracti32x4 \$2,$d,$t3
-
- sub \$64,$len
- jb .Ltail_avx512
-
- vpxor 0x00($inp),$t0,$t0 # xor with input
- vpxor 0x10($inp),$t1,$t1
- vpxor 0x20($inp),$t2,$t2
- vpxor 0x30($inp),$t3,$t3
- lea 0x40($inp),$inp # inp+=64
-
- vmovdqu $t0,0x00($out) # write output
- vmovdqu $t1,0x10($out)
- vmovdqu $t2,0x20($out)
- vmovdqu $t3,0x30($out)
- lea 0x40($out),$out # out+=64
-
- jz .Ldone_avx512
-
- vextracti32x4 \$3,$a,$t0
- vextracti32x4 \$3,$b,$t1
- vextracti32x4 \$3,$c,$t2
- vextracti32x4 \$3,$d,$t3
-
- sub \$64,$len
- jb .Ltail_avx512
-
- vpxor 0x00($inp),$t0,$t0 # xor with input
- vpxor 0x10($inp),$t1,$t1
- vpxor 0x20($inp),$t2,$t2
- vpxor 0x30($inp),$t3,$t3
- lea 0x40($inp),$inp # inp+=64
-
- vmovdqu $t0,0x00($out) # write output
- vmovdqu $t1,0x10($out)
- vmovdqu $t2,0x20($out)
- vmovdqu $t3,0x30($out)
- lea 0x40($out),$out # out+=64
-
- jnz .Loop_outer_avx512
-
- jmp .Ldone_avx512
-
-.align 16
-.Ltail64_avx512:
- vmovdqa %x#$a,0x00(%rsp)
- vmovdqa %x#$b,0x10(%rsp)
- vmovdqa %x#$c,0x20(%rsp)
- vmovdqa %x#$d,0x30(%rsp)
- add \$64,$len
- jmp .Loop_tail_avx512
-
-.align 16
-.Ltail_avx512:
- vmovdqa $t0,0x00(%rsp)
- vmovdqa $t1,0x10(%rsp)
- vmovdqa $t2,0x20(%rsp)
- vmovdqa $t3,0x30(%rsp)
- add \$64,$len
-
-.Loop_tail_avx512:
- movzb ($inp,$counter),%eax
- movzb (%rsp,$counter),%ecx
- lea 1($counter),$counter
- xor %ecx,%eax
- mov %al,-1($out,$counter)
- dec $len
- jnz .Loop_tail_avx512
-
- vmovdqu32 $a_,0x00(%rsp)
-
-.Ldone_avx512:
- vzeroall
-___
-$code.=<<___ if ($win64);
- movaps -0x30(%r10),%xmm6
- movaps -0x20(%r10),%xmm7
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.Lavx512_epilogue:
- ret
-.cfi_endproc
-___
-&end_function("chacha20_avx512");
-
-map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz); # rewrite the register variables in place from %zmmN to %ymmN so the code below emits the 256-bit variant
-
-&declare_function("chacha20_avx512vl", 32, 5); # NOTE(review): 32 and 5 look like alignment and argument count - confirm against declare_function
-$code.=<<___;
-.cfi_startproc
-.Lchacha20_avx512vl:
- lea 8(%rsp),%r10 # frame pointer
-.cfi_def_cfa_register %r10
- cmp \$128,$len
- ja .Lchacha20_8xvl
-
- sub \$64+$xframe,%rsp
- and \$-32,%rsp
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,-0x30(%r10)
- movaps %xmm7,-0x20(%r10)
-.Lavx512vl_body:
-___
-$code.=<<___;
- vbroadcasti128 .Lsigma(%rip),$a
- vbroadcasti128 ($key),$b
- vbroadcasti128 16($key),$c
- vbroadcasti128 ($counter),$d
-
- vmovdqa32 $a,$a_
- vmovdqa32 $b,$b_
- vmovdqa32 $c,$c_
- vpaddd .Lzeroz(%rip),$d,$d
- vmovdqa32 .Ltwoy(%rip),$fourz
- mov \$10,$counter # reuse $counter
- vmovdqa32 $d,$d_
- jmp .Loop_avx512vl
-
-.align 16
-.Loop_outer_avx512vl:
- vmovdqa32 $c_,$c
- vpaddd $fourz,$d_,$d
- mov \$10,$counter
- vmovdqa32 $d,$d_
- jmp .Loop_avx512vl
-
-.align 32
-.Loop_avx512vl:
-___
- &AVX512ROUND(); # first round of the pair; $a..$d now name %ymm registers
- &vpshufd ($c,$c,0b01001110); # re-align rows for the second round: rotate c's dwords by two positions within each 128-bit lane
- &vpshufd ($b,$b,0b00111001); # dword i of b takes dword i+1 (mod 4)
- &vpshufd ($d,$d,0b10010011); # dword i of d takes dword i-1 (mod 4)
-
- &AVX512ROUND(); # second round of the pair, on the re-aligned rows
- &vpshufd ($c,$c,0b01001110); # undo the re-alignment: c rotated by two again
- &vpshufd ($b,$b,0b10010011); # b gets the shuffle d had above (the inverse of its own)
- &vpshufd ($d,$d,0b00111001); # d gets the shuffle b had above (the inverse of its own)
-
- &dec ($counter); # $counter was set to 10 before the loop, so ten double rounds run
- &jnz (".Loop_avx512vl");
-
-$code.=<<___;
- vpaddd $a_,$a,$a
- vpaddd $b_,$b,$b
- vpaddd $c_,$c,$c
- vpaddd $d_,$d,$d
-
- sub \$64,$len
- jb .Ltail64_avx512vl
-
- vpxor 0x00($inp),%x#$a,$t0 # xor with input
- vpxor 0x10($inp),%x#$b,$t1
- vpxor 0x20($inp),%x#$c,$t2
- vpxor 0x30($inp),%x#$d,$t3
- lea 0x40($inp),$inp # inp+=64
-
- vmovdqu $t0,0x00($out) # write output
- vmovdqu $t1,0x10($out)
- vmovdqu $t2,0x20($out)
- vmovdqu $t3,0x30($out)
- lea 0x40($out),$out # out+=64
-
- jz .Ldone_avx512vl
-
- vextracti128 \$1,$a,$t0
- vextracti128 \$1,$b,$t1
- vextracti128 \$1,$c,$t2
- vextracti128 \$1,$d,$t3
-
- sub \$64,$len
- jb .Ltail_avx512vl
-
- vpxor 0x00($inp),$t0,$t0 # xor with input
- vpxor 0x10($inp),$t1,$t1
- vpxor 0x20($inp),$t2,$t2
- vpxor 0x30($inp),$t3,$t3
- lea 0x40($inp),$inp # inp+=64
-
- vmovdqu $t0,0x00($out) # write output
- vmovdqu $t1,0x10($out)
- vmovdqu $t2,0x20($out)
- vmovdqu $t3,0x30($out)
- lea 0x40($out),$out # out+=64
-
- vmovdqa32 $a_,$a
- vmovdqa32 $b_,$b
- jnz .Loop_outer_avx512vl
-
- jmp .Ldone_avx512vl
-
-.align 16
-.Ltail64_avx512vl:
- vmovdqa %x#$a,0x00(%rsp)
- vmovdqa %x#$b,0x10(%rsp)
- vmovdqa %x#$c,0x20(%rsp)
- vmovdqa %x#$d,0x30(%rsp)
- add \$64,$len
- jmp .Loop_tail_avx512vl
-
-.align 16
-.Ltail_avx512vl:
- vmovdqa $t0,0x00(%rsp)
- vmovdqa $t1,0x10(%rsp)
- vmovdqa $t2,0x20(%rsp)
- vmovdqa $t3,0x30(%rsp)
- add \$64,$len
-
-.Loop_tail_avx512vl:
- movzb ($inp,$counter),%eax
- movzb (%rsp,$counter),%ecx
- lea 1($counter),$counter
- xor %ecx,%eax
- mov %al,-1($out,$counter)
- dec $len
- jnz .Loop_tail_avx512vl
-
- vmovdqu32 $a_,0x00(%rsp)
- vmovdqu32 $a_,0x20(%rsp)
-
-.Ldone_avx512vl:
- vzeroall
-___
-$code.=<<___ if ($win64);
- movaps -0x30(%r10),%xmm6
- movaps -0x20(%r10),%xmm7
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.Lavx512vl_epilogue:
- ret
-.cfi_endproc
-___
-&end_function("chacha20_avx512vl");
-
-# This one handles longer inputs...
-
-my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, # all 16 state words get a register here:
- $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15)); # a's %zmm0-3, b's %zmm4-7, c's %zmm8-11, d's %zmm12-15
-my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, # register name per state-word index;
- $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); # unlike the AVX2 table there are no placeholders, 'c's included
-my @key=map("%zmm$_",(16..31)); # %zmm16-31: the code below stores the lane-smashed key/counter words here
-my ($xt0,$xt1,$xt2,$xt3)=@key[0..3]; # scratch names alias the first four @key registers (%zmm16-19)
-
-sub AVX512_lane_ROUND { # returns a list of perlasm statements (as strings) for four quarter-rounds, interleaved step by step
-my ($a0,$b0,$c0,$d0)=@_; # state-word indices used by quarter-round Q1
-my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); # Q2: stay in the same group of four ($_&~3), advance the position by one modulo 4
-my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); # Q3: one more step from Q2
-my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); # Q4: one more step from Q3
-my @x=map("\"$_\"",@xx); # quoted register name per state-word index
-
- (
- "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1: a += b
- "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
- "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
- "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
- "&vpxord (@x[$d0],@x[$d0],@x[$a0])", # d ^= a, for Q1..Q4 in turn
- "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
- "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
- "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
- "&vprold (@x[$d0],@x[$d0],16)", # rotate d left by 16, for Q1..Q4 in turn
- "&vprold (@x[$d1],@x[$d1],16)",
- "&vprold (@x[$d2],@x[$d2],16)",
- "&vprold (@x[$d3],@x[$d3],16)",
-
- "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", # c += d
- "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
- "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
- "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
- "&vpxord (@x[$b0],@x[$b0],@x[$c0])", # b ^= c
- "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
- "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
- "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
- "&vprold (@x[$b0],@x[$b0],12)", # rotate b left by 12
- "&vprold (@x[$b1],@x[$b1],12)",
- "&vprold (@x[$b2],@x[$b2],12)",
- "&vprold (@x[$b3],@x[$b3],12)",
-
- "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # a += b
- "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
- "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
- "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
- "&vpxord (@x[$d0],@x[$d0],@x[$a0])", # d ^= a
- "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
- "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
- "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
- "&vprold (@x[$d0],@x[$d0],8)", # rotate d left by 8
- "&vprold (@x[$d1],@x[$d1],8)",
- "&vprold (@x[$d2],@x[$d2],8)",
- "&vprold (@x[$d3],@x[$d3],8)",
-
- "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", # c += d
- "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
- "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
- "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
- "&vpxord (@x[$b0],@x[$b0],@x[$c0])", # b ^= c
- "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
- "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
- "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
- "&vprold (@x[$b0],@x[$b0],7)", # rotate b left by 7
- "&vprold (@x[$b1],@x[$b1],7)",
- "&vprold (@x[$b2],@x[$b2],7)",
- "&vprold (@x[$b3],@x[$b3],7)"
- );
-}
-
-my $xframe = $win64 ? 0xa8 : 8; # extra stack bytes: on win64 this leaves room for the ten xmm6-15 saves at -0xb0..-0x20(%r10)
-
-$code.=<<___;
-.type chacha20_16x,\@function,5
-.align 32
-chacha20_16x:
-.cfi_startproc
-.Lchacha20_16x:
- lea 8(%rsp),%r10 # frame register
-.cfi_def_cfa_register %r10
- sub \$64+$xframe,%rsp
- and \$-64,%rsp
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,-0xb0(%r10)
- movaps %xmm7,-0xa0(%r10)
- movaps %xmm8,-0x90(%r10)
- movaps %xmm9,-0x80(%r10)
- movaps %xmm10,-0x70(%r10)
- movaps %xmm11,-0x60(%r10)
- movaps %xmm12,-0x50(%r10)
- movaps %xmm13,-0x40(%r10)
- movaps %xmm14,-0x30(%r10)
- movaps %xmm15,-0x20(%r10)
-.L16x_body:
-___
-$code.=<<___;
- vzeroupper
-
- lea .Lsigma(%rip),%r9
- vbroadcasti32x4 (%r9),$xa3 # key[0]
- vbroadcasti32x4 ($key),$xb3 # key[1]
- vbroadcasti32x4 16($key),$xc3 # key[2]
- vbroadcasti32x4 ($counter),$xd3 # key[3]
-
- vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
- vpshufd \$0x55,$xa3,$xa1
- vpshufd \$0xaa,$xa3,$xa2
- vpshufd \$0xff,$xa3,$xa3
- vmovdqa64 $xa0,@key[0]
- vmovdqa64 $xa1,@key[1]
- vmovdqa64 $xa2,@key[2]
- vmovdqa64 $xa3,@key[3]
-
- vpshufd \$0x00,$xb3,$xb0
- vpshufd \$0x55,$xb3,$xb1
- vpshufd \$0xaa,$xb3,$xb2
- vpshufd \$0xff,$xb3,$xb3
- vmovdqa64 $xb0,@key[4]
- vmovdqa64 $xb1,@key[5]
- vmovdqa64 $xb2,@key[6]
- vmovdqa64 $xb3,@key[7]
-
- vpshufd \$0x00,$xc3,$xc0
- vpshufd \$0x55,$xc3,$xc1
- vpshufd \$0xaa,$xc3,$xc2
- vpshufd \$0xff,$xc3,$xc3
- vmovdqa64 $xc0,@key[8]
- vmovdqa64 $xc1,@key[9]
- vmovdqa64 $xc2,@key[10]
- vmovdqa64 $xc3,@key[11]
-
- vpshufd \$0x00,$xd3,$xd0
- vpshufd \$0x55,$xd3,$xd1
- vpshufd \$0xaa,$xd3,$xd2
- vpshufd \$0xff,$xd3,$xd3
- vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet
- vmovdqa64 $xd0,@key[12]
- vmovdqa64 $xd1,@key[13]
- vmovdqa64 $xd2,@key[14]
- vmovdqa64 $xd3,@key[15]
-
- mov \$10,%eax
- jmp .Loop16x
-
-.align 32
-.Loop_outer16x:
- vpbroadcastd 0(%r9),$xa0 # reload key
- vpbroadcastd 4(%r9),$xa1
- vpbroadcastd 8(%r9),$xa2
- vpbroadcastd 12(%r9),$xa3
- vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters
- vmovdqa64 @key[4],$xb0
- vmovdqa64 @key[5],$xb1
- vmovdqa64 @key[6],$xb2
- vmovdqa64 @key[7],$xb3
- vmovdqa64 @key[8],$xc0
- vmovdqa64 @key[9],$xc1
- vmovdqa64 @key[10],$xc2
- vmovdqa64 @key[11],$xc3
- vmovdqa64 @key[12],$xd0
- vmovdqa64 @key[13],$xd1
- vmovdqa64 @key[14],$xd2
- vmovdqa64 @key[15],$xd3
-
- vmovdqa64 $xa0,@key[0]
- vmovdqa64 $xa1,@key[1]
- vmovdqa64 $xa2,@key[2]
- vmovdqa64 $xa3,@key[3]
-
- mov \$10,%eax
- jmp .Loop16x
-
-.align 32
-.Loop16x:
-___
- foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
- foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- dec %eax
- jnz .Loop16x
-
- vpaddd @key[0],$xa0,$xa0 # accumulate key
- vpaddd @key[1],$xa1,$xa1
- vpaddd @key[2],$xa2,$xa2
- vpaddd @key[3],$xa3,$xa3
-
- vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
- vpunpckldq $xa3,$xa2,$xt3
- vpunpckhdq $xa1,$xa0,$xa0
- vpunpckhdq $xa3,$xa2,$xa2
- vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
- vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
- vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
-___
- ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
-$code.=<<___;
- vpaddd @key[4],$xb0,$xb0
- vpaddd @key[5],$xb1,$xb1
- vpaddd @key[6],$xb2,$xb2
- vpaddd @key[7],$xb3,$xb3
-
- vpunpckldq $xb1,$xb0,$xt2
- vpunpckldq $xb3,$xb2,$xt3
- vpunpckhdq $xb1,$xb0,$xb0
- vpunpckhdq $xb3,$xb2,$xb2
- vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
- vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
- vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
-___
- ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
-$code.=<<___;
- vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further
- vshufi32x4 \$0xee,$xb0,$xa0,$xb0
- vshufi32x4 \$0x44,$xb1,$xa1,$xa0
- vshufi32x4 \$0xee,$xb1,$xa1,$xb1
- vshufi32x4 \$0x44,$xb2,$xa2,$xa1
- vshufi32x4 \$0xee,$xb2,$xa2,$xb2
- vshufi32x4 \$0x44,$xb3,$xa3,$xa2
- vshufi32x4 \$0xee,$xb3,$xa3,$xb3
-___
- ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
-$code.=<<___;
- vpaddd @key[8],$xc0,$xc0
- vpaddd @key[9],$xc1,$xc1
- vpaddd @key[10],$xc2,$xc2
- vpaddd @key[11],$xc3,$xc3
-
- vpunpckldq $xc1,$xc0,$xt2
- vpunpckldq $xc3,$xc2,$xt3
- vpunpckhdq $xc1,$xc0,$xc0
- vpunpckhdq $xc3,$xc2,$xc2
- vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
- vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
- vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
-___
- ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
-$code.=<<___;
- vpaddd @key[12],$xd0,$xd0
- vpaddd @key[13],$xd1,$xd1
- vpaddd @key[14],$xd2,$xd2
- vpaddd @key[15],$xd3,$xd3
-
- vpunpckldq $xd1,$xd0,$xt2
- vpunpckldq $xd3,$xd2,$xt3
- vpunpckhdq $xd1,$xd0,$xd0
- vpunpckhdq $xd3,$xd2,$xd2
- vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
- vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
- vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
-___
- ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
-$code.=<<___;
- vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further
- vshufi32x4 \$0xee,$xd0,$xc0,$xd0
- vshufi32x4 \$0x44,$xd1,$xc1,$xc0
- vshufi32x4 \$0xee,$xd1,$xc1,$xd1
- vshufi32x4 \$0x44,$xd2,$xc2,$xc1
- vshufi32x4 \$0xee,$xd2,$xc2,$xd2
- vshufi32x4 \$0x44,$xd3,$xc3,$xc2
- vshufi32x4 \$0xee,$xd3,$xc3,$xd3
-___
- ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
-$code.=<<___;
- vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further
- vshufi32x4 \$0xdd,$xc0,$xa0,$xa0
- vshufi32x4 \$0x88,$xd0,$xb0,$xc0
- vshufi32x4 \$0xdd,$xd0,$xb0,$xd0
- vshufi32x4 \$0x88,$xc1,$xa1,$xt1
- vshufi32x4 \$0xdd,$xc1,$xa1,$xa1
- vshufi32x4 \$0x88,$xd1,$xb1,$xc1
- vshufi32x4 \$0xdd,$xd1,$xb1,$xd1
- vshufi32x4 \$0x88,$xc2,$xa2,$xt2
- vshufi32x4 \$0xdd,$xc2,$xa2,$xa2
- vshufi32x4 \$0x88,$xd2,$xb2,$xc2
- vshufi32x4 \$0xdd,$xd2,$xb2,$xd2
- vshufi32x4 \$0x88,$xc3,$xa3,$xt3
- vshufi32x4 \$0xdd,$xc3,$xa3,$xa3
- vshufi32x4 \$0x88,$xd3,$xb3,$xc3
- vshufi32x4 \$0xdd,$xd3,$xb3,$xd3
-___
- ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
- ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
-
- ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
- $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
- ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
-$code.=<<___;
- cmp \$64*16,$len
- jb .Ltail16x
-
- vpxord 0x00($inp),$xa0,$xa0 # xor with input
- vpxord 0x40($inp),$xb0,$xb0
- vpxord 0x80($inp),$xc0,$xc0
- vpxord 0xc0($inp),$xd0,$xd0
- vmovdqu32 $xa0,0x00($out)
- vmovdqu32 $xb0,0x40($out)
- vmovdqu32 $xc0,0x80($out)
- vmovdqu32 $xd0,0xc0($out)
-
- vpxord 0x100($inp),$xa1,$xa1
- vpxord 0x140($inp),$xb1,$xb1
- vpxord 0x180($inp),$xc1,$xc1
- vpxord 0x1c0($inp),$xd1,$xd1
- vmovdqu32 $xa1,0x100($out)
- vmovdqu32 $xb1,0x140($out)
- vmovdqu32 $xc1,0x180($out)
- vmovdqu32 $xd1,0x1c0($out)
-
- vpxord 0x200($inp),$xa2,$xa2
- vpxord 0x240($inp),$xb2,$xb2
- vpxord 0x280($inp),$xc2,$xc2
- vpxord 0x2c0($inp),$xd2,$xd2
- vmovdqu32 $xa2,0x200($out)
- vmovdqu32 $xb2,0x240($out)
- vmovdqu32 $xc2,0x280($out)
- vmovdqu32 $xd2,0x2c0($out)
-
- vpxord 0x300($inp),$xa3,$xa3
- vpxord 0x340($inp),$xb3,$xb3
- vpxord 0x380($inp),$xc3,$xc3
- vpxord 0x3c0($inp),$xd3,$xd3
- lea 0x400($inp),$inp
- vmovdqu32 $xa3,0x300($out)
- vmovdqu32 $xb3,0x340($out)
- vmovdqu32 $xc3,0x380($out)
- vmovdqu32 $xd3,0x3c0($out)
- lea 0x400($out),$out
-
- sub \$64*16,$len
- jnz .Loop_outer16x
-
- jmp .Ldone16x
-
-.align 32
-.Ltail16x:
- xor %r9,%r9
- sub $inp,$out
- cmp \$64*1,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xa0,$xa0 # xor with input
- vmovdqu32 $xa0,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xb0,$xa0
- lea 64($inp),$inp
-
- cmp \$64*2,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xb0,$xb0
- vmovdqu32 $xb0,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xc0,$xa0
- lea 64($inp),$inp
-
- cmp \$64*3,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xc0,$xc0
- vmovdqu32 $xc0,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xd0,$xa0
- lea 64($inp),$inp
-
- cmp \$64*4,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xd0,$xd0
- vmovdqu32 $xd0,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xa1,$xa0
- lea 64($inp),$inp
-
- cmp \$64*5,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xa1,$xa1
- vmovdqu32 $xa1,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xb1,$xa0
- lea 64($inp),$inp
-
- cmp \$64*6,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xb1,$xb1
- vmovdqu32 $xb1,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xc1,$xa0
- lea 64($inp),$inp
-
- cmp \$64*7,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xc1,$xc1
- vmovdqu32 $xc1,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xd1,$xa0
- lea 64($inp),$inp
-
- cmp \$64*8,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xd1,$xd1
- vmovdqu32 $xd1,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xa2,$xa0
- lea 64($inp),$inp
-
- cmp \$64*9,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xa2,$xa2
- vmovdqu32 $xa2,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xb2,$xa0
- lea 64($inp),$inp
-
- cmp \$64*10,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xb2,$xb2
- vmovdqu32 $xb2,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xc2,$xa0
- lea 64($inp),$inp
-
- cmp \$64*11,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xc2,$xc2
- vmovdqu32 $xc2,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xd2,$xa0
- lea 64($inp),$inp
-
- cmp \$64*12,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xd2,$xd2
- vmovdqu32 $xd2,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xa3,$xa0
- lea 64($inp),$inp
-
- cmp \$64*13,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xa3,$xa3
- vmovdqu32 $xa3,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xb3,$xa0
- lea 64($inp),$inp
-
- cmp \$64*14,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xb3,$xb3
- vmovdqu32 $xb3,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xc3,$xa0
- lea 64($inp),$inp
-
- cmp \$64*15,$len
- jb .Less_than_64_16x
- vpxord ($inp),$xc3,$xc3
- vmovdqu32 $xc3,($out,$inp)
- je .Ldone16x
- vmovdqa32 $xd3,$xa0
- lea 64($inp),$inp
-
-.Less_than_64_16x:
- vmovdqa32 $xa0,0x00(%rsp)
- lea ($out,$inp),$out
- and \$63,$len
-
-.Loop_tail16x:
- movzb ($inp,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1($out,%r9)
- dec $len
- jnz .Loop_tail16x
-
- vpxord $xa0,$xa0,$xa0
- vmovdqa32 $xa0,0(%rsp)
-
-.Ldone16x:
- vzeroall
-___
-$code.=<<___ if ($win64);
- movaps -0xb0(%r10),%xmm6
- movaps -0xa0(%r10),%xmm7
- movaps -0x90(%r10),%xmm8
- movaps -0x80(%r10),%xmm9
- movaps -0x70(%r10),%xmm10
- movaps -0x60(%r10),%xmm11
- movaps -0x50(%r10),%xmm12
- movaps -0x40(%r10),%xmm13
- movaps -0x30(%r10),%xmm14
- movaps -0x20(%r10),%xmm15
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.L16x_epilogue:
- ret
-.cfi_endproc
-.size chacha20_16x,.-chacha20_16x
-___
-
-# switch to %ymm domain
-($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15));
-@xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
- $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
-@key=map("%ymm$_",(16..31));
-($xt0,$xt1,$xt2,$xt3)=@key[0..3];
-
-$code.=<<___;
-.type chacha20_8xvl,\@function,5
-.align 32
-chacha20_8xvl:
-.cfi_startproc
-.Lchacha20_8xvl:
- lea 8(%rsp),%r10 # frame register
-.cfi_def_cfa_register %r10
- sub \$64+$xframe,%rsp
- and \$-64,%rsp
-___
-$code.=<<___ if ($win64);
- movaps %xmm6,-0xb0(%r10)
- movaps %xmm7,-0xa0(%r10)
- movaps %xmm8,-0x90(%r10)
- movaps %xmm9,-0x80(%r10)
- movaps %xmm10,-0x70(%r10)
- movaps %xmm11,-0x60(%r10)
- movaps %xmm12,-0x50(%r10)
- movaps %xmm13,-0x40(%r10)
- movaps %xmm14,-0x30(%r10)
- movaps %xmm15,-0x20(%r10)
-.L8xvl_body:
-___
-$code.=<<___;
- vzeroupper
-
- lea .Lsigma(%rip),%r9
- vbroadcasti128 (%r9),$xa3 # key[0]
- vbroadcasti128 ($key),$xb3 # key[1]
- vbroadcasti128 16($key),$xc3 # key[2]
- vbroadcasti128 ($counter),$xd3 # key[3]
-
- vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
- vpshufd \$0x55,$xa3,$xa1
- vpshufd \$0xaa,$xa3,$xa2
- vpshufd \$0xff,$xa3,$xa3
- vmovdqa64 $xa0,@key[0]
- vmovdqa64 $xa1,@key[1]
- vmovdqa64 $xa2,@key[2]
- vmovdqa64 $xa3,@key[3]
-
- vpshufd \$0x00,$xb3,$xb0
- vpshufd \$0x55,$xb3,$xb1
- vpshufd \$0xaa,$xb3,$xb2
- vpshufd \$0xff,$xb3,$xb3
- vmovdqa64 $xb0,@key[4]
- vmovdqa64 $xb1,@key[5]
- vmovdqa64 $xb2,@key[6]
- vmovdqa64 $xb3,@key[7]
-
- vpshufd \$0x00,$xc3,$xc0
- vpshufd \$0x55,$xc3,$xc1
- vpshufd \$0xaa,$xc3,$xc2
- vpshufd \$0xff,$xc3,$xc3
- vmovdqa64 $xc0,@key[8]
- vmovdqa64 $xc1,@key[9]
- vmovdqa64 $xc2,@key[10]
- vmovdqa64 $xc3,@key[11]
-
- vpshufd \$0x00,$xd3,$xd0
- vpshufd \$0x55,$xd3,$xd1
- vpshufd \$0xaa,$xd3,$xd2
- vpshufd \$0xff,$xd3,$xd3
- vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
- vmovdqa64 $xd0,@key[12]
- vmovdqa64 $xd1,@key[13]
- vmovdqa64 $xd2,@key[14]
- vmovdqa64 $xd3,@key[15]
-
- mov \$10,%eax
- jmp .Loop8xvl
-
-.align 32
-.Loop_outer8xvl:
- #vpbroadcastd 0(%r9),$xa0 # reload key
- #vpbroadcastd 4(%r9),$xa1
- vpbroadcastd 8(%r9),$xa2
- vpbroadcastd 12(%r9),$xa3
- vpaddd .Leight(%rip),@key[12],@key[12] # next SIMD counters
- vmovdqa64 @key[4],$xb0
- vmovdqa64 @key[5],$xb1
- vmovdqa64 @key[6],$xb2
- vmovdqa64 @key[7],$xb3
- vmovdqa64 @key[8],$xc0
- vmovdqa64 @key[9],$xc1
- vmovdqa64 @key[10],$xc2
- vmovdqa64 @key[11],$xc3
- vmovdqa64 @key[12],$xd0
- vmovdqa64 @key[13],$xd1
- vmovdqa64 @key[14],$xd2
- vmovdqa64 @key[15],$xd3
-
- vmovdqa64 $xa0,@key[0]
- vmovdqa64 $xa1,@key[1]
- vmovdqa64 $xa2,@key[2]
- vmovdqa64 $xa3,@key[3]
-
- mov \$10,%eax
- jmp .Loop8xvl
-
-.align 32
-.Loop8xvl:
-___
- foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
- foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- dec %eax
- jnz .Loop8xvl
-
- vpaddd @key[0],$xa0,$xa0 # accumulate key
- vpaddd @key[1],$xa1,$xa1
- vpaddd @key[2],$xa2,$xa2
- vpaddd @key[3],$xa3,$xa3
-
- vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
- vpunpckldq $xa3,$xa2,$xt3
- vpunpckhdq $xa1,$xa0,$xa0
- vpunpckhdq $xa3,$xa2,$xa2
- vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
- vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
- vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
-___
- ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
-$code.=<<___;
- vpaddd @key[4],$xb0,$xb0
- vpaddd @key[5],$xb1,$xb1
- vpaddd @key[6],$xb2,$xb2
- vpaddd @key[7],$xb3,$xb3
-
- vpunpckldq $xb1,$xb0,$xt2
- vpunpckldq $xb3,$xb2,$xt3
- vpunpckhdq $xb1,$xb0,$xb0
- vpunpckhdq $xb3,$xb2,$xb2
- vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
- vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
- vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
-___
- ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
-$code.=<<___;
- vshufi32x4 \$0,$xb0,$xa0,$xt3 # "de-interlace" further
- vshufi32x4 \$3,$xb0,$xa0,$xb0
- vshufi32x4 \$0,$xb1,$xa1,$xa0
- vshufi32x4 \$3,$xb1,$xa1,$xb1
- vshufi32x4 \$0,$xb2,$xa2,$xa1
- vshufi32x4 \$3,$xb2,$xa2,$xb2
- vshufi32x4 \$0,$xb3,$xa3,$xa2
- vshufi32x4 \$3,$xb3,$xa3,$xb3
-___
- ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
-$code.=<<___;
- vpaddd @key[8],$xc0,$xc0
- vpaddd @key[9],$xc1,$xc1
- vpaddd @key[10],$xc2,$xc2
- vpaddd @key[11],$xc3,$xc3
-
- vpunpckldq $xc1,$xc0,$xt2
- vpunpckldq $xc3,$xc2,$xt3
- vpunpckhdq $xc1,$xc0,$xc0
- vpunpckhdq $xc3,$xc2,$xc2
- vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
- vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
- vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
-___
- ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
-$code.=<<___;
- vpaddd @key[12],$xd0,$xd0
- vpaddd @key[13],$xd1,$xd1
- vpaddd @key[14],$xd2,$xd2
- vpaddd @key[15],$xd3,$xd3
-
- vpunpckldq $xd1,$xd0,$xt2
- vpunpckldq $xd3,$xd2,$xt3
- vpunpckhdq $xd1,$xd0,$xd0
- vpunpckhdq $xd3,$xd2,$xd2
- vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
- vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
- vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
- vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
-___
- ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
-$code.=<<___;
- vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
- vperm2i128 \$0x31,$xd0,$xc0,$xd0
- vperm2i128 \$0x20,$xd1,$xc1,$xc0
- vperm2i128 \$0x31,$xd1,$xc1,$xd1
- vperm2i128 \$0x20,$xd2,$xc2,$xc1
- vperm2i128 \$0x31,$xd2,$xc2,$xd2
- vperm2i128 \$0x20,$xd3,$xc3,$xc2
- vperm2i128 \$0x31,$xd3,$xc3,$xd3
-___
- ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
- ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
- ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
-$code.=<<___;
- cmp \$64*8,$len
- jb .Ltail8xvl
-
- mov \$0x80,%eax # size optimization
- vpxord 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vpxor 0x40($inp),$xc0,$xc0
- vpxor 0x60($inp),$xd0,$xd0
- lea ($inp,%rax),$inp # size optimization
- vmovdqu32 $xa0,0x00($out)
- vmovdqu $xb0,0x20($out)
- vmovdqu $xc0,0x40($out)
- vmovdqu $xd0,0x60($out)
- lea ($out,%rax),$out # size optimization
-
- vpxor 0x00($inp),$xa1,$xa1
- vpxor 0x20($inp),$xb1,$xb1
- vpxor 0x40($inp),$xc1,$xc1
- vpxor 0x60($inp),$xd1,$xd1
- lea ($inp,%rax),$inp # size optimization
- vmovdqu $xa1,0x00($out)
- vmovdqu $xb1,0x20($out)
- vmovdqu $xc1,0x40($out)
- vmovdqu $xd1,0x60($out)
- lea ($out,%rax),$out # size optimization
-
- vpxord 0x00($inp),$xa2,$xa2
- vpxor 0x20($inp),$xb2,$xb2
- vpxor 0x40($inp),$xc2,$xc2
- vpxor 0x60($inp),$xd2,$xd2
- lea ($inp,%rax),$inp # size optimization
- vmovdqu32 $xa2,0x00($out)
- vmovdqu $xb2,0x20($out)
- vmovdqu $xc2,0x40($out)
- vmovdqu $xd2,0x60($out)
- lea ($out,%rax),$out # size optimization
-
- vpxor 0x00($inp),$xa3,$xa3
- vpxor 0x20($inp),$xb3,$xb3
- vpxor 0x40($inp),$xc3,$xc3
- vpxor 0x60($inp),$xd3,$xd3
- lea ($inp,%rax),$inp # size optimization
- vmovdqu $xa3,0x00($out)
- vmovdqu $xb3,0x20($out)
- vmovdqu $xc3,0x40($out)
- vmovdqu $xd3,0x60($out)
- lea ($out,%rax),$out # size optimization
-
- vpbroadcastd 0(%r9),%ymm0 # reload key
- vpbroadcastd 4(%r9),%ymm1
-
- sub \$64*8,$len
- jnz .Loop_outer8xvl
-
- jmp .Ldone8xvl
-
-.align 32
-.Ltail8xvl:
- vmovdqa64 $xa0,%ymm8 # size optimization
-___
-$xa0 = "%ymm8";
-$code.=<<___;
- xor %r9,%r9
- sub $inp,$out
- cmp \$64*1,$len
- jb .Less_than_64_8xvl
- vpxor 0x00($inp),$xa0,$xa0 # xor with input
- vpxor 0x20($inp),$xb0,$xb0
- vmovdqu $xa0,0x00($out,$inp)
- vmovdqu $xb0,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa $xc0,$xa0
- vmovdqa $xd0,$xb0
- lea 64($inp),$inp
-
- cmp \$64*2,$len
- jb .Less_than_64_8xvl
- vpxor 0x00($inp),$xc0,$xc0
- vpxor 0x20($inp),$xd0,$xd0
- vmovdqu $xc0,0x00($out,$inp)
- vmovdqu $xd0,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa $xa1,$xa0
- vmovdqa $xb1,$xb0
- lea 64($inp),$inp
-
- cmp \$64*3,$len
- jb .Less_than_64_8xvl
- vpxor 0x00($inp),$xa1,$xa1
- vpxor 0x20($inp),$xb1,$xb1
- vmovdqu $xa1,0x00($out,$inp)
- vmovdqu $xb1,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa $xc1,$xa0
- vmovdqa $xd1,$xb0
- lea 64($inp),$inp
-
- cmp \$64*4,$len
- jb .Less_than_64_8xvl
- vpxor 0x00($inp),$xc1,$xc1
- vpxor 0x20($inp),$xd1,$xd1
- vmovdqu $xc1,0x00($out,$inp)
- vmovdqu $xd1,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa32 $xa2,$xa0
- vmovdqa $xb2,$xb0
- lea 64($inp),$inp
-
- cmp \$64*5,$len
- jb .Less_than_64_8xvl
- vpxord 0x00($inp),$xa2,$xa2
- vpxor 0x20($inp),$xb2,$xb2
- vmovdqu32 $xa2,0x00($out,$inp)
- vmovdqu $xb2,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa $xc2,$xa0
- vmovdqa $xd2,$xb0
- lea 64($inp),$inp
-
- cmp \$64*6,$len
- jb .Less_than_64_8xvl
- vpxor 0x00($inp),$xc2,$xc2
- vpxor 0x20($inp),$xd2,$xd2
- vmovdqu $xc2,0x00($out,$inp)
- vmovdqu $xd2,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa $xa3,$xa0
- vmovdqa $xb3,$xb0
- lea 64($inp),$inp
-
- cmp \$64*7,$len
- jb .Less_than_64_8xvl
- vpxor 0x00($inp),$xa3,$xa3
- vpxor 0x20($inp),$xb3,$xb3
- vmovdqu $xa3,0x00($out,$inp)
- vmovdqu $xb3,0x20($out,$inp)
- je .Ldone8xvl
- vmovdqa $xc3,$xa0
- vmovdqa $xd3,$xb0
- lea 64($inp),$inp
-
-.Less_than_64_8xvl:
- vmovdqa $xa0,0x00(%rsp)
- vmovdqa $xb0,0x20(%rsp)
- lea ($out,$inp),$out
- and \$63,$len
-
-.Loop_tail8xvl:
- movzb ($inp,%r9),%eax
- movzb (%rsp,%r9),%ecx
- lea 1(%r9),%r9
- xor %ecx,%eax
- mov %al,-1($out,%r9)
- dec $len
- jnz .Loop_tail8xvl
-
- vpxor $xa0,$xa0,$xa0
- vmovdqa $xa0,0x00(%rsp)
- vmovdqa $xa0,0x20(%rsp)
-
-.Ldone8xvl:
- vzeroall
-___
-$code.=<<___ if ($win64);
- movaps -0xb0(%r10),%xmm6
- movaps -0xa0(%r10),%xmm7
- movaps -0x90(%r10),%xmm8
- movaps -0x80(%r10),%xmm9
- movaps -0x70(%r10),%xmm10
- movaps -0x60(%r10),%xmm11
- movaps -0x50(%r10),%xmm12
- movaps -0x40(%r10),%xmm13
- movaps -0x30(%r10),%xmm14
- movaps -0x20(%r10),%xmm15
-___
-$code.=<<___;
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-.L8xvl_epilogue:
- ret
-.cfi_endproc
-.size chacha20_8xvl,.-chacha20_8xvl
-___
-if($kernel) {
- $code .= "#endif\n";
-}
-}
-
-# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-# CONTEXT *context,DISPATCHER_CONTEXT *disp)
-#
-# Everything in this block is appended to $code only when $win64 is set.
-if ($win64) {
-# Registers carrying the four handler arguments listed in the prototype above.
-$rec="%rcx";
-$frame="%rdx";
-$context="%r8";
-$disp="%r9";
-
-# Emit the two exception handlers and the unconditional .pdata entries.
-#  - se_handler: when context->Rip lies between .Lctr32_body and .Lno_data
-#    it reloads rbx, rbp and r12-r15 from the stack frame into the CONTEXT.
-#  - simd_handler: takes its prologue label, epilogue label and the size of
-#    the %xmm save area from HandlerData[0..2] and copies that area into
-#    the CONTEXT starting at Xmm6.
-# Both end in .Lcommon_seh_tail, which stores Rsp/Rsi/Rdi into the CONTEXT,
-# copies the CONTEXT to disp->ContextRecord, calls RtlVirtualUnwind and
-# returns 1 (ExceptionContinueSearch).
-# NOTE(review): simd_handler reads context->R9 as the frame pointer, while
-# chacha20_16x and chacha20_8xvl earlier in this script set up their frame
-# pointer in %r10 -- confirm that the two still agree.
-$code.=<<___;
-.extern __imp_RtlVirtualUnwind
-.type se_handler,\@abi-omnipotent
-.align 16
-se_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
-
- lea .Lctr32_body(%rip),%r10
- cmp %r10,%rbx # context->Rip<.Lprologue
- jb .Lcommon_seh_tail
-
- mov 152($context),%rax # pull context->Rsp
-
- lea .Lno_data(%rip),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=.Lepilogue
- jae .Lcommon_seh_tail
-
- lea 64+24+48(%rax),%rax
-
- mov -8(%rax),%rbx
- mov -16(%rax),%rbp
- mov -24(%rax),%r12
- mov -32(%rax),%r13
- mov -40(%rax),%r14
- mov -48(%rax),%r15
- mov %rbx,144($context) # restore context->Rbx
- mov %rbp,160($context) # restore context->Rbp
- mov %r12,216($context) # restore context->R12
- mov %r13,224($context) # restore context->R13
- mov %r14,232($context) # restore context->R14
- mov %r15,240($context) # restore context->R14
-
-.Lcommon_seh_tail:
- mov 8(%rax),%rdi
- mov 16(%rax),%rsi
- mov %rax,152($context) # restore context->Rsp
- mov %rsi,168($context) # restore context->Rsi
- mov %rdi,176($context) # restore context->Rdi
-
- mov 40($disp),%rdi # disp->ContextRecord
- mov $context,%rsi # context
- mov \$154,%ecx # sizeof(CONTEXT)
- .long 0xa548f3fc # cld; rep movsq
-
- mov $disp,%rsi
- xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
- mov 8(%rsi),%rdx # arg2, disp->ImageBase
- mov 0(%rsi),%r8 # arg3, disp->ControlPc
- mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
- mov 40(%rsi),%r10 # disp->ContextRecord
- lea 56(%rsi),%r11 # &disp->HandlerData
- lea 24(%rsi),%r12 # &disp->EstablisherFrame
- mov %r10,32(%rsp) # arg5
- mov %r11,40(%rsp) # arg6
- mov %r12,48(%rsp) # arg7
- mov %rcx,56(%rsp) # arg8, (NULL)
- call *__imp_RtlVirtualUnwind(%rip)
-
- mov \$1,%eax # ExceptionContinueSearch
- add \$64,%rsp
- popfq
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- pop %rdi
- pop %rsi
- ret
-.size se_handler,.-se_handler
-
-.type simd_handler,\@abi-omnipotent
-.align 16
-simd_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
-
- mov 0(%r11),%r10d # HandlerData[0]
- lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<prologue label
- jb .Lcommon_seh_tail
-
- mov 192($context),%rax # pull context->R9
-
- mov 4(%r11),%r10d # HandlerData[1]
- mov 8(%r11),%ecx # HandlerData[2]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=epilogue label
- jae .Lcommon_seh_tail
-
- neg %rcx
- lea -8(%rax,%rcx),%rsi
- lea 512($context),%rdi # &context.Xmm6
- neg %ecx
- shr \$3,%ecx
- .long 0xa548f3fc # cld; rep movsq
-
- jmp .Lcommon_seh_tail
-.size simd_handler,.-simd_handler
-
-.section .pdata
-.align 4
- .rva .LSEH_begin_chacha20_ctr32
- .rva .LSEH_end_chacha20_ctr32
- .rva .LSEH_info_chacha20_ctr32
-
- .rva .LSEH_begin_chacha20_ssse3
- .rva .LSEH_end_chacha20_ssse3
- .rva .LSEH_info_chacha20_ssse3
-
- .rva .LSEH_begin_chacha20_128
- .rva .LSEH_end_chacha20_128
- .rva .LSEH_info_chacha20_128
-
- .rva .LSEH_begin_chacha20_4x
- .rva .LSEH_end_chacha20_4x
- .rva .LSEH_info_chacha20_4x
-___
-# .pdata entry for chacha20_xop; appended only when $avx is true.
-$code.=<<___ if ($avx);
- .rva .LSEH_begin_chacha20_xop
- .rva .LSEH_end_chacha20_xop
- .rva .LSEH_info_chacha20_xop
-___
-# .pdata entry for chacha20_avx2; appended only when $avx > 1.
-$code.=<<___ if ($avx>1);
- .rva .LSEH_begin_chacha20_avx2
- .rva .LSEH_end_chacha20_avx2
- .rva .LSEH_info_chacha20_avx2
-___
-# .pdata entries for the four AVX-512 entry points; appended only when
-# $avx > 2.
-$code.=<<___ if ($avx>2);
- .rva .LSEH_begin_chacha20_avx512
- .rva .LSEH_end_chacha20_avx512
- .rva .LSEH_info_chacha20_avx512
-
- .rva .LSEH_begin_chacha20_avx512vl
- .rva .LSEH_end_chacha20_avx512vl
- .rva .LSEH_info_chacha20_avx512vl
-
- .rva .LSEH_begin_chacha20_16x
- .rva .LSEH_end_chacha20_16x
- .rva .LSEH_info_chacha20_16x
-
- .rva .LSEH_begin_chacha20_8xvl
- .rva .LSEH_end_chacha20_8xvl
- .rva .LSEH_info_chacha20_8xvl
-___
-# .xdata: one .LSEH_info_* record per function. Records that name
-# simd_handler also carry the HandlerData it reads: the body label, the
-# epilogue label, and a byte count (simd_handler shifts it right by 3 to get
-# the number of quadwords to copy).
-$code.=<<___;
-.section .xdata
-.align 8
-.LSEH_info_chacha20_ctr32:
- .byte 9,0,0,0
- .rva se_handler
-
-.LSEH_info_chacha20_ssse3:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .Lssse3_body,.Lssse3_epilogue
- .long 0x20,0
-
-.LSEH_info_chacha20_128:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .L128_body,.L128_epilogue
- .long 0x60,0
-
-.LSEH_info_chacha20_4x:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .L4x_body,.L4x_epilogue
- .long 0xa0,0
-___
-# .xdata record for chacha20_xop (same $avx condition as its .pdata entry).
-$code.=<<___ if ($avx);
-.LSEH_info_chacha20_xop:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .L4xop_body,.L4xop_epilogue # HandlerData[]
- .long 0xa0,0
-___
-# .xdata record for chacha20_avx2 (same $avx > 1 condition).
-$code.=<<___ if ($avx>1);
-.LSEH_info_chacha20_avx2:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .L8x_body,.L8x_epilogue # HandlerData[]
- .long 0xa0,0
-___
-# .xdata records for the AVX-512 entry points (same $avx > 2 condition).
-$code.=<<___ if ($avx>2);
-.LSEH_info_chacha20_avx512:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[]
- .long 0x20,0
-
-.LSEH_info_chacha20_avx512vl:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .Lavx512vl_body,.Lavx512vl_epilogue # HandlerData[]
- .long 0x20,0
-
-.LSEH_info_chacha20_16x:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .L16x_body,.L16x_epilogue # HandlerData[]
- .long 0xa0,0
-
-.LSEH_info_chacha20_8xvl:
- .byte 9,0,0,0
- .rva simd_handler
- .rva .L8xvl_body,.L8xvl_epilogue # HandlerData[]
- .long 0xa0,0
-___
-}
-
-# Copy this script's own leading comment block to the output: the "#!" line
-# is skipped, a leading "#" on each line is rewritten to "//", and copying
-# stops at the first line that is neither a comment nor empty.
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/\/\// and !/^$/);
- print;
-}
-close SELF;
-
-# Post-process the accumulated assembly text one line at a time and print it.
-foreach (split("\n",$code)) {
- # replace each `...` span with the result of evaluating it as Perl
- s/\`([^\`]*)\`/eval $1/ge;
-
- # "%x#%ymmN" / "%x#%zmmN" collapse to "%xmmN"
- s/%x#%[yz]/%x/g; # "down-shift"
-
- if ($kernel) {
- # when $kernel is set: drop the trailing ",<digits>" from .type lines
- # and omit lines that start with ".cfi"
- s/(^\.type.*),[0-9]+$/\1/;
- next if /^\.cfi.*/;
- }
-
- print $_,"\n";
-}
-
-close STDOUT;
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20.c b/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20.c
deleted file mode 100644
index b78f19975b1d..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20.c
+++ /dev/null
@@ -1,238 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * Implementation of the ChaCha20 stream cipher.
- *
- * Information: https://cr.yp.to/chacha.html
- */
-
-#include <zinc/chacha20.h>
-#include "../selftest/run.h"
-#define IS_ENABLED_CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 1
-
-#define IS_ENABLED_CONFIG_64BIT (sizeof(void*) == 8)
-
-/*
- * __crypto_xor - XOR two byte buffers into a destination buffer.
- * @dst:  output buffer, at least @len bytes
- * @src1: first input buffer, at least @len bytes
- * @src2: second input buffer, at least @len bytes
- * @len:  number of bytes to process
- *
- * Computes dst[i] = src1[i] ^ src2[i] for every i < len, using the widest
- * stride (8, 4, 2, then 1 bytes) that the relative alignment of the three
- * pointers allows.  When unaligned access is declared efficient, relalign
- * stays 0 and every stride is permitted.
- */
-void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int len)
-{
-	int relalign = 0;
-
-	if (!IS_ENABLED_CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) {
-		int size = sizeof(unsigned long);
-		int d = (((unsigned long)dst ^ (unsigned long)src1) |
-			 ((unsigned long)dst ^ (unsigned long)src2)) &
-			(size - 1);
-
-		/*
-		 * ffs() numbers bits from 1, so subtract one to obtain the
-		 * index of the lowest differing address bit.  Without the
-		 * adjustment the relative alignment comes out twice as large
-		 * as it really is and the wide loops below would run on
-		 * misaligned pointers.
-		 */
-		relalign = d ? 1 << (ffs(d) - 1) : size;
-
-		/*
-		 * If we care about alignment, process as many bytes as
-		 * needed to advance dst and src to values whose alignments
-		 * equal their relative alignment. This will allow us to
-		 * process the remainder of the input using optimal strides.
-		 */
-		while (((unsigned long)dst & (relalign - 1)) && len > 0) {
-			*dst++ = *src1++ ^ *src2++;
-			len--;
-		}
-	}
-
-	/* Use this file's own 64-bit test, matching the alignment test above. */
-	while (IS_ENABLED_CONFIG_64BIT && len >= 8 && !(relalign & 7)) {
-		*(u64 *)dst = *(const u64 *)src1 ^ *(const u64 *)src2;
-		dst += 8;
-		src1 += 8;
-		src2 += 8;
-		len -= 8;
-	}
-
-	while (len >= 4 && !(relalign & 3)) {
-		*(u32 *)dst = *(const u32 *)src1 ^ *(const u32 *)src2;
-		dst += 4;
-		src1 += 4;
-		src2 += 4;
-		len -= 4;
-	}
-
-	while (len >= 2 && !(relalign & 1)) {
-		*(u16 *)dst = *(const u16 *)src1 ^ *(const u16 *)src2;
-		dst += 2;
-		src1 += 2;
-		src2 += 2;
-		len -= 2;
-	}
-
-	/* whatever is left goes byte by byte */
-	while (len--)
-		*dst++ = *src1++ ^ *src2++;
-}
-
-/*
- * Pull in the architecture-specific glue for the configured target.  When no
- * supported architecture is configured, the #else branch supplies stand-ins
- * with the same names: an empty chacha20_nobs table, a no-op
- * chacha20_fpu_init(), and chacha20_arch()/hchacha20_arch() that always
- * return false.
- */
-#if defined(CONFIG_ZINC_ARCH_X86_64)
-#include "chacha20-x86_64-glue.c"
-#elif defined(CONFIG_ZINC_ARCH_ARM) || defined(CONFIG_ZINC_ARCH_ARM64)
-#include "chacha20-arm-glue.c"
-#elif defined(CONFIG_ZINC_ARCH_MIPS)
-#include "chacha20-mips-glue.c"
-#else
-static bool *const chacha20_nobs[] __initconst = { };
-static void __init chacha20_fpu_init(void)
-{
-}
-/* Always reports "not handled"; chacha20() then runs the generic code. */
-static inline bool chacha20_arch(struct chacha20_ctx *ctx, u8 *dst,
- const u8 *src, size_t len,
- simd_context_t *simd_context)
-{
- return false;
-}
-/* Always reports "not handled"; hchacha20() then runs the generic code. */
-static inline bool hchacha20_arch(u32 derived_key[CHACHA20_KEY_WORDS],
- const u8 nonce[HCHACHA20_NONCE_SIZE],
- const u8 key[HCHACHA20_KEY_SIZE],
- simd_context_t *simd_context)
-{
- return false;
-}
-#endif
-
-/*
- * One quarter round on words a, b, c, d of the state array x: four
- * add / xor / rotate-left steps with rotation counts 16, 12, 8 and 7.
- * The index arguments are evaluated several times, so pass only
- * side-effect-free expressions.
- */
-#define QUARTER_ROUND(x, a, b, c, d) ( \
-	(x)[a] += (x)[b], \
-	(x)[d] = rol32(((x)[d] ^ (x)[a]), 16), \
-	(x)[c] += (x)[d], \
-	(x)[b] = rol32(((x)[b] ^ (x)[c]), 12), \
-	(x)[a] += (x)[b], \
-	(x)[d] = rol32(((x)[d] ^ (x)[a]), 8), \
-	(x)[c] += (x)[d], \
-	(x)[b] = rol32(((x)[b] ^ (x)[c]), 7) \
-)
-
-/*
- * Flat index of row i, column j in the 4x4 word state.  Arguments are
- * parenthesized so that compound expressions expand correctly.
- */
-#define C(i, j) ((i) * 4 + (j))
-
-/* Four quarter rounds down the columns, then four along the diagonals. */
-#define DOUBLE_ROUND(x) ( \
-	/* Column Round */ \
-	QUARTER_ROUND(x, C(0, 0), C(1, 0), C(2, 0), C(3, 0)), \
-	QUARTER_ROUND(x, C(0, 1), C(1, 1), C(2, 1), C(3, 1)), \
-	QUARTER_ROUND(x, C(0, 2), C(1, 2), C(2, 2), C(3, 2)), \
-	QUARTER_ROUND(x, C(0, 3), C(1, 3), C(2, 3), C(3, 3)), \
-	/* Diagonal Round */ \
-	QUARTER_ROUND(x, C(0, 0), C(1, 1), C(2, 2), C(3, 3)), \
-	QUARTER_ROUND(x, C(0, 1), C(1, 2), C(2, 3), C(3, 0)), \
-	QUARTER_ROUND(x, C(0, 2), C(1, 3), C(2, 0), C(3, 1)), \
-	QUARTER_ROUND(x, C(0, 3), C(1, 0), C(2, 1), C(3, 2)) \
-)
-
-/* Ten double rounds, i.e. the twenty rounds of ChaCha20. */
-#define TWENTY_ROUNDS(x) ( \
-	DOUBLE_ROUND(x), \
-	DOUBLE_ROUND(x), \
-	DOUBLE_ROUND(x), \
-	DOUBLE_ROUND(x), \
-	DOUBLE_ROUND(x), \
-	DOUBLE_ROUND(x), \
-	DOUBLE_ROUND(x), \
-	DOUBLE_ROUND(x), \
-	DOUBLE_ROUND(x), \
-	DOUBLE_ROUND(x) \
-)
-
-/*
- * Produce one block of keystream from ctx->state into stream (stored as
- * little-endian words) and then bump the first counter word by one.
- */
-static void chacha20_block_generic(struct chacha20_ctx *ctx, __le32 *stream)
-{
-	u32 work[CHACHA20_BLOCK_WORDS];
-	int w;
-
-	/* run the rounds on a private copy so the input state stays intact */
-	for (w = 0; w < ARRAY_SIZE(work); w++)
-		work[w] = ctx->state[w];
-
-	TWENTY_ROUNDS(work);
-
-	/* feed-forward: add the untouched input state to the permuted copy */
-	w = 0;
-	while (w < ARRAY_SIZE(work)) {
-		stream[w] = cpu_to_le32(work[w] + ctx->state[w]);
-		w++;
-	}
-
-	++ctx->counter[0];
-}
-
-/*
- * Portable ChaCha20: XOR len bytes of in with keystream generated from ctx
- * and write the result to out.  One keystream block is produced per
- * CHACHA20_BLOCK_SIZE bytes, plus one more for a trailing partial block.
- */
-static void chacha20_generic(struct chacha20_ctx *ctx, u8 *out, const u8 *in,
-			     u32 len)
-{
-	__le32 keystream[CHACHA20_BLOCK_WORDS];
-
-	for (; len >= CHACHA20_BLOCK_SIZE; len -= CHACHA20_BLOCK_SIZE) {
-		chacha20_block_generic(ctx, keystream);
-		crypto_xor_cpy(out, in, (u8 *)keystream, CHACHA20_BLOCK_SIZE);
-		out += CHACHA20_BLOCK_SIZE;
-		in += CHACHA20_BLOCK_SIZE;
-	}
-
-	if (len == 0)
-		return;
-
-	/* partial final block: only the first len keystream bytes are used */
-	chacha20_block_generic(ctx, keystream);
-	crypto_xor_cpy(out, in, (u8 *)keystream, len);
-}
-
-/*
- * Public ChaCha20 entry point: offer the request to the architecture
- * backend first and run the generic code only if the backend declines.
- */
-void chacha20(struct chacha20_ctx *ctx, u8 *dst, const u8 *src, u32 len,
-	      simd_context_t *simd_context)
-{
-	if (chacha20_arch(ctx, dst, src, len, simd_context))
-		return;
-
-	chacha20_generic(ctx, dst, src, len);
-}
-EXPORT_SYMBOL(chacha20);
-
-/*
- * Portable HChaCha20: build a 16-word state from the four ChaCha constants,
- * the 32-byte key and the 16-byte nonce (both read as little-endian words),
- * run twenty rounds, and return state words 0..3 and 12..15 as the derived
- * key.  No feed-forward addition is applied.
- */
-static void hchacha20_generic(u32 derived_key[CHACHA20_KEY_WORDS],
-			      const u8 nonce[HCHACHA20_NONCE_SIZE],
-			      const u8 key[HCHACHA20_KEY_SIZE])
-{
-	u32 x[16];	/* 4 constants + 8 key words + 4 nonce words */
-	unsigned int w;
-
-	x[0] = CHACHA20_CONSTANT_EXPA;
-	x[1] = CHACHA20_CONSTANT_ND_3;
-	x[2] = CHACHA20_CONSTANT_2_BY;
-	x[3] = CHACHA20_CONSTANT_TE_K;
-
-	for (w = 0; w < 8; ++w)
-		x[4 + w] = get_unaligned_le32(key + 4 * w);
-
-	for (w = 0; w < 4; ++w)
-		x[12 + w] = get_unaligned_le32(nonce + 4 * w);
-
-	TWENTY_ROUNDS(x);
-
-	/* first and last rows of the permuted state form the output */
-	memcpy(&derived_key[0], &x[0], 4 * sizeof(u32));
-	memcpy(&derived_key[4], &x[12], 4 * sizeof(u32));
-}
-
-/*
- * Public HChaCha20 entry point.  The architecture backend gets the first
- * try; the generic code runs when the backend declines.
- * derived_key must be 32-bit aligned.
- */
-void hchacha20(u32 derived_key[CHACHA20_KEY_WORDS],
-	       const u8 nonce[HCHACHA20_NONCE_SIZE],
-	       const u8 key[HCHACHA20_KEY_SIZE], simd_context_t *simd_context)
-{
-	if (hchacha20_arch(derived_key, nonce, key, simd_context))
-		return;
-
-	hchacha20_generic(derived_key, nonce, key);
-}
-EXPORT_SYMBOL(hchacha20);
-
-#include "../selftest/chacha20.c"
-
-/* When set, chacha20_fpu_init() is not called during initialisation. */
-static bool nosimd __initdata = false;
-
-/*
- * Initialisation entry point.  It is the exported chacha20_mod_init() unless
- * COMPAT_ZINC_IS_A_MODULE is defined, in which case it is a file-local
- * mod_init() registered through module_init() below.
- *
- * Returns 0 on success, or -ENOTRECOVERABLE when selftest_run() reports
- * failure.
- */
-#ifndef COMPAT_ZINC_IS_A_MODULE
-int __init chacha20_mod_init(void)
-#else
-static int __init mod_init(void)
-#endif
-{
- if (!nosimd)
- chacha20_fpu_init();
- if (!selftest_run("chacha20", chacha20_selftest, chacha20_nobs,
- ARRAY_SIZE(chacha20_nobs)))
- return -ENOTRECOVERABLE;
- return 0;
-}
-
-#ifdef COMPAT_ZINC_IS_A_MODULE
-/* Nothing to undo on unload. */
-static void __exit mod_exit(void)
-{
-}
-
-module_init(mod_init);
-module_exit(mod_exit);
-#endif
diff --git a/sys/dev/if_wg/module/crypto/zinc/chacha20poly1305.c b/sys/dev/if_wg/module/crypto/zinc/chacha20poly1305.c
deleted file mode 100644
index 701666c78eb8..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/chacha20poly1305.c
+++ /dev/null
@@ -1,196 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * This is an implementation of the ChaCha20Poly1305 AEAD construction.
- *
- * Information: https://tools.ietf.org/html/rfc8439
- */
-
-#include <sys/support.h>
-#include <zinc/chacha20poly1305.h>
-#include <zinc/chacha20.h>
-#include <zinc/poly1305.h>
-#include "selftest/run.h"
-
-static const u8 pad0[CHACHA20_BLOCK_SIZE] = { 0 };
-
-/*
- * ChaCha20-Poly1305 encryption core; the caller owns the SIMD context.
- *
- * Encrypts src_len bytes from src into dst and appends the Poly1305 tag at
- * dst + src_len. The tag covers, in order: ad, zero padding to a 16-byte
- * boundary, the ciphertext, zero padding to a 16-byte boundary, and the two
- * lengths (ad_len, src_len) as little-endian 64-bit integers.
- */
-static inline void
-__chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
-			   const u8 *ad, const size_t ad_len, const u64 nonce,
-			   const u8 key[CHACHA20POLY1305_KEY_SIZE],
-			   simd_context_t *simd_context)
-{
-	/* Number of zero bytes that round each MAC input up to 16 bytes. */
-	const size_t ad_pad = (0x10 - ad_len) & 0xf;
-	const size_t ct_pad = (0x10 - src_len) & 0xf;
-	struct chacha20_ctx cipher;
-	struct poly1305_ctx mac;
-	union {
-		u8 block0[POLY1305_KEY_SIZE];
-		__le64 lens[2];
-	} b = { { 0 } };
-
-	/* The first keystream bytes become the one-time Poly1305 key. */
-	chacha20_init(&cipher, key, nonce);
-	chacha20(&cipher, b.block0, b.block0, sizeof(b.block0), simd_context);
-	poly1305_init(&mac, b.block0);
-
-	/* Authenticate the associated data. */
-	poly1305_update(&mac, ad, ad_len, simd_context);
-	poly1305_update(&mac, pad0, ad_pad, simd_context);
-
-	/* Encrypt, then authenticate the ciphertext that was just written. */
-	chacha20(&cipher, dst, src, src_len, simd_context);
-	poly1305_update(&mac, dst, src_len, simd_context);
-	poly1305_update(&mac, pad0, ct_pad, simd_context);
-
-	/* Authenticate both lengths; b is reused now that the key is loaded. */
-	b.lens[0] = cpu_to_le64(ad_len);
-	b.lens[1] = cpu_to_le64(src_len);
-	poly1305_update(&mac, (u8 *)b.lens, sizeof(b.lens), simd_context);
-
-	/* The tag goes directly after the ciphertext. */
-	poly1305_final(&mac, dst + src_len, simd_context);
-
-	/* Wipe the cipher state and the one-time key material. */
-	memzero_explicit(&cipher, sizeof(cipher));
-	memzero_explicit(&b, sizeof(b));
-}
-
-/*
- * Public ChaCha20-Poly1305 encryption entry point.
- *
- * Acquires a SIMD context, hands all arguments unchanged to
- * __chacha20poly1305_encrypt(), and releases the context again. That helper
- * writes src_len bytes of ciphertext to dst followed by the Poly1305 tag,
- * so dst must have room for both.
- */
-void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
- const u8 *ad, const size_t ad_len,
- const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
-{
- simd_context_t simd_context;
-
- simd_get(&simd_context);
- __chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, nonce, key,
- &simd_context);
- simd_put(&simd_context);
-}
-EXPORT_SYMBOL(chacha20poly1305_encrypt);
-/*
- * ChaCha20-Poly1305 decryption core; the caller owns the SIMD context.
- *
- * src holds the ciphertext followed by a POLY1305_MAC_SIZE-byte tag, so
- * src_len includes the tag. The tag is recomputed over ad, zero padding to a
- * 16-byte boundary, the ciphertext, zero padding to a 16-byte boundary, and
- * the two lengths as little-endian 64-bit integers. Only when it matches the
- * received tag is the ciphertext decrypted into dst
- * (src_len - POLY1305_MAC_SIZE bytes).
- *
- * Returns true on success. Returns false, leaving dst untouched, when the
- * input is shorter than a tag or the tag does not verify.
- *
- * Failures are deliberately silent. The input is attacker-controlled, so
- * logging here would let a remote peer flood the message buffer, and
- * printing the computed tag would hand a valid tag for forged data to
- * anyone able to read that buffer.
- */
-static inline bool
-__chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
-			   const u8 *ad, const size_t ad_len, const u64 nonce,
-			   const u8 key[CHACHA20POLY1305_KEY_SIZE],
-			   simd_context_t *simd_context)
-{
-	struct poly1305_ctx poly1305_state;
-	struct chacha20_ctx chacha20_state;
-	int ret;
-	size_t dst_len;
-	union {
-		u8 block0[POLY1305_KEY_SIZE];
-		u8 mac[POLY1305_MAC_SIZE];
-		__le64 lens[2];
-	} b = { { 0 } };
-
-	/* Too short to even contain a tag: reject without logging. */
-	if (unlikely(src_len < POLY1305_MAC_SIZE))
-		return false;
-
-	/* The first keystream bytes become the one-time Poly1305 key. */
-	chacha20_init(&chacha20_state, key, nonce);
-	chacha20(&chacha20_state, b.block0, b.block0, sizeof(b.block0),
-		 simd_context);
-	poly1305_init(&poly1305_state, b.block0);
-
-	poly1305_update(&poly1305_state, ad, ad_len, simd_context);
-	poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf,
-			simd_context);
-
-	/* Authenticate the ciphertext before touching it. */
-	dst_len = src_len - POLY1305_MAC_SIZE;
-	poly1305_update(&poly1305_state, src, dst_len, simd_context);
-	poly1305_update(&poly1305_state, pad0, (0x10 - dst_len) & 0xf,
-			simd_context);
-
-	b.lens[0] = cpu_to_le64(ad_len);
-	b.lens[1] = cpu_to_le64(dst_len);
-	poly1305_update(&poly1305_state, (u8 *)b.lens, sizeof(b.lens),
-			simd_context);
-
-	poly1305_final(&poly1305_state, b.mac, simd_context);
-
-	/* Compare the tags, and decrypt only when they match. */
-	ret = crypto_memneq(b.mac, src + dst_len, POLY1305_MAC_SIZE);
-	if (likely(!ret))
-		chacha20(&chacha20_state, dst, src, dst_len, simd_context);
-
-	/* Wipe the cipher state, the one-time key and the computed tag. */
-	memzero_explicit(&chacha20_state, sizeof(chacha20_state));
-	memzero_explicit(&b, sizeof(b));
-
-	return !ret;
-}
-
-/*
- * Public ChaCha20-Poly1305 decryption entry point.
- *
- * Acquires a SIMD context, hands all arguments unchanged to
- * __chacha20poly1305_decrypt(), releases the context, and returns that
- * helper's result: true when the tag verified and dst was written, false
- * otherwise.
- */
-bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
- const u8 *ad, const size_t ad_len,
- const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
-{
- simd_context_t simd_context;
- bool ret;
-
- simd_get(&simd_context);
- ret = __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len, nonce,
- key, &simd_context);
- simd_put(&simd_context);
- return ret;
-}
-EXPORT_SYMBOL(chacha20poly1305_decrypt);
-
-/*
- * XChaCha20-Poly1305 encryption.
- *
- * A subkey is derived with hchacha20() from key and the leading part of
- * nonce. The 64-bit little-endian value at nonce + 16 is then used as the
- * nonce for a regular ChaCha20-Poly1305 encryption under that subkey. The
- * subkey is wiped before returning.
- */
-void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
-			       const u8 *ad, const size_t ad_len,
-			       const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
-			       const u8 key[CHACHA20POLY1305_KEY_SIZE])
-{
-	u32 derived_key[CHACHA20_KEY_WORDS] __aligned(16);
-	simd_context_t simd_context;
-	u64 inner_nonce;
-
-	simd_get(&simd_context);
-
-	/* Derive the subkey and store it as little-endian bytes. */
-	hchacha20(derived_key, nonce, key, &simd_context);
-	cpu_to_le32_array(derived_key, ARRAY_SIZE(derived_key));
-
-	inner_nonce = get_unaligned_le64(nonce + 16);
-	__chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, inner_nonce,
-				   (u8 *)derived_key, &simd_context);
-
-	memzero_explicit(derived_key, CHACHA20POLY1305_KEY_SIZE);
-	simd_put(&simd_context);
-}
-EXPORT_SYMBOL(xchacha20poly1305_encrypt);
-
-/*
- * XChaCha20-Poly1305 decryption.
- *
- * A subkey is derived with hchacha20() from key and the leading part of
- * nonce. The 64-bit little-endian value at nonce + 16 is then used as the
- * nonce for a regular ChaCha20-Poly1305 decryption under that subkey. The
- * subkey is wiped before returning.
- *
- * Returns the result of __chacha20poly1305_decrypt(): true when the tag
- * verified, false otherwise.
- */
-bool xchacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
-			       const u8 *ad, const size_t ad_len,
-			       const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
-			       const u8 key[CHACHA20POLY1305_KEY_SIZE])
-{
-	u32 derived_key[CHACHA20_KEY_WORDS] __aligned(16);
-	simd_context_t simd_context;
-	u64 inner_nonce;
-	bool verified;
-
-	simd_get(&simd_context);
-
-	/* Derive the subkey and store it as little-endian bytes. */
-	hchacha20(derived_key, nonce, key, &simd_context);
-	cpu_to_le32_array(derived_key, ARRAY_SIZE(derived_key));
-
-	inner_nonce = get_unaligned_le64(nonce + 16);
-	verified = __chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len,
-					      inner_nonce, (u8 *)derived_key,
-					      &simd_context);
-
-	memzero_explicit(derived_key, CHACHA20POLY1305_KEY_SIZE);
-	simd_put(&simd_context);
-
-	return verified;
-}
-EXPORT_SYMBOL(xchacha20poly1305_decrypt);
-
-#include "selftest/chacha20poly1305.c"
-
-/*
- * Module init hook: runs the chacha20poly1305 self-test.
- * Returns 0 when it passes and -ENOTRECOVERABLE when it fails.
- */
-static int __init mod_init(void)
-{
-	const bool selftest_ok = selftest_run("chacha20poly1305",
-					      chacha20poly1305_selftest,
-					      NULL, 0);
-
-	return selftest_ok ? 0 : -ENOTRECOVERABLE;
-}
-
-/* Module exit hook; this file has nothing to tear down. */
-static void __exit mod_exit(void)
-{
-}
-
-/* Register the init (self-test) and exit hooks for this module. */
-module_init(mod_init);
-module_exit(mod_exit);
diff --git a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm-glue.c b/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm-glue.c
deleted file mode 100644
index 291fe4ba98b0..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm-glue.c
+++ /dev/null
@@ -1,140 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-
-asmlinkage void poly1305_init_arm(void *ctx, const u8 key[16]);
-asmlinkage void poly1305_blocks_arm(void *ctx, const u8 *inp, const size_t len,
- const u32 padbit);
-asmlinkage void poly1305_emit_arm(void *ctx, u8 mac[16], const u32 nonce[4]);
-asmlinkage void poly1305_blocks_neon(void *ctx, const u8 *inp, const size_t len,
- const u32 padbit);
-asmlinkage void poly1305_emit_neon(void *ctx, u8 mac[16], const u32 nonce[4]);
-
-static bool poly1305_use_neon __ro_after_init;
-static bool *const poly1305_nobs[] __initconst = { &poly1305_use_neon };
-
-/*
- * Record in poly1305_use_neon whether the CPU offers the vector unit the
- * NEON routines need. Only the ARM64 and ARM configurations assign the flag
- * here; in any other configuration this function leaves it untouched.
- */
-static void __init poly1305_fpu_init(void)
-{
-#if defined(CONFIG_ZINC_ARCH_ARM64)
- /* ARM64: query the ASIMD CPU feature. */
- poly1305_use_neon = cpu_have_named_feature(ASIMD);
-#elif defined(CONFIG_ZINC_ARCH_ARM)
- /* 32-bit ARM: test the NEON bit of the ELF hardware capabilities. */
- poly1305_use_neon = elf_hwcap & HWCAP_NEON;
-#endif
-}
-
-/*
- * C view of the opaque Poly1305 context that is handed to the assembly
- * routines, used by convert_to_base2_64() to rewrite the accumulator.
- *
- * The accumulator is a union: h[5] is the five-limb view used while
- * is_base2_26 is non-zero, h0/h1/h2 is the view used by the scalar code.
- * NOTE(review): the field order must match the offsets the assembly uses;
- * only the 32-bit ARM assembly is available to cross-check here.
- */
-#if defined(CONFIG_ZINC_ARCH_ARM64)
-struct poly1305_arch_internal {
- union {
- u32 h[5];
- struct {
- u64 h0, h1, h2;
- };
- };
- u64 is_base2_26;
- u64 r[2];
-};
-#elif defined(CONFIG_ZINC_ARCH_ARM)
-struct poly1305_arch_internal {
- union {
- u32 h[5];
- /* Packed so that the 32-bit h2 directly follows h1. */
- struct {
- u64 h0, h1;
- u32 h2;
- } __packed;
- };
- u32 r[4];
- u32 is_base2_26;
-};
-#endif
-
-/* The NEON code uses base 2^26, while the scalar code uses base 2^64 on 64-bit
- * and base 2^32 on 32-bit. If we hit the unfortunate situation of using NEON
- * and then having to go back to scalar -- because the user is silly and has
- * called the update function from two separate contexts -- then we need to
- * convert back to the original base before proceeding. The below function is
- * written for 64-bit integers, and so we have to swap words at the end on
- * big-endian 32-bit. It is possible to reason that the initial reduction below
- * is sufficient given the implementation invariants. However, for an avoidance
- * of doubt and because this is not performance critical, we do the full
- * reduction anyway.
- *
- * ctx is the opaque Poly1305 context, interpreted as
- * struct poly1305_arch_internal. The function returns without touching it
- * when kernel-mode NEON is not configured or the state is not in base 2^26.
- */
-static void convert_to_base2_64(void *ctx)
-{
- struct poly1305_arch_internal *state = ctx;
- u32 cy;
-
- if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !state->is_base2_26)
- return;
-
- /* Propagate carries so that h[0]..h[3] each hold at most 26 bits. */
- cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
- cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
- cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
- cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
- /*
-  * Repack the five limbs into h0/h1/h2. h0/h1/h2 share storage with h[],
-  * so the order of these three statements matters: each one reads only
-  * limbs that the statements before it have not overwritten.
-  */
- state->h0 = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
- state->h1 = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
- state->h2 = state->h[4] >> 24;
- /* Swap the 32-bit halves of h0 and h1 on big-endian 32-bit ARM. */
- if (IS_ENABLED(CONFIG_ZINC_ARCH_ARM) && IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) {
- state->h0 = rol64(state->h0, 32);
- state->h1 = rol64(state->h1, 32);
- }
- /* ULT(a, b): branch-free unsigned "a < b", yielding 1 or 0. */
-#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
- /* cy = 5 * (h2 >> 2): fold the bits above the low two of h2 into h0. */
- cy = (state->h2 >> 2) + (state->h2 & ~3ULL);
- state->h2 &= 3;
- state->h0 += cy;
- /* A sum smaller than its addend means the addition wrapped: carry on. */
- state->h1 += (cy = ULT(state->h0, cy));
- state->h2 += ULT(state->h1, cy);
-#undef ULT
- /* The state is no longer in base 2^26. */
- state->is_base2_26 = 0;
-}
-
-/*
- * Initialise the Poly1305 context by calling the assembly routine
- * poly1305_init_arm(). Always returns true.
- */
-static inline bool poly1305_init_arch(void *ctx,
- const u8 key[POLY1305_KEY_SIZE])
-{
- poly1305_init_arm(ctx, key);
- return true;
-}
-
-/*
- * Feed len bytes at inp into the Poly1305 state in ctx.
- *
- * When kernel-mode NEON is configured, the CPU has NEON and the SIMD context
- * may be used, the input is passed to poly1305_blocks_neon() in chunks of at
- * most PAGE_SIZE bytes, relaxing the SIMD context between chunks. Otherwise
- * the state is first converted back to the scalar representation and the
- * whole input is passed to poly1305_blocks_arm().
- *
- * Always returns true.
- */
-static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp,
-					size_t len, const u32 padbit,
-					simd_context_t *simd_context)
-{
-	size_t chunk;
-
-	/* SIMD disables preemption, so relax after processing each page. */
-	BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
-		     PAGE_SIZE % POLY1305_BLOCK_SIZE);
-
-	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && poly1305_use_neon &&
-	    simd_use(simd_context)) {
-		/* The NEON routine is invoked at least once, even for len == 0. */
-		do {
-			chunk = min_t(size_t, len, PAGE_SIZE);
-			poly1305_blocks_neon(ctx, inp, chunk, padbit);
-			inp += chunk;
-			len -= chunk;
-			if (len)
-				simd_relax(simd_context);
-		} while (len);
-		return true;
-	}
-
-	/* Scalar path: make sure the state is not left in base 2^26. */
-	convert_to_base2_64(ctx);
-	poly1305_blocks_arm(ctx, inp, len, padbit);
-	return true;
-}
-
-/*
- * Produce the final tag in mac from the state in ctx and the nonce words.
- *
- * Uses poly1305_emit_neon() when kernel-mode NEON is configured, the CPU has
- * NEON and the SIMD context may be used. Otherwise the state is converted
- * back to the scalar representation and poly1305_emit_arm() is used.
- *
- * Always returns true.
- */
-static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
-				      const u32 nonce[4],
-				      simd_context_t *simd_context)
-{
-	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && poly1305_use_neon &&
-	    simd_use(simd_context)) {
-		poly1305_emit_neon(ctx, mac, nonce);
-		return true;
-	}
-
-	convert_to_base2_64(ctx);
-	poly1305_emit_arm(ctx, mac, nonce);
-	return true;
-}
diff --git a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm.pl b/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm.pl
deleted file mode 100755
index 468f41b76fbd..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm.pl
+++ /dev/null
@@ -1,1276 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-#
-# This code is taken from the OpenSSL project but the author, Andy Polyakov,
-# has relicensed it under the licenses specified in the SPDX header above.
-# The original headers, including the original license headers, are
-# included below for completeness.
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# IALU(*)/gcc-4.4 NEON
-#
-# ARM11xx(ARMv6) 7.78/+100% -
-# Cortex-A5 6.35/+130% 3.00
-# Cortex-A8 6.25/+115% 2.36
-# Cortex-A9 5.10/+95% 2.55
-# Cortex-A15 3.85/+85% 1.25(**)
-# Snapdragon S4 5.70/+100% 1.48(**)
-#
-# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data;
-# (**) these are trade-off results, they can be improved by ~8% but at
-# the cost of 15/12% regression on Cortex-A5/A7, it's even possible
-# to improve Cortex-A9 result, but then A5/A7 lose more than 20%;
-
-$flavour = shift;
-if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
-
-if ($flavour && $flavour ne "void") {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open STDOUT,"| \"$^X\" $xlate $flavour $output";
-} else {
- open STDOUT,">$output";
-}
-
-($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
-# define poly1305_init poly1305_init_arm
-# define poly1305_blocks poly1305_blocks_arm
-# define poly1305_emit poly1305_emit_arm
-#endif
-
-.text
-#if defined(__thumb2__)
-.syntax unified
-.thumb
-#else
-.code 32
-#endif
-
-.globl poly1305_emit
-.globl poly1305_blocks
-.globl poly1305_init
-.type poly1305_init,%function
-.align 5
-poly1305_init:
-.Lpoly1305_init:
- stmdb sp!,{r4-r11}
-
- eor r3,r3,r3
- cmp $inp,#0
- str r3,[$ctx,#0] @ zero hash value
- str r3,[$ctx,#4]
- str r3,[$ctx,#8]
- str r3,[$ctx,#12]
- str r3,[$ctx,#16]
- str r3,[$ctx,#36] @ is_base2_26
- add $ctx,$ctx,#20
-
-#ifdef __thumb2__
- it eq
-#endif
- moveq r0,#0
- beq .Lno_key
-
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- adr r11,.Lpoly1305_init
- ldr r12,.LOPENSSL_armcap
-#endif
- ldrb r4,[$inp,#0]
- mov r10,#0x0fffffff
- ldrb r5,[$inp,#1]
- and r3,r10,#-4 @ 0x0ffffffc
- ldrb r6,[$inp,#2]
- ldrb r7,[$inp,#3]
- orr r4,r4,r5,lsl#8
- ldrb r5,[$inp,#4]
- orr r4,r4,r6,lsl#16
- ldrb r6,[$inp,#5]
- orr r4,r4,r7,lsl#24
- ldrb r7,[$inp,#6]
- and r4,r4,r10
-
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- ldr r12,[r11,r12] @ OPENSSL_armcap_P
-# ifdef __APPLE__
- ldr r12,[r12]
-# endif
-#endif
- ldrb r8,[$inp,#7]
- orr r5,r5,r6,lsl#8
- ldrb r6,[$inp,#8]
- orr r5,r5,r7,lsl#16
- ldrb r7,[$inp,#9]
- orr r5,r5,r8,lsl#24
- ldrb r8,[$inp,#10]
- and r5,r5,r3
-
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- tst r12,#ARMV7_NEON @ check for NEON
-# ifdef __APPLE__
- adr r9,poly1305_blocks_neon
- adr r11,poly1305_blocks
-# ifdef __thumb2__
- it ne
-# endif
- movne r11,r9
- adr r12,poly1305_emit
- adr r10,poly1305_emit_neon
-# ifdef __thumb2__
- it ne
-# endif
- movne r12,r10
-# else
-# ifdef __thumb2__
- itete eq
-# endif
- addeq r12,r11,#(poly1305_emit-.Lpoly1305_init)
- addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
- addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init)
- addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
-# endif
-# ifdef __thumb2__
- orr r12,r12,#1 @ thumb-ify address
- orr r11,r11,#1
-# endif
-#endif
- ldrb r9,[$inp,#11]
- orr r6,r6,r7,lsl#8
- ldrb r7,[$inp,#12]
- orr r6,r6,r8,lsl#16
- ldrb r8,[$inp,#13]
- orr r6,r6,r9,lsl#24
- ldrb r9,[$inp,#14]
- and r6,r6,r3
-
- ldrb r10,[$inp,#15]
- orr r7,r7,r8,lsl#8
- str r4,[$ctx,#0]
- orr r7,r7,r9,lsl#16
- str r5,[$ctx,#4]
- orr r7,r7,r10,lsl#24
- str r6,[$ctx,#8]
- and r7,r7,r3
- str r7,[$ctx,#12]
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- stmia r2,{r11,r12} @ fill functions table
- mov r0,#1
-#else
- mov r0,#0
-#endif
-.Lno_key:
- ldmia sp!,{r4-r11}
-#if __ARM_ARCH__>=5
- ret @ bx lr
-#else
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_init,.-poly1305_init
-___
-{
-my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
-my ($s1,$s2,$s3)=($r1,$r2,$r3);
-
-$code.=<<___;
-.type poly1305_blocks,%function
-.align 5
-poly1305_blocks:
-.Lpoly1305_blocks:
- stmdb sp!,{r3-r11,lr}
-
- ands $len,$len,#-16
- beq .Lno_data
-
- cmp $padbit,#0
- add $len,$len,$inp @ end pointer
- sub sp,sp,#32
-
- ldmia $ctx,{$h0-$r3} @ load context
-
- str $ctx,[sp,#12] @ offload stuff
- mov lr,$inp
- str $len,[sp,#16]
- str $r1,[sp,#20]
- str $r2,[sp,#24]
- str $r3,[sp,#28]
- b .Loop
-
-.Loop:
-#if __ARM_ARCH__<7
- ldrb r0,[lr],#16 @ load input
-# ifdef __thumb2__
- it hi
-# endif
- addhi $h4,$h4,#1 @ 1<<128
- ldrb r1,[lr,#-15]
- ldrb r2,[lr,#-14]
- ldrb r3,[lr,#-13]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-12]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-11]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-10]
- adds $h0,$h0,r3 @ accumulate input
-
- ldrb r3,[lr,#-9]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-8]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-7]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-6]
- adcs $h1,$h1,r3
-
- ldrb r3,[lr,#-5]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-4]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-3]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-2]
- adcs $h2,$h2,r3
-
- ldrb r3,[lr,#-1]
- orr r1,r0,r1,lsl#8
- str lr,[sp,#8] @ offload input pointer
- orr r2,r1,r2,lsl#16
- add $s1,$r1,$r1,lsr#2
- orr r3,r2,r3,lsl#24
-#else
- ldr r0,[lr],#16 @ load input
-# ifdef __thumb2__
- it hi
-# endif
- addhi $h4,$h4,#1 @ padbit
- ldr r1,[lr,#-12]
- ldr r2,[lr,#-8]
- ldr r3,[lr,#-4]
-# ifdef __ARMEB__
- rev r0,r0
- rev r1,r1
- rev r2,r2
- rev r3,r3
-# endif
- adds $h0,$h0,r0 @ accumulate input
- str lr,[sp,#8] @ offload input pointer
- adcs $h1,$h1,r1
- add $s1,$r1,$r1,lsr#2
- adcs $h2,$h2,r2
-#endif
- add $s2,$r2,$r2,lsr#2
- adcs $h3,$h3,r3
- add $s3,$r3,$r3,lsr#2
-
- umull r2,r3,$h1,$r0
- adc $h4,$h4,#0
- umull r0,r1,$h0,$r0
- umlal r2,r3,$h4,$s1
- umlal r0,r1,$h3,$s1
- ldr $r1,[sp,#20] @ reload $r1
- umlal r2,r3,$h2,$s3
- umlal r0,r1,$h1,$s3
- umlal r2,r3,$h3,$s2
- umlal r0,r1,$h2,$s2
- umlal r2,r3,$h0,$r1
- str r0,[sp,#0] @ future $h0
- mul r0,$s2,$h4
- ldr $r2,[sp,#24] @ reload $r2
- adds r2,r2,r1 @ d1+=d0>>32
- eor r1,r1,r1
- adc lr,r3,#0 @ future $h2
- str r2,[sp,#4] @ future $h1
-
- mul r2,$s3,$h4
- eor r3,r3,r3
- umlal r0,r1,$h3,$s3
- ldr $r3,[sp,#28] @ reload $r3
- umlal r2,r3,$h3,$r0
- umlal r0,r1,$h2,$r0
- umlal r2,r3,$h2,$r1
- umlal r0,r1,$h1,$r1
- umlal r2,r3,$h1,$r2
- umlal r0,r1,$h0,$r2
- umlal r2,r3,$h0,$r3
- ldr $h0,[sp,#0]
- mul $h4,$r0,$h4
- ldr $h1,[sp,#4]
-
- adds $h2,lr,r0 @ d2+=d1>>32
- ldr lr,[sp,#8] @ reload input pointer
- adc r1,r1,#0
- adds $h3,r2,r1 @ d3+=d2>>32
- ldr r0,[sp,#16] @ reload end pointer
- adc r3,r3,#0
- add $h4,$h4,r3 @ h4+=d3>>32
-
- and r1,$h4,#-4
- and $h4,$h4,#3
- add r1,r1,r1,lsr#2 @ *=5
- adds $h0,$h0,r1
- adcs $h1,$h1,#0
- adcs $h2,$h2,#0
- adcs $h3,$h3,#0
- adc $h4,$h4,#0
-
- cmp r0,lr @ done yet?
- bhi .Loop
-
- ldr $ctx,[sp,#12]
- add sp,sp,#32
- stmia $ctx,{$h0-$h4} @ store the result
-
-.Lno_data:
-#if __ARM_ARCH__>=5
- ldmia sp!,{r3-r11,pc}
-#else
- ldmia sp!,{r3-r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_blocks,.-poly1305_blocks
-___
-}
-{
-my ($ctx,$mac,$nonce)=map("r$_",(0..2));
-my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
-my $g4=$h4;
-
-$code.=<<___;
-.type poly1305_emit,%function
-.align 5
-poly1305_emit:
- stmdb sp!,{r4-r11}
-.Lpoly1305_emit_enter:
-
- ldmia $ctx,{$h0-$h4}
- adds $g0,$h0,#5 @ compare to modulus
- adcs $g1,$h1,#0
- adcs $g2,$h2,#0
- adcs $g3,$h3,#0
- adc $g4,$h4,#0
- tst $g4,#4 @ did it carry/borrow?
-
-#ifdef __thumb2__
- it ne
-#endif
- movne $h0,$g0
- ldr $g0,[$nonce,#0]
-#ifdef __thumb2__
- it ne
-#endif
- movne $h1,$g1
- ldr $g1,[$nonce,#4]
-#ifdef __thumb2__
- it ne
-#endif
- movne $h2,$g2
- ldr $g2,[$nonce,#8]
-#ifdef __thumb2__
- it ne
-#endif
- movne $h3,$g3
- ldr $g3,[$nonce,#12]
-
- adds $h0,$h0,$g0
- adcs $h1,$h1,$g1
- adcs $h2,$h2,$g2
- adc $h3,$h3,$g3
-
-#if __ARM_ARCH__>=7
-# ifdef __ARMEB__
- rev $h0,$h0
- rev $h1,$h1
- rev $h2,$h2
- rev $h3,$h3
-# endif
- str $h0,[$mac,#0]
- str $h1,[$mac,#4]
- str $h2,[$mac,#8]
- str $h3,[$mac,#12]
-#else
- strb $h0,[$mac,#0]
- mov $h0,$h0,lsr#8
- strb $h1,[$mac,#4]
- mov $h1,$h1,lsr#8
- strb $h2,[$mac,#8]
- mov $h2,$h2,lsr#8
- strb $h3,[$mac,#12]
- mov $h3,$h3,lsr#8
-
- strb $h0,[$mac,#1]
- mov $h0,$h0,lsr#8
- strb $h1,[$mac,#5]
- mov $h1,$h1,lsr#8
- strb $h2,[$mac,#9]
- mov $h2,$h2,lsr#8
- strb $h3,[$mac,#13]
- mov $h3,$h3,lsr#8
-
- strb $h0,[$mac,#2]
- mov $h0,$h0,lsr#8
- strb $h1,[$mac,#6]
- mov $h1,$h1,lsr#8
- strb $h2,[$mac,#10]
- mov $h2,$h2,lsr#8
- strb $h3,[$mac,#14]
- mov $h3,$h3,lsr#8
-
- strb $h0,[$mac,#3]
- strb $h1,[$mac,#7]
- strb $h2,[$mac,#11]
- strb $h3,[$mac,#15]
-#endif
- ldmia sp!,{r4-r11}
-#if __ARM_ARCH__>=5
- ret @ bx lr
-#else
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_emit,.-poly1305_emit
-___
-{
-my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
-my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
-my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
-
-my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
-
-$code.=<<___;
-#if (defined(__KERNEL__) && defined(CONFIG_KERNEL_MODE_NEON)) || (!defined(__KERNEL__) && __ARM_MAX_ARCH__>=7)
-.fpu neon
-
-.type poly1305_init_neon,%function
-.align 5
-poly1305_init_neon:
-.Lpoly1305_init_neon:
- ldr r4,[$ctx,#20] @ load key base 2^32
- ldr r5,[$ctx,#24]
- ldr r6,[$ctx,#28]
- ldr r7,[$ctx,#32]
-
- and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
- mov r3,r4,lsr#26
- mov r4,r5,lsr#20
- orr r3,r3,r5,lsl#6
- mov r5,r6,lsr#14
- orr r4,r4,r6,lsl#12
- mov r6,r7,lsr#8
- orr r5,r5,r7,lsl#18
- and r3,r3,#0x03ffffff
- and r4,r4,#0x03ffffff
- and r5,r5,#0x03ffffff
-
- vdup.32 $R0,r2 @ r^1 in both lanes
- add r2,r3,r3,lsl#2 @ *5
- vdup.32 $R1,r3
- add r3,r4,r4,lsl#2
- vdup.32 $S1,r2
- vdup.32 $R2,r4
- add r4,r5,r5,lsl#2
- vdup.32 $S2,r3
- vdup.32 $R3,r5
- add r5,r6,r6,lsl#2
- vdup.32 $S3,r4
- vdup.32 $R4,r6
- vdup.32 $S4,r5
-
- mov $zeros,#2 @ counter
-
-.Lsquare_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
-
- vmull.u32 $D0,$R0,${R0}[1]
- vmull.u32 $D1,$R1,${R0}[1]
- vmull.u32 $D2,$R2,${R0}[1]
- vmull.u32 $D3,$R3,${R0}[1]
- vmull.u32 $D4,$R4,${R0}[1]
-
- vmlal.u32 $D0,$R4,${S1}[1]
- vmlal.u32 $D1,$R0,${R1}[1]
- vmlal.u32 $D2,$R1,${R1}[1]
- vmlal.u32 $D3,$R2,${R1}[1]
- vmlal.u32 $D4,$R3,${R1}[1]
-
- vmlal.u32 $D0,$R3,${S2}[1]
- vmlal.u32 $D1,$R4,${S2}[1]
- vmlal.u32 $D3,$R1,${R2}[1]
- vmlal.u32 $D2,$R0,${R2}[1]
- vmlal.u32 $D4,$R2,${R2}[1]
-
- vmlal.u32 $D0,$R2,${S3}[1]
- vmlal.u32 $D3,$R0,${R3}[1]
- vmlal.u32 $D1,$R3,${S3}[1]
- vmlal.u32 $D2,$R4,${S3}[1]
- vmlal.u32 $D4,$R1,${R3}[1]
-
- vmlal.u32 $D3,$R4,${S4}[1]
- vmlal.u32 $D0,$R1,${S4}[1]
- vmlal.u32 $D1,$R2,${S4}[1]
- vmlal.u32 $D2,$R3,${S4}[1]
- vmlal.u32 $D4,$R0,${R4}[1]
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- @ and P. Schwabe
- @
- @ H0>>+H1>>+H2>>+H3>>+H4
- @ H3>>+H4>>*5+H0>>+H1
- @
- @ Trivia.
- @
- @ Result of multiplication of n-bit number by m-bit number is
- @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
- @ m-bit number multiplied by 2^n is still n+m bits wide.
- @
- @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
- @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
- @ one is n+1 bits wide.
- @
- @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
- @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
- @ can be 27. However! In cases when their width exceeds 26 bits
- @ they are limited by 2^26+2^6. This in turn means that *sum*
- @ of the products with these values can still be viewed as sum
- @ of 52-bit numbers as long as the amount of addends is not a
- @ power of 2. For example,
- @
- @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
- @
- @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
- @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
- @ 8 * (2^52) or 2^55. However, the value is then multiplied by
- @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
- @ which is less than 32 * (2^52) or 2^57. And when processing
- @ data we are looking at triple as many addends...
- @
- @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
- @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
- @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
- @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
- @ instruction accepts 2x32-bit input and writes 2x64-bit result.
- @ This means that result of reduction have to be compressed upon
- @ loop wrap-around. This can be done in the process of reduction
- @ to minimize amount of instructions [as well as amount of
- @ 128-bit instructions, which benefits low-end processors], but
- @ one has to watch for H2 (which is narrower than H0) and 5*H4
- @ not being wider than 58 bits, so that result of right shift
- @ by 26 bits fits in 32 bits. This is also useful on x86,
- @ because it allows to use paddd in place for paddq, which
- @ benefits Atom, where paddq is ridiculously slow.
-
- vshr.u64 $T0,$D3,#26
- vmovn.i64 $D3#lo,$D3
- vshr.u64 $T1,$D0,#26
- vmovn.i64 $D0#lo,$D0
- vadd.i64 $D4,$D4,$T0 @ h3 -> h4
- vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
- vadd.i64 $D1,$D1,$T1 @ h0 -> h1
- vbic.i32 $D0#lo,#0xfc000000
-
- vshrn.u64 $T0#lo,$D4,#26
- vmovn.i64 $D4#lo,$D4
- vshr.u64 $T1,$D1,#26
- vmovn.i64 $D1#lo,$D1
- vadd.i64 $D2,$D2,$T1 @ h1 -> h2
- vbic.i32 $D4#lo,#0xfc000000
- vbic.i32 $D1#lo,#0xfc000000
-
- vadd.i32 $D0#lo,$D0#lo,$T0#lo
- vshl.u32 $T0#lo,$T0#lo,#2
- vshrn.u64 $T1#lo,$D2,#26
- vmovn.i64 $D2#lo,$D2
- vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
- vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
- vbic.i32 $D2#lo,#0xfc000000
-
- vshr.u32 $T0#lo,$D0#lo,#26
- vbic.i32 $D0#lo,#0xfc000000
- vshr.u32 $T1#lo,$D3#lo,#26
- vbic.i32 $D3#lo,#0xfc000000
- vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
- vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
-
- subs $zeros,$zeros,#1
- beq .Lsquare_break_neon
-
- add $tbl0,$ctx,#(48+0*9*4)
- add $tbl1,$ctx,#(48+1*9*4)
-
- vtrn.32 $R0,$D0#lo @ r^2:r^1
- vtrn.32 $R2,$D2#lo
- vtrn.32 $R3,$D3#lo
- vtrn.32 $R1,$D1#lo
- vtrn.32 $R4,$D4#lo
-
- vshl.u32 $S2,$R2,#2 @ *5
- vshl.u32 $S3,$R3,#2
- vshl.u32 $S1,$R1,#2
- vshl.u32 $S4,$R4,#2
- vadd.i32 $S2,$S2,$R2
- vadd.i32 $S1,$S1,$R1
- vadd.i32 $S3,$S3,$R3
- vadd.i32 $S4,$S4,$R4
-
- vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
- vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
- vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vst1.32 {${S4}[0]},[$tbl0,:32]
- vst1.32 {${S4}[1]},[$tbl1,:32]
-
- b .Lsquare_neon
-
-.align 4
-.Lsquare_break_neon:
- add $tbl0,$ctx,#(48+2*4*9)
- add $tbl1,$ctx,#(48+3*4*9)
-
- vmov $R0,$D0#lo @ r^4:r^3
- vshl.u32 $S1,$D1#lo,#2 @ *5
- vmov $R1,$D1#lo
- vshl.u32 $S2,$D2#lo,#2
- vmov $R2,$D2#lo
- vshl.u32 $S3,$D3#lo,#2
- vmov $R3,$D3#lo
- vshl.u32 $S4,$D4#lo,#2
- vmov $R4,$D4#lo
- vadd.i32 $S1,$S1,$D1#lo
- vadd.i32 $S2,$S2,$D2#lo
- vadd.i32 $S3,$S3,$D3#lo
- vadd.i32 $S4,$S4,$D4#lo
-
- vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
- vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
- vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vst1.32 {${S4}[0]},[$tbl0]
- vst1.32 {${S4}[1]},[$tbl1]
-
- ret @ bx lr
-.size poly1305_init_neon,.-poly1305_init_neon
-
-#ifdef __KERNEL__
-.globl poly1305_blocks_neon
-#endif
-.type poly1305_blocks_neon,%function
-.align 5
-poly1305_blocks_neon:
- ldr ip,[$ctx,#36] @ is_base2_26
- ands $len,$len,#-16
- beq .Lno_data_neon
-
- cmp $len,#64
- bhs .Lenter_neon
- tst ip,ip @ is_base2_26?
- beq .Lpoly1305_blocks
-
-.Lenter_neon:
- stmdb sp!,{r4-r7}
- vstmdb sp!,{d8-d15} @ ABI specification says so
-
- tst ip,ip @ is_base2_26?
- bne .Lbase2_26_neon
-
- stmdb sp!,{r1-r3,lr}
- bl .Lpoly1305_init_neon
-
- ldr r4,[$ctx,#0] @ load hash value base 2^32
- ldr r5,[$ctx,#4]
- ldr r6,[$ctx,#8]
- ldr r7,[$ctx,#12]
- ldr ip,[$ctx,#16]
-
- and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
- mov r3,r4,lsr#26
- veor $D0#lo,$D0#lo,$D0#lo
- mov r4,r5,lsr#20
- orr r3,r3,r5,lsl#6
- veor $D1#lo,$D1#lo,$D1#lo
- mov r5,r6,lsr#14
- orr r4,r4,r6,lsl#12
- veor $D2#lo,$D2#lo,$D2#lo
- mov r6,r7,lsr#8
- orr r5,r5,r7,lsl#18
- veor $D3#lo,$D3#lo,$D3#lo
- and r3,r3,#0x03ffffff
- orr r6,r6,ip,lsl#24
- veor $D4#lo,$D4#lo,$D4#lo
- and r4,r4,#0x03ffffff
- mov r1,#1
- and r5,r5,#0x03ffffff
- str r1,[$ctx,#36] @ is_base2_26
-
- vmov.32 $D0#lo[0],r2
- vmov.32 $D1#lo[0],r3
- vmov.32 $D2#lo[0],r4
- vmov.32 $D3#lo[0],r5
- vmov.32 $D4#lo[0],r6
- adr $zeros,.Lzeros
-
- ldmia sp!,{r1-r3,lr}
- b .Lbase2_32_neon
-
-.align 4
-.Lbase2_26_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ load hash value
-
- veor $D0#lo,$D0#lo,$D0#lo
- veor $D1#lo,$D1#lo,$D1#lo
- veor $D2#lo,$D2#lo,$D2#lo
- veor $D3#lo,$D3#lo,$D3#lo
- veor $D4#lo,$D4#lo,$D4#lo
- vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
- adr $zeros,.Lzeros
- vld1.32 {$D4#lo[0]},[$ctx]
- sub $ctx,$ctx,#16 @ rewind
-
-.Lbase2_32_neon:
- add $in2,$inp,#32
- mov $padbit,$padbit,lsl#24
- tst $len,#31
- beq .Leven
-
- vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
- vmov.32 $H4#lo[0],$padbit
- sub $len,$len,#16
- add $in2,$inp,#32
-
-# ifdef __ARMEB__
- vrev32.8 $H0,$H0
- vrev32.8 $H3,$H3
- vrev32.8 $H1,$H1
- vrev32.8 $H2,$H2
-# endif
- vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
- vshl.u32 $H3#lo,$H3#lo,#18
-
- vsri.u32 $H3#lo,$H2#lo,#14
- vshl.u32 $H2#lo,$H2#lo,#12
- vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
-
- vbic.i32 $H3#lo,#0xfc000000
- vsri.u32 $H2#lo,$H1#lo,#20
- vshl.u32 $H1#lo,$H1#lo,#6
-
- vbic.i32 $H2#lo,#0xfc000000
- vsri.u32 $H1#lo,$H0#lo,#26
- vadd.i32 $H3#hi,$H3#lo,$D3#lo
-
- vbic.i32 $H0#lo,#0xfc000000
- vbic.i32 $H1#lo,#0xfc000000
- vadd.i32 $H2#hi,$H2#lo,$D2#lo
-
- vadd.i32 $H0#hi,$H0#lo,$D0#lo
- vadd.i32 $H1#hi,$H1#lo,$D1#lo
-
- mov $tbl1,$zeros
- add $tbl0,$ctx,#48
-
- cmp $len,$len
- b .Long_tail
-
-.align 4
-.Leven:
- subs $len,$len,#64
- it lo
- movlo $in2,$zeros
-
- vmov.i32 $H4,#1<<24 @ padbit, yes, always
- vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
- add $inp,$inp,#64
- vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
- add $in2,$in2,#64
- itt hi
- addhi $tbl1,$ctx,#(48+1*9*4)
- addhi $tbl0,$ctx,#(48+3*9*4)
-
-# ifdef __ARMEB__
- vrev32.8 $H0,$H0
- vrev32.8 $H3,$H3
- vrev32.8 $H1,$H1
- vrev32.8 $H2,$H2
-# endif
- vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
- vshl.u32 $H3,$H3,#18
-
- vsri.u32 $H3,$H2,#14
- vshl.u32 $H2,$H2,#12
-
- vbic.i32 $H3,#0xfc000000
- vsri.u32 $H2,$H1,#20
- vshl.u32 $H1,$H1,#6
-
- vbic.i32 $H2,#0xfc000000
- vsri.u32 $H1,$H0,#26
-
- vbic.i32 $H0,#0xfc000000
- vbic.i32 $H1,#0xfc000000
-
- bls .Lskip_loop
-
- vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
- vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
- vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- b .Loop_neon
-
-.align 5
-.Loop_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- @ \___________________/
- @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- @ \___________________/ \____________________/
- @
- @ Note that we start with inp[2:3]*r^2. This is because it
- @ doesn't depend on reduction in previous iteration.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ inp[2:3]*r^2
-
- vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
- vmull.u32 $D2,$H2#hi,${R0}[1]
- vadd.i32 $H0#lo,$H0#lo,$D0#lo
- vmull.u32 $D0,$H0#hi,${R0}[1]
- vadd.i32 $H3#lo,$H3#lo,$D3#lo
- vmull.u32 $D3,$H3#hi,${R0}[1]
- vmlal.u32 $D2,$H1#hi,${R1}[1]
- vadd.i32 $H1#lo,$H1#lo,$D1#lo
- vmull.u32 $D1,$H1#hi,${R0}[1]
-
- vadd.i32 $H4#lo,$H4#lo,$D4#lo
- vmull.u32 $D4,$H4#hi,${R0}[1]
- subs $len,$len,#64
- vmlal.u32 $D0,$H4#hi,${S1}[1]
- it lo
- movlo $in2,$zeros
- vmlal.u32 $D3,$H2#hi,${R1}[1]
- vld1.32 ${S4}[1],[$tbl1,:32]
- vmlal.u32 $D1,$H0#hi,${R1}[1]
- vmlal.u32 $D4,$H3#hi,${R1}[1]
-
- vmlal.u32 $D0,$H3#hi,${S2}[1]
- vmlal.u32 $D3,$H1#hi,${R2}[1]
- vmlal.u32 $D4,$H2#hi,${R2}[1]
- vmlal.u32 $D1,$H4#hi,${S2}[1]
- vmlal.u32 $D2,$H0#hi,${R2}[1]
-
- vmlal.u32 $D3,$H0#hi,${R3}[1]
- vmlal.u32 $D0,$H2#hi,${S3}[1]
- vmlal.u32 $D4,$H1#hi,${R3}[1]
- vmlal.u32 $D1,$H3#hi,${S3}[1]
- vmlal.u32 $D2,$H4#hi,${S3}[1]
-
- vmlal.u32 $D3,$H4#hi,${S4}[1]
- vmlal.u32 $D0,$H1#hi,${S4}[1]
- vmlal.u32 $D4,$H0#hi,${R4}[1]
- vmlal.u32 $D1,$H2#hi,${S4}[1]
- vmlal.u32 $D2,$H3#hi,${S4}[1]
-
- vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
- add $in2,$in2,#64
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ (hash+inp[0:1])*r^4 and accumulate
-
- vmlal.u32 $D3,$H3#lo,${R0}[0]
- vmlal.u32 $D0,$H0#lo,${R0}[0]
- vmlal.u32 $D4,$H4#lo,${R0}[0]
- vmlal.u32 $D1,$H1#lo,${R0}[0]
- vmlal.u32 $D2,$H2#lo,${R0}[0]
- vld1.32 ${S4}[0],[$tbl0,:32]
-
- vmlal.u32 $D3,$H2#lo,${R1}[0]
- vmlal.u32 $D0,$H4#lo,${S1}[0]
- vmlal.u32 $D4,$H3#lo,${R1}[0]
- vmlal.u32 $D1,$H0#lo,${R1}[0]
- vmlal.u32 $D2,$H1#lo,${R1}[0]
-
- vmlal.u32 $D3,$H1#lo,${R2}[0]
- vmlal.u32 $D0,$H3#lo,${S2}[0]
- vmlal.u32 $D4,$H2#lo,${R2}[0]
- vmlal.u32 $D1,$H4#lo,${S2}[0]
- vmlal.u32 $D2,$H0#lo,${R2}[0]
-
- vmlal.u32 $D3,$H0#lo,${R3}[0]
- vmlal.u32 $D0,$H2#lo,${S3}[0]
- vmlal.u32 $D4,$H1#lo,${R3}[0]
- vmlal.u32 $D1,$H3#lo,${S3}[0]
- vmlal.u32 $D3,$H4#lo,${S4}[0]
-
- vmlal.u32 $D2,$H4#lo,${S3}[0]
- vmlal.u32 $D0,$H1#lo,${S4}[0]
- vmlal.u32 $D4,$H0#lo,${R4}[0]
- vmov.i32 $H4,#1<<24 @ padbit, yes, always
- vmlal.u32 $D1,$H2#lo,${S4}[0]
- vmlal.u32 $D2,$H3#lo,${S4}[0]
-
- vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
- add $inp,$inp,#64
-# ifdef __ARMEB__
- vrev32.8 $H0,$H0
- vrev32.8 $H1,$H1
- vrev32.8 $H2,$H2
- vrev32.8 $H3,$H3
-# endif
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction interleaved with base 2^32 -> base 2^26 of
- @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
-
- vshr.u64 $T0,$D3,#26
- vmovn.i64 $D3#lo,$D3
- vshr.u64 $T1,$D0,#26
- vmovn.i64 $D0#lo,$D0
- vadd.i64 $D4,$D4,$T0 @ h3 -> h4
- vbic.i32 $D3#lo,#0xfc000000
- vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
- vadd.i64 $D1,$D1,$T1 @ h0 -> h1
- vshl.u32 $H3,$H3,#18
- vbic.i32 $D0#lo,#0xfc000000
-
- vshrn.u64 $T0#lo,$D4,#26
- vmovn.i64 $D4#lo,$D4
- vshr.u64 $T1,$D1,#26
- vmovn.i64 $D1#lo,$D1
- vadd.i64 $D2,$D2,$T1 @ h1 -> h2
- vsri.u32 $H3,$H2,#14
- vbic.i32 $D4#lo,#0xfc000000
- vshl.u32 $H2,$H2,#12
- vbic.i32 $D1#lo,#0xfc000000
-
- vadd.i32 $D0#lo,$D0#lo,$T0#lo
- vshl.u32 $T0#lo,$T0#lo,#2
- vbic.i32 $H3,#0xfc000000
- vshrn.u64 $T1#lo,$D2,#26
- vmovn.i64 $D2#lo,$D2
- vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
- vsri.u32 $H2,$H1,#20
- vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
- vshl.u32 $H1,$H1,#6
- vbic.i32 $D2#lo,#0xfc000000
- vbic.i32 $H2,#0xfc000000
-
- vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
- vmovn.i64 $D0#lo,$D0
- vsri.u32 $H1,$H0,#26
- vbic.i32 $H0,#0xfc000000
- vshr.u32 $T1#lo,$D3#lo,#26
- vbic.i32 $D3#lo,#0xfc000000
- vbic.i32 $D0#lo,#0xfc000000
- vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
- vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
- vbic.i32 $H1,#0xfc000000
-
- bhi .Loop_neon
-
-.Lskip_loop:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- add $tbl1,$ctx,#(48+0*9*4)
- add $tbl0,$ctx,#(48+1*9*4)
- adds $len,$len,#32
- it ne
- movne $len,#0
- bne .Long_tail
-
- vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
- vadd.i32 $H0#hi,$H0#lo,$D0#lo
- vadd.i32 $H3#hi,$H3#lo,$D3#lo
- vadd.i32 $H1#hi,$H1#lo,$D1#lo
- vadd.i32 $H4#hi,$H4#lo,$D4#lo
-
-.Long_tail:
- vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
- vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
-
- vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
- vmull.u32 $D2,$H2#hi,$R0
- vadd.i32 $H0#lo,$H0#lo,$D0#lo
- vmull.u32 $D0,$H0#hi,$R0
- vadd.i32 $H3#lo,$H3#lo,$D3#lo
- vmull.u32 $D3,$H3#hi,$R0
- vadd.i32 $H1#lo,$H1#lo,$D1#lo
- vmull.u32 $D1,$H1#hi,$R0
- vadd.i32 $H4#lo,$H4#lo,$D4#lo
- vmull.u32 $D4,$H4#hi,$R0
-
- vmlal.u32 $D0,$H4#hi,$S1
- vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vmlal.u32 $D3,$H2#hi,$R1
- vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vmlal.u32 $D1,$H0#hi,$R1
- vmlal.u32 $D4,$H3#hi,$R1
- vmlal.u32 $D2,$H1#hi,$R1
-
- vmlal.u32 $D3,$H1#hi,$R2
- vld1.32 ${S4}[1],[$tbl1,:32]
- vmlal.u32 $D0,$H3#hi,$S2
- vld1.32 ${S4}[0],[$tbl0,:32]
- vmlal.u32 $D4,$H2#hi,$R2
- vmlal.u32 $D1,$H4#hi,$S2
- vmlal.u32 $D2,$H0#hi,$R2
-
- vmlal.u32 $D3,$H0#hi,$R3
- it ne
- addne $tbl1,$ctx,#(48+2*9*4)
- vmlal.u32 $D0,$H2#hi,$S3
- it ne
- addne $tbl0,$ctx,#(48+3*9*4)
- vmlal.u32 $D4,$H1#hi,$R3
- vmlal.u32 $D1,$H3#hi,$S3
- vmlal.u32 $D2,$H4#hi,$S3
-
- vmlal.u32 $D3,$H4#hi,$S4
- vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
- vmlal.u32 $D0,$H1#hi,$S4
- vshr.u64 $MASK,$MASK,#38
- vmlal.u32 $D4,$H0#hi,$R4
- vmlal.u32 $D1,$H2#hi,$S4
- vmlal.u32 $D2,$H3#hi,$S4
-
- beq .Lshort_tail
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ (hash+inp[0:1])*r^4:r^3 and accumulate
-
- vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
- vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
-
- vmlal.u32 $D2,$H2#lo,$R0
- vmlal.u32 $D0,$H0#lo,$R0
- vmlal.u32 $D3,$H3#lo,$R0
- vmlal.u32 $D1,$H1#lo,$R0
- vmlal.u32 $D4,$H4#lo,$R0
-
- vmlal.u32 $D0,$H4#lo,$S1
- vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vmlal.u32 $D3,$H2#lo,$R1
- vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vmlal.u32 $D1,$H0#lo,$R1
- vmlal.u32 $D4,$H3#lo,$R1
- vmlal.u32 $D2,$H1#lo,$R1
-
- vmlal.u32 $D3,$H1#lo,$R2
- vld1.32 ${S4}[1],[$tbl1,:32]
- vmlal.u32 $D0,$H3#lo,$S2
- vld1.32 ${S4}[0],[$tbl0,:32]
- vmlal.u32 $D4,$H2#lo,$R2
- vmlal.u32 $D1,$H4#lo,$S2
- vmlal.u32 $D2,$H0#lo,$R2
-
- vmlal.u32 $D3,$H0#lo,$R3
- vmlal.u32 $D0,$H2#lo,$S3
- vmlal.u32 $D4,$H1#lo,$R3
- vmlal.u32 $D1,$H3#lo,$S3
- vmlal.u32 $D2,$H4#lo,$S3
-
- vmlal.u32 $D3,$H4#lo,$S4
- vorn $MASK,$MASK,$MASK @ all-ones
- vmlal.u32 $D0,$H1#lo,$S4
- vshr.u64 $MASK,$MASK,#38
- vmlal.u32 $D4,$H0#lo,$R4
- vmlal.u32 $D1,$H2#lo,$S4
- vmlal.u32 $D2,$H3#lo,$S4
-
-.Lshort_tail:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ horizontal addition
-
- vadd.i64 $D3#lo,$D3#lo,$D3#hi
- vadd.i64 $D0#lo,$D0#lo,$D0#hi
- vadd.i64 $D4#lo,$D4#lo,$D4#hi
- vadd.i64 $D1#lo,$D1#lo,$D1#hi
- vadd.i64 $D2#lo,$D2#lo,$D2#hi
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction, but without narrowing
-
- vshr.u64 $T0,$D3,#26
- vand.i64 $D3,$D3,$MASK
- vshr.u64 $T1,$D0,#26
- vand.i64 $D0,$D0,$MASK
- vadd.i64 $D4,$D4,$T0 @ h3 -> h4
- vadd.i64 $D1,$D1,$T1 @ h0 -> h1
-
- vshr.u64 $T0,$D4,#26
- vand.i64 $D4,$D4,$MASK
- vshr.u64 $T1,$D1,#26
- vand.i64 $D1,$D1,$MASK
- vadd.i64 $D2,$D2,$T1 @ h1 -> h2
-
- vadd.i64 $D0,$D0,$T0
- vshl.u64 $T0,$T0,#2
- vshr.u64 $T1,$D2,#26
- vand.i64 $D2,$D2,$MASK
- vadd.i64 $D0,$D0,$T0 @ h4 -> h0
- vadd.i64 $D3,$D3,$T1 @ h2 -> h3
-
- vshr.u64 $T0,$D0,#26
- vand.i64 $D0,$D0,$MASK
- vshr.u64 $T1,$D3,#26
- vand.i64 $D3,$D3,$MASK
- vadd.i64 $D1,$D1,$T0 @ h0 -> h1
- vadd.i64 $D4,$D4,$T1 @ h3 -> h4
-
- cmp $len,#0
- bne .Leven
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ store hash value
-
- vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
- vst1.32 {$D4#lo[0]},[$ctx]
-
- vldmia sp!,{d8-d15} @ epilogue
- ldmia sp!,{r4-r7}
-.Lno_data_neon:
- ret @ bx lr
-.size poly1305_blocks_neon,.-poly1305_blocks_neon
-
-#ifdef __KERNEL__
-.globl poly1305_emit_neon
-#endif
-.type poly1305_emit_neon,%function
-.align 5
-poly1305_emit_neon: @ finalize: bring hash to base 2^32, reduce mod 2^130-5, add nonce, store 16-byte tag
- ldr ip,[$ctx,#36] @ is_base2_26
-
- stmdb sp!,{r4-r11} @ restored by the ldmia just before the return
-
- tst ip,ip
- beq .Lpoly1305_emit_enter @ flag clear: continue at a label defined outside this function (presumably the scalar emit body, confirm it pops r4-r11)
-
- ldmia $ctx,{$h0-$h4} @ five 32-bit words holding the base 2^26 limbs
- eor $g0,$g0,$g0 @ zero, used as the base of the top-word sum below
-
- adds $h0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
- mov $h1,$h1,lsr#6
- adcs $h1,$h1,$h2,lsl#20
- mov $h2,$h2,lsr#12
- adcs $h2,$h2,$h3,lsl#14
- mov $h3,$h3,lsr#18
- adcs $h3,$h3,$h4,lsl#8
- adc $h4,$g0,$h4,lsr#24 @ can be partially reduced ...
-
- and $g0,$h4,#-4 @ ... so reduce
- and $h4,$h4,#3 @ keep bits 128-129; was masking the wrong word (bits 96-127), which could mis-decide the final compare
- add $g0,$g0,$g0,lsr#2 @ *= 5
- adds $h0,$h0,$g0 @ fold the excess above 2^130 back in, since 2^130 = 5 mod p
- adcs $h1,$h1,#0
- adcs $h2,$h2,#0
- adcs $h3,$h3,#0
- adc $h4,$h4,#0
-
- adds $g0,$h0,#5 @ compare to modulus
- adcs $g1,$h1,#0
- adcs $g2,$h2,#0
- adcs $g3,$h3,#0
- adc $g4,$h4,#0
- tst $g4,#4 @ did it carry/borrow?
-
- it ne
- movne $h0,$g0 @ bit 130 of h+5 set means h >= 2^130-5: take h+5, whose low 128 bits equal h-p
- ldr $g0,[$nonce,#0] @ nonce loads are interleaved; ldr leaves the flags from tst intact
- it ne
- movne $h1,$g1
- ldr $g1,[$nonce,#4]
- it ne
- movne $h2,$g2
- ldr $g2,[$nonce,#8]
- it ne
- movne $h3,$g3
- ldr $g3,[$nonce,#12]
-
- adds $h0,$h0,$g0 @ accumulate nonce
- adcs $h1,$h1,$g1
- adcs $h2,$h2,$g2
- adc $h3,$h3,$g3 @ plain adc: carry out of bit 127 is discarded
-
-# ifdef __ARMEB__
- rev $h0,$h0
- rev $h1,$h1
- rev $h2,$h2
- rev $h3,$h3
-# endif
- str $h0,[$mac,#0] @ store the result
- str $h1,[$mac,#4]
- str $h2,[$mac,#8]
- str $h3,[$mac,#12]
-
- ldmia sp!,{r4-r11}
- ret @ bx lr
-.size poly1305_emit_neon,.-poly1305_emit_neon
-
-.align 5
-.Lzeros:
-.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-# ifndef __KERNEL__
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-.Lpoly1305_init
-# endif
-#endif
-___
-} }
-$code.=<<___; # trailer appended after all functions
-.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm OPENSSL_armcap_P,4,4
-#endif
-___
-
-open SELF,$0; # re-read this script to copy its leading comment header into the output
-while(<SELF>) {
- next if (/^#!/); # drop the shebang line
- last if (!s/^#/@/ and !/^$/); # turn a leading '#' into '@'; stop at the first line that is neither a comment nor empty
- print;
-}
-close SELF;
-
-foreach (split("\n",$code)) { # post-process the generated assembly one line at a time
- s/\`([^\`]*)\`/eval $1/geo; # replace each backtick-quoted span with the result of evaluating it as Perl
-
- s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or # qN#lo -> d(2N), qN#hi -> d(2N+1); the or-chain applies only the first rule that matches
- s/\bret\b/bx lr/go or # a line rewritten here is not passed to the next rule
- s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
-
- print $_,"\n";
-}
-close STDOUT; # enforce flush
diff --git a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm64.pl b/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm64.pl
deleted file mode 100755
index d513b45a149b..000000000000
--- a/sys/dev/if_wg/module/crypto/zinc/poly1305/poly1305-arm64.pl
+++ /dev/null
@@ -1,974 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-#
-# This code is taken from the OpenSSL project but the author, Andy Polyakov,
-# has relicensed it under the licenses specified in the SPDX header above.
-# The original headers, including the original license headers, are
-# included below for completeness.
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# This module implements Poly1305 hash for ARMv8.
-#
-# June 2015
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone.
-#
-# IALU/gcc-4.9 NEON
-#
-# Apple A7 1.86/+5% 0.72
-# Cortex-A53 2.69/+58% 1.47
-# Cortex-A57 2.70/+7% 1.14
-# Denver 1.64/+50% 1.18(*)
-# X-Gene 2.13/+68% 2.27
-# Mongoose 1.77/+75% 1.12
-# Kryo 2.70/+55% 1.13
-#
-# (*) estimate based on resources availability is less than 1.0,
-# i.e. measured result is worse than expected, presumably binary
-# translator is not almighty;
-
-$flavour=shift; # first command-line argument
-if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } # it looks like name.ext: treat it as the output file, no flavour given
-else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } # otherwise consume arguments until one looks like name.ext
-
-if ($flavour && $flavour ne "void") {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's path
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or # look next to this script first
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or # then in ../../perlasm
- die "can't locate arm-xlate.pl";
-
- open STDOUT,"| \"$^X\" $xlate $flavour $output"; # pipe everything printed through the translator, run with the current perl
-} else {
- open STDOUT,">$output"; # no flavour: write the generated text directly
-}
-
-my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); # x0..x3
-my ($mac,$nonce)=($inp,$len); # same registers (x1, x2) under the names used by the emit routines
-
-my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); # x4..x14 in this order, so d0,d1,d2 are x12,x13,x14
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-.extern OPENSSL_armcap_P
-#else
-# define poly1305_init poly1305_init_arm
-# define poly1305_blocks poly1305_blocks_arm
-# define poly1305_emit poly1305_emit_arm
-#endif
-
-.text
-
-// forward "declarations" are required for Apple
-.globl poly1305_blocks
-.globl poly1305_emit
-.globl poly1305_init
-.type poly1305_init,%function
-.align 5
-poly1305_init: // zero the hash state, clamp the 16-byte key into ctx+32; non-kernel builds also store blocks/emit entry points through the third argument
- cmp $inp,xzr // key pointer NULL?
- stp xzr,xzr,[$ctx] // zero hash value
- stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
-
- csel x0,xzr,x0,eq // return 0 when there is no key
- b.eq .Lno_key
-
-#ifndef __KERNEL__
-# ifdef __ILP32__
- ldrsw $t1,.LOPENSSL_armcap_P // label is defined outside this block; presumably holds an offset to OPENSSL_armcap_P, confirm
-# else
- ldr $t1,.LOPENSSL_armcap_P
-# endif
- adr $t0,.LOPENSSL_armcap_P
- ldr w17,[$t0,$t1] // capability word, tested further down
-#endif
-
- ldp $r0,$r1,[$inp] // load key
- mov $s1,#0xfffffffc0fffffff
- movk $s1,#0x0fff,lsl#48 // top 16 bits replaced: mask is now 0x0ffffffc0fffffff
-#ifdef __AARCH64EB__
- rev $r0,$r0 // flip bytes
- rev $r1,$r1
-#endif
- and $r0,$r0,$s1 // &=0ffffffc0fffffff
- and $s1,$s1,#-4 // mask for the high half also clears bits 0-1
- and $r1,$r1,$s1 // &=0ffffffc0ffffffc
- stp $r0,$r1,[$ctx,#32] // save key value
-
-#ifndef __KERNEL__
- tst w17,#ARMV7_NEON
-
- adr $d0,poly1305_blocks
- adr $r0,poly1305_blocks_neon
- adr $d1,poly1305_emit
- adr $r1,poly1305_emit_neon
-
- csel $d0,$d0,$r0,eq // NEON bit clear: scalar routines, otherwise the NEON ones
- csel $d1,$d1,$r1,eq
-
-# ifdef __ILP32__
- stp w12,w13,[$len] // 32-bit halves of the two selected addresses (x12,x13)
-# else
- stp $d0,$d1,[$len] // third argument receives the pair of function pointers
-# endif
-
- mov x0,#1 // 1: function pointers were written
-#else
- mov x0,#0 // kernel build: nothing written through the third argument
-#endif
-.Lno_key:
- ret
-.size poly1305_init,.-poly1305_init
-
-.type poly1305_blocks,%function
-.align 5
-poly1305_blocks: // scalar path: for each 16-byte block, h = (h + block + padbit*2^128) * r, partially reduced mod 2^130-5
- ands $len,$len,#-16 // round the length down to whole 16-byte blocks
- b.eq .Lno_data // nothing to do
-
- ldp $h0,$h1,[$ctx] // load hash value
- ldp $r0,$r1,[$ctx,#32] // load key value
- ldr $h2,[$ctx,#16] // top word of the hash
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
- b .Loop // jump over the alignment padding
-
-.align 5
-.Loop:
- ldp $t0,$t1,[$inp],#16 // load input
- sub $len,$len,#16
-#ifdef __AARCH64EB__
- rev $t0,$t0
- rev $t1,$t1
-#endif
- adds $h0,$h0,$t0 // accumulate input
- adcs $h1,$h1,$t1
-
- mul $d0,$h0,$r0 // h0*r0
- adc $h2,$h2,$padbit // top word takes the carry plus the caller-supplied pad bit
- umulh $d1,$h0,$r0 // high 64 bits of h0*r0
-
- mul $t0,$h1,$s1 // h1*5*r1
- umulh $t1,$h1,$s1
-
- adds $d0,$d0,$t0
- mul $t0,$h0,$r1 // h0*r1
- adc $d1,$d1,$t1
- umulh $d2,$h0,$r1
-
- adds $d1,$d1,$t0
- mul $t0,$h1,$r0 // h1*r0
- adc $d2,$d2,xzr
- umulh $t1,$h1,$r0
-
- adds $d1,$d1,$t0
- mul $t0,$h2,$s1 // h2*5*r1
- adc $d2,$d2,$t1
- mul $t1,$h2,$r0 // h2*r0
-
- adds $d1,$d1,$t0
- adc $d2,$d2,$t1
-
- and $t0,$d2,#-4 // final reduction
- and $h2,$d2,#3 // bits 128-129 stay in the top word
- add $t0,$t0,$d2,lsr#2 // 4k + k = 5k, where k is the part of d2 above bit 1
- adds $h0,$d0,$t0 // fold 5k into the low word, since 2^130 = 5 mod p
- adcs $h1,$d1,xzr
- adc $h2,$h2,xzr
-
- cbnz $len,.Loop // more blocks left
-
- stp $h0,$h1,[$ctx] // store hash value
- str $h2,[$ctx,#16]
-
-.Lno_data:
- ret
-.size poly1305_blocks,.-poly1305_blocks
-
-.type poly1305_emit,%function
-.align 5
-poly1305_emit: // final step: pick h or h-(2^130-5), add the 128-bit nonce, store the low 128 bits to the mac pointer
- ldp $h0,$h1,[$ctx] // load hash base 2^64
- ldr $h2,[$ctx,#16]
- ldp $t0,$t1,[$nonce] // load nonce
-
- adds $d0,$h0,#5 // compare to modulus
- adcs $d1,$h1,xzr
- adc $d2,$h2,xzr
-
- tst $d2,#-4 // see if it's carried/borrowed
-
- csel $h0,$h0,$d0,eq // eq: h+5 stayed below 2^130, keep h; otherwise take h+5, whose low 128 bits equal h-p
- csel $h1,$h1,$d1,eq
-
-#ifdef __AARCH64EB__
- ror $t0,$t0,#32 // flip nonce words
- ror $t1,$t1,#32
-#endif
- adds $h0,$h0,$t0 // accumulate nonce
- adc $h1,$h1,$t1 // plain adc: carry out of bit 127 is discarded
-#ifdef __AARCH64EB__
- rev $h0,$h0 // flip output bytes
- rev $h1,$h1
-#endif
- stp $h0,$h1,[$mac] // write result
-
- ret
-.size poly1305_emit,.-poly1305_emit
-___
-my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8)); # v0..v8 as 4x32-bit lanes: key-power limbs R and their 5x multiples S
-my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13)); # v9..v13: five limbs of input blocks 0 and 1
-my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18)); # v14..v18: five limbs of input blocks 2 and 3
-my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23)); # v19..v23 as 2x64-bit accumulators
-my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28)); # v24..v28: hash limbs
-my ($T0,$T1,$MASK) = map("v$_",(29..31)); # v29..v31, no lane suffix here; each use appends its own
-
-my ($in2,$zeros)=("x16","x17"); # second input pointer, and pointer to the .Lzeros block
-my $is_base2_26 = $zeros; # borrow
-
-$code.=<<___;
-.type __poly1305_mult,%function
-.align 5
-__poly1305_mult: // h = h*r, partially reduced mod 2^130-5; reads h0-h2, r0, r1, s1; overwrites h0-h2, t0, t1, d0-d2
- mul $d0,$h0,$r0 // h0*r0
- umulh $d1,$h0,$r0 // high 64 bits of h0*r0
-
- mul $t0,$h1,$s1 // h1*5*r1
- umulh $t1,$h1,$s1
-
- adds $d0,$d0,$t0
- mul $t0,$h0,$r1 // h0*r1
- adc $d1,$d1,$t1
- umulh $d2,$h0,$r1
-
- adds $d1,$d1,$t0
- mul $t0,$h1,$r0 // h1*r0
- adc $d2,$d2,xzr
- umulh $t1,$h1,$r0
-
- adds $d1,$d1,$t0
- mul $t0,$h2,$s1 // h2*5*r1
- adc $d2,$d2,$t1
- mul $t1,$h2,$r0 // h2*r0
-
- adds $d1,$d1,$t0
- adc $d2,$d2,$t1
-
- and $t0,$d2,#-4 // final reduction
- and $h2,$d2,#3 // bits 128-129 stay in the top word
- add $t0,$t0,$d2,lsr#2 // 4k + k = 5k, where k is the part of d2 above bit 1
- adds $h0,$d0,$t0 // fold 5k into the low word, since 2^130 = 5 mod p
- adcs $h1,$d1,xzr
- adc $h2,$h2,xzr
-
- ret
-.size __poly1305_mult,.-__poly1305_mult
-
-.type __poly1305_splat,%function
-.align 5
-__poly1305_splat: // split h0-h2 into five 26-bit limbs; store limbs plus 5x multiples as 32-bit words at a 16-byte stride from ctx; overwrites x12-x16
- and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x13,$h0,#26,#26 // limb 1: bits 26-51
- extr x14,$h1,$h0,#52 // limb 2 straddles h0 and h1 (bits 52-77)
- and x14,x14,#0x03ffffff
- ubfx x15,$h1,#14,#26 // limb 3: bits 78-103
- extr x16,$h2,$h1,#40 // limb 4: bits 104 and up, no mask applied
-
- str w12,[$ctx,#16*0] // r0
- add w12,w13,w13,lsl#2 // r1*5
- str w13,[$ctx,#16*1] // r1
- add w13,w14,w14,lsl#2 // r2*5
- str w12,[$ctx,#16*2] // s1
- str w14,[$ctx,#16*3] // r2
- add w14,w15,w15,lsl#2 // r3*5
- str w13,[$ctx,#16*4] // s2
- str w15,[$ctx,#16*5] // r3
- add w15,w16,w16,lsl#2 // r4*5
- str w14,[$ctx,#16*6] // s3
- str w16,[$ctx,#16*7] // r4
- str w15,[$ctx,#16*8] // s4
-
- ret
-.size __poly1305_splat,.-__poly1305_splat
-
-#if !defined(__KERNEL__) || defined(CONFIG_KERNEL_MODE_NEON)
-#ifdef __KERNEL__
-.globl poly1305_blocks_neon
-.globl poly1305_emit_neon
-#endif
-
-.type poly1305_blocks_neon,%function
-.align 5
-poly1305_blocks_neon:
- ldr $is_base2_26,[$ctx,#24]
- cmp $len,#128
- b.hs .Lblocks_neon
- cbz $is_base2_26,poly1305_blocks
-
-.Lblocks_neon:
- stp x29,x30,[sp,#-80]!
- add x29,sp,#0
-
- ands $len,$len,#-16
- b.eq .Lno_data_neon
-
- cbz $is_base2_26,.Lbase2_64_neon
-
- ldp w10,w11,[$ctx] // load hash value base 2^26
- ldp w12,w13,[$ctx,#8]
- ldr w14,[$ctx,#16]
-
- tst $len,#31
- b.eq .Leven_neon
-
- ldp $r0,$r1,[$ctx,#32] // load key value
-
- add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
- lsr $h1,x12,#12
- adds $h0,$h0,x12,lsl#52
- add $h1,$h1,x13,lsl#14
- adc $h1,$h1,xzr
- lsr $h2,x14,#24
- adds $h1,$h1,x14,lsl#40
- adc $d2,$h2,xzr // can be partially reduced...
-
- ldp $d0,$d1,[$inp],#16 // load input
- sub $len,$len,#16
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
-
- and $t0,$d2,#-4 // ... so reduce
- and $h2,$d2,#3
- add $t0,$t0,$d2,lsr#2
- adds $h0,$h0,$t0
- adcs $h1,$h1,xzr
- adc $h2,$h2,xzr
-
-#ifdef __AARCH64EB__
- rev $d0,$d0
- rev $d1,$d1
-#endif
- adds $h0,$h0,$d0 // accumulate input
- adcs $h1,$h1,$d1
- adc $h2,$h2,$padbit
-
- bl __poly1305_mult
- ldr x30,[sp,#8]
-
- cbz $padbit,.Lstore_base2_64_neon
-
- and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x11,$h0,#26,#26
- extr x12,$h1,$h0,#52
- and x12,x12,#0x03ffffff
- ubfx x13,$h1,#14,#26
- extr x14,$h2,$h1,#40
-
- cbnz $len,.Leven_neon
-
- stp w10,w11,[$ctx] // store hash value base 2^26
- stp w12,w13,[$ctx,#8]
- str w14,[$ctx,#16]
- b .Lno_data_neon
-
-.align 4
-.Lstore_base2_64_neon:
- stp $h0,$h1,[$ctx] // store hash value base 2^64
- stp $h2,xzr,[$ctx,#16] // note that is_base2_26 is zeroed
- b .Lno_data_neon
-
-.align 4
-.Lbase2_64_neon:
- ldp $r0,$r1,[$ctx,#32] // load key value
-
- ldp $h0,$h1,[$ctx] // load hash value base 2^64
- ldr $h2,[$ctx,#16]
-
- tst $len,#31
- b.eq .Linit_neon
-
- ldp $d0,$d1,[$inp],#16 // load input
- sub $len,$len,#16
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
-#ifdef __AARCH64EB__
- rev $d0,$d0
- rev $d1,$d1
-#endif
- adds $h0,$h0,$d0 // accumulate input
- adcs $h1,$h1,$d1
- adc $h2,$h2,$padbit
-
- bl __poly1305_mult
-
-.Linit_neon:
- and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x11,$h0,#26,#26
- extr x12,$h1,$h0,#52
- and x12,x12,#0x03ffffff
- ubfx x13,$h1,#14,#26
- extr x14,$h2,$h1,#40
-
- stp d8,d9,[sp,#16] // meet ABI requirements
- stp d10,d11,[sp,#32]
- stp d12,d13,[sp,#48]
- stp d14,d15,[sp,#64]
-
- fmov ${H0},x10
- fmov ${H1},x11
- fmov ${H2},x12
- fmov ${H3},x13
- fmov ${H4},x14
-
- ////////////////////////////////// initialize r^n table
- mov $h0,$r0 // r^1
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
- mov $h1,$r1
- mov $h2,xzr
- add $ctx,$ctx,#48+12
- bl __poly1305_splat
-
- bl __poly1305_mult // r^2
- sub $ctx,$ctx,#4
- bl __poly1305_splat
-
- bl __poly1305_mult // r^3
- sub $ctx,$ctx,#4
- bl __poly1305_splat
-
- bl __poly1305_mult // r^4
- sub $ctx,$ctx,#4
- bl __poly1305_splat
- ldr x30,[sp,#8]
-
- add $in2,$inp,#32
- adr $zeros,.Lzeros
- subs $len,$len,#64
- csel $in2,$zeros,$in2,lo
-
- mov x4,#1
- str x4,[$ctx,#-24] // set is_base2_26
- sub $ctx,$ctx,#48 // restore original $ctx
- b .Ldo_neon
-
-.align 4
-.Leven_neon:
- add $in2,$inp,#32
- adr $zeros,.Lzeros
- subs $len,$len,#64
- csel $in2,$zeros,$in2,lo
-
- stp d8,d9,[sp,#16] // meet ABI requirements
- stp d10,d11,[sp,#32]
- stp d12,d13,[sp,#48]
- stp d14,d15,[sp,#64]
-
- fmov ${H0},x10
- fmov ${H1},x11
- fmov ${H2},x12
- fmov ${H3},x13
- fmov ${H4},x14
-
-.Ldo_neon:
- ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
- ldp x9,x13,[$in2],#48
-
- lsl $padbit,$padbit,#24
- add x15,$ctx,#48
-
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- and x5,x9,#0x03ffffff
- ubfx x6,x8,#26,#26
- ubfx x7,x9,#26,#26
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- extr x8,x12,x8,#52
- extr x9,x13,x9,#52
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- fmov $IN23_0,x4
- and x8,x8,#0x03ffffff
- and x9,x9,#0x03ffffff
- ubfx x10,x12,#14,#26
- ubfx x11,x13,#14,#26
- add x12,$padbit,x12,lsr#40
- add x13,$padbit,x13,lsr#40
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- fmov $IN23_1,x6
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- fmov $IN23_2,x8
- fmov $IN23_3,x10
- fmov $IN23_4,x12
-
- ldp x8,x12,[$inp],#16 // inp[0:1]
- ldp x9,x13,[$inp],#48
-
- ld1 {$R0,$R1,$S1,$R2},[x15],#64
- ld1 {$S2,$R3,$S3,$R4},[x15],#64
- ld1 {$S4},[x15]
-
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- and x5,x9,#0x03ffffff
- ubfx x6,x8,#26,#26
- ubfx x7,x9,#26,#26
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- extr x8,x12,x8,#52
- extr x9,x13,x9,#52
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- fmov $IN01_0,x4
- and x8,x8,#0x03ffffff
- and x9,x9,#0x03ffffff
- ubfx x10,x12,#14,#26
- ubfx x11,x13,#14,#26
- add x12,$padbit,x12,lsr#40
- add x13,$padbit,x13,lsr#40
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- fmov $IN01_1,x6
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- movi $MASK.2d,#-1
- fmov $IN01_2,x8
- fmov $IN01_3,x10
- fmov $IN01_4,x12
- ushr $MASK.2d,$MASK.2d,#38
-
- b.ls .Lskip_loop
-
-.align 4
-.Loop_neon:
- ////////////////////////////////////////////////////////////////
- // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- // \___________________/
- // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- // \___________________/ \____________________/
- //
- // Note that we start with inp[2:3]*r^2. This is because it
- // doesn't depend on reduction in previous iteration.
- ////////////////////////////////////////////////////////////////
- // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
- // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
- // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
- // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
- // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
-
- subs $len,$len,#64
- umull $ACC4,$IN23_0,${R4}[2]
- csel $in2,$zeros,$in2,lo
- umull $ACC3,$IN23_0,${R3}[2]
- umull $ACC2,$IN23_0,${R2}[2]
- ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
- umull $ACC1,$IN23_0,${R1}[2]
- ldp x9,x13,[$in2],#48
- umull $ACC0,$IN23_0,${R0}[2]
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
-
- umlal $ACC4,$IN23_1,${R3}[2]
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- umlal $ACC3,$IN23_1,${R2}[2]
- and x5,x9,#0x03ffffff
- umlal $ACC2,$IN23_1,${R1}[2]
- ubfx x6,x8,#26,#26
- umlal $ACC1,$IN23_1,${R0}[2]
- ubfx x7,x9,#26,#26
- umlal $ACC0,$IN23_1,${S4}[2]
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
-
- umlal $ACC4,$IN23_2,${R2}[2]
- extr x8,x12,x8,#52
- umlal $ACC3,$IN23_2,${R1}[2]
- extr x9,x13,x9,#52
- umlal $ACC2,$IN23_2,${R0}[2]
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- umlal $ACC1,$IN23_2,${S4}[2]
- fmov $IN23_0,x4
- umlal $ACC0,$IN23_2,${S3}[2]
- and x8,x8,#0x03ffffff
-
- umlal $ACC4,$IN23_3,${R1}[2]
- and x9,x9,#0x03ffffff
- umlal $ACC3,$IN23_3,${R0}[2]
- ubfx x10,x12,#14,#26
- umlal $ACC2,$IN23_3,${S4}[2]
- ubfx x11,x13,#14,#26
- umlal $ACC1,$IN23_3,${S3}[2]
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- umlal $ACC0,$IN23_3,${S2}[2]
- fmov $IN23_1,x6
-
- add $IN01_2,$IN01_2,$H2
- add x12,$padbit,x12,lsr#40
- umlal $ACC4,$IN23_4,${R0}[2]
- add x13,$padbit,x13,lsr#40
- umlal $ACC3,$IN23_4,${S4}[2]
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- umlal $ACC2,$IN23_4,${S3}[2]
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- umlal $ACC1,$IN23_4,${S2}[2]
- fmov $IN23_2,x8
- umlal $ACC0,$IN23_4,${S1}[2]
- fmov $IN23_3,x10
-
- ////////////////////////////////////////////////////////////////
- // (hash+inp[0:1])*r^4 and accumulate
-
- add $IN01_0,$IN01_0,$H0
- fmov $IN23_4,x12
- umlal $ACC3,$IN01_2,${R1}[0]
- ldp x8,x12,[$inp],#16 // inp[0:1]
- umlal $ACC0,$IN01_2,${S3}[0]
- ldp x9,x13,[$inp],#48
- umlal $ACC4,$IN01_2,${R2}[0]
- umlal $ACC1,$IN01_2,${S4}[0]
- umlal $ACC2,$IN01_2,${R0}[0]
-#ifdef __AARCH64EB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
-
- add $IN01_1,$IN01_1,$H1
- umlal $ACC3,$IN01_0,${R3}[0]
- umlal $ACC4,$IN01_0,${R4}[0]
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- umlal $ACC2,$IN01_0,${R2}[0]
- and x5,x9,#0x03ffffff
- umlal $ACC0,$IN01_0,${R0}[0]
- ubfx x6,x8,#26,#26
- umlal $ACC1,$IN01_0,${R1}[0]
- ubfx x7,x9,#26,#26
-
- add $IN01_3,$IN01_3,$H3
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- umlal $ACC3,$IN01_1,${R2}[0]
- extr x8,x12,x8,#52
- umlal $ACC4,$IN01_1,${R3}[0]
- extr x9,x13,x9,#52
- umlal $ACC0,$IN01_1,${S4}[0]
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- umlal $ACC2,$IN01_1,${R1}[0]
- fmov $IN01_0,x4
- umlal $ACC1,$IN01_1,${R0}[0]
- and x8,x8,#0x03ffffff
-
- add $IN01_4,$IN01_4,$H4
- and x9,x9,#0x03ffffff
- umlal $ACC3,$IN01_3,${R0}[0]
- ubfx x10,x12,#14,#26
- umlal $ACC0,$IN01_3,${S2}[0]
- ubfx x11,x13,#14,#26
- umlal $ACC4,$IN01_3,${R1}[0]
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- umlal $ACC1,$IN01_3,${S3}[0]
- fmov $IN01_1,x6
- umlal $ACC2,$IN01_3,${S4}[0]
- add x12,$padbit,x12,lsr#40
-
- umlal $ACC3,$IN01_4,${S4}[0]
- add x13,$padbit,x13,lsr#40
- umlal $ACC0,$IN01_4,${S1}[0]
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- umlal $ACC4,$IN01_4,${R0}[0]
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- umlal $ACC1,$IN01_4,${S2}[0]
- fmov $IN01_2,x8
- umlal $ACC2,$IN01_4,${S3}[0]
- fmov $IN01_3,x10
- fmov $IN01_4,x12
-
- /////////////////////////////////////////////////////////////////
- // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- // and P. Schwabe
- //
- // [see discussion in poly1305-armv4 module]
-
- ushr $T0.2d,$ACC3,#26
- xtn $H3,$ACC3
- ushr $T1.2d,$ACC0,#26
- and $ACC0,$ACC0,$MASK.2d
- add $ACC4,$ACC4,$T0.2d // h3 -> h4
- bic $H3,#0xfc,lsl#24 // &=0x03ffffff
- add $ACC1,$ACC1,$T1.2d // h0 -> h1
-
- ushr $T0.2d,$ACC4,#26
- xtn $H4,$ACC4
- ushr $T1.2d,$ACC1,#26
- xtn $H1,$ACC1
- bic $H4,#0xfc,lsl#24
- add $ACC2,$ACC2,$T1.2d // h1 -> h2
-
- add $ACC0,$ACC0,$T0.2d
- shl $T0.2d,$T0.2d,#2
- shrn $T1.2s,$ACC2,#26
- xtn $H2,$ACC2
- add $ACC0,$ACC0,$T0.2d // h4 -> h0
- bic $H1,#0xfc,lsl#24
- add $H3,$H3,$T1.2s // h2 -> h3
- bic $H2,#0xfc,lsl#24
-
- shrn $T0.2s,$ACC0,#26
- xtn $H0,$ACC0
- ushr $T1.2s,$H3,#26
- bic $H3,#0xfc,lsl#24
- bic $H0,#0xfc,lsl#24
- add $H1,$H1,$T0.2s // h0 -> h1
- add $H4,$H4,$T1.2s // h3 -> h4
-
- b.hi .Loop_neon
-
-.Lskip_loop:
- dup $IN23_2,${IN23_2}[0]
- add $IN01_2,$IN01_2,$H2
-
- ////////////////////////////////////////////////////////////////
- // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- adds $len,$len,#32
- b.ne .Long_tail
-
- dup $IN23_2,${IN01_2}[0]
- add $IN23_0,$IN01_0,$H0
- add $IN23_3,$IN01_3,$H3
- add $IN23_1,$IN01_1,$H1
- add $IN23_4,$IN01_4,$H4
-
-.Long_tail:
- dup $IN23_0,${IN23_0}[0]
- umull2 $ACC0,$IN23_2,${S3}
- umull2 $ACC3,$IN23_2,${R1}
- umull2 $ACC4,$IN23_2,${R2}
- umull2 $ACC2,$IN23_2,${R0}
- umull2 $ACC1,$IN23_2,${S4}
-
- dup $IN23_1,${IN23_1}[0]
- umlal2 $ACC0,$IN23_0,${R0}
- umlal2 $ACC2,$IN23_0,${R2}
- umlal2 $ACC3,$IN23_0,${R3}
- umlal2 $ACC4,$IN23_0,${R4}
- umlal2 $ACC1,$IN23_0,${R1}
-
- dup $IN23_3,${IN23_3}[0]
- umlal2 $ACC0,$IN23_1,${S4}
- umlal2 $ACC3,$IN23_1,${R2}
- umlal2 $ACC2,$IN23_1,${R1}
- umlal2 $ACC4,$IN23_1,${R3}
- umlal2 $ACC1,$IN23_1,${R0}
-
- dup $IN23_4,${IN23_4}[0]
- umlal2 $ACC3,$IN23_3,${R0}
- umlal2 $ACC4,$IN23_3,${R1}
- umlal2 $ACC0,$IN23_3,${S2}
- umlal2 $ACC1,$IN23_3,${S3}
- umlal2 $ACC2,$IN23_3,${S4}
-
- umlal2 $ACC3,$IN23_4,${S4}
- umlal2 $ACC0,$IN23_4,${S1}
- umlal2 $ACC4,$IN23_4,${R0}
- umlal2 $ACC1,$IN23_4,${S2}
- umlal2 $ACC2,$IN23_4,${S3}
-
- b.eq .Lshort_tail
-
- ////////////////////////////////////////////////////////////////
- // (hash+inp[0:1])*r^4:r^3 and accumulate
-
- add $IN01_0,$IN01_0,$H0
- umlal $ACC3,$IN01_2,${R1}
- umlal $ACC0,$IN01_2,${S3}
- umlal $ACC4,$IN01_2,${R2}
- umlal $ACC1,$IN01_2,${S4}
- umlal $ACC2,$IN01_2,${R0}
-
- add $IN01_1,$IN01_1,$H1
- umlal $ACC3,$IN01_0,${R3}
- umlal $ACC0,$IN01_0,${R0}
- umlal $ACC4,$IN01_0,${R4}
- umlal $ACC1,$IN01_0,${R1}
- umlal $ACC2,$IN01_0,${R2}
-
- add $IN01_3,$IN01_3,$H3
- umlal $ACC3,$IN01_1,${R2}
- umlal $ACC0,$IN01_1,${S4}
- umlal $ACC4,$IN01_1,${R3}
- umlal $ACC1,$IN01_1,${R0}
- umlal $ACC2,$IN01_1,${R1}
-
- add $IN01_4,$IN01_4,$H4
- umlal $ACC3,$IN01_3,${R0}
- umlal $ACC0,$IN01_3,${S2}
- umlal $ACC4,$IN01_3,${R1}
- umlal $ACC1,$IN01_3,${S3}
- umlal $ACC2,$IN01_3,${S4}
-
- umlal $ACC3,$IN01_4,${S4}
- umlal $ACC0,$IN01_4,${S1}
- umlal $ACC4,$IN01_4,${R0}
- umlal $ACC1,$IN01_4,${S2}
- umlal $ACC2,$IN01_4,${S3}
-
-.Lshort_tail:
- ////////////////////////////////////////////////////////////////
- // horizontal add
-
- addp $ACC3,$ACC3,$ACC3
- ldp d8,d9,[sp,#16] // meet ABI requirements
- addp $ACC0,$ACC0,$ACC0
- ldp d10,d11,[sp,#32]
- addp $ACC4,$ACC4,$ACC4
- ldp d12,d13,[sp,#48]
- addp $ACC1,$ACC1,$ACC1
- ldp d14,d15,[sp,#64]
- addp $ACC2,$ACC2,$ACC2
-
- ////////////////////////////////////////////////////////////////
- // lazy reduction, but without narrowing
-
- ushr $T0.2d,$ACC3,#26
- and $ACC3,$ACC3,$MASK.2d
- ushr $T1.2d,$ACC0,#26
- and $ACC0,$ACC0,$MASK.2d
-
- add $ACC4,$ACC4,$T0.2d // h3 -> h4
- add $ACC1,$ACC1,$T1.2d // h0 -> h1
-
- ushr $T0.2d,$ACC4,#26
- and $ACC4,$ACC4,$MASK.2d
- ushr $T1.2d,$ACC1,#26
- and $ACC1,$ACC1,$MASK.2d
- add $ACC2,$ACC2,$T1.2d // h1 -> h2
-
- add $ACC0,$ACC0,$T0.2d
- shl $T0.2d,$T0.2d,#2
- ushr $T1.2d,$ACC2,#26
- and $ACC2,$ACC2,$MASK.2d
- add $ACC0,$ACC0,$T0.2d // h4 -> h0
- add $ACC3,$ACC3,$T1.2d // h2 -> h3
-
- ushr $T0.2d,$ACC0,#26
- and $ACC0,$ACC0,$MASK.2d
- ushr $T1.2d,$ACC3,#26
- and $ACC3,$ACC3,$MASK.2d
- add $ACC1,$ACC1,$T0.2d // h0 -> h1
- add $ACC4,$ACC4,$T1.2d // h3 -> h4
-
- ////////////////////////////////////////////////////////////////
- // write the result, can be partially reduced
-
- st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
- st1 {$ACC4}